Example #1
def main(hps):

    # Initialize Horovod.
    hvd.init()

    # Create tensorflow session
    sess = tensorflow_session()

    # Set random seeds (different per worker, derived from hps.seed).
    tf.set_random_seed(hvd.rank() + hvd.size() * hps.seed)
    np.random.seed(hvd.rank() + hvd.size() * hps.seed)

    # Get data and set train_its and valid_its
    train_iterator, test_iterator, data_init = get_data(hps, sess)
    hps.train_its, hps.test_its, hps.full_test_its = get_its(hps)

    # Create log dir
    logdir = os.path.abspath(hps.logdir) + "/"
    if not os.path.exists(logdir):
        os.mkdir(logdir)

    # Create model
    import model
    model = model.model(sess, hps, train_iterator, test_iterator, data_init)

    # Initialize visualization functions
    visualise = init_visualizations(hps, model, logdir)

    if not hps.inference:
        # Perform training
        train(sess, model, hps, logdir, visualise)
    else:
        infer(sess, model, hps, test_iterator)
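The tensorflow_session() helper called above is not part of this listing. A minimal sketch of what it typically does in Horovod TF1 code, assuming the usual pattern of pinning each process to its local GPU and enabling memory growth (the helper in the original repository may differ):

import tensorflow as tf
import horovod.tensorflow as hvd

def tensorflow_session():
    # Pin this process to a single GPU chosen by its local rank and let
    # TensorFlow allocate GPU memory on demand instead of grabbing it all.
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    return tf.Session(config=config)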
Example #2
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.repeat().shuffle(10000).batch(128)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    loss = tf.losses.SparseCategoricalCrossentropy()

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.optimizers.Adam(0.001 * hvd.size())

    checkpoint_dir = './checkpoints'
    checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)

    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            probs = mnist_model(images, training=True)
            loss_value = loss(labels, probs)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.trainable_variables)
        opt.apply_gradients(zip(grads, mnist_model.trainable_variables))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if first_batch:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        return loss_value

    # Horovod: adjust number of steps based on number of GPUs.
    for batch, (images,
                labels) in enumerate(dataset.take(10000 // hvd.size())):
        loss_value = training_step(images, labels, batch == 0)

        if batch % 10 == 0 and hvd.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
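Scripts like this one are run with one process per GPU; with Horovod that is typically done from the command line via horovodrun, for example horovodrun -np 4 python tensorflow2_mnist.py (the file name and the process count here are assumptions, not part of the listing).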
Example #3
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  hvd.init()

  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  session_config = tf.ConfigProto()
  session_config.gpu_options.per_process_gpu_memory_fraction=0.9
  session_config.gpu_options.visible_device_list = str(hvd.local_rank())
  if FLAGS.amp:
      session_config.graph_options.rewrite_options.auto_mixed_precision = True
  if FLAGS.allow_xla:
      session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
  model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
  config = tf.estimator.RunConfig(model_dir=model_dir, session_config=session_config)

  train_and_eval_dict = model_lib.create_estimator_and_inputs(
      run_config=config,
      eval_count=FLAGS.eval_count,
      hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
      pipeline_config_path=FLAGS.pipeline_config_path,
      train_steps=FLAGS.num_train_steps,
      sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
      sample_1_of_n_eval_on_train_examples=(
          FLAGS.sample_1_of_n_eval_on_train_examples),
      use_amp=FLAGS.amp,
  )
  estimator = train_and_eval_dict['estimator']
  train_input_fn = train_and_eval_dict['train_input_fn']
  eval_input_fns = train_and_eval_dict['eval_input_fns']
  eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
  predict_input_fn = train_and_eval_dict['predict_input_fn']
  train_steps = train_and_eval_dict['train_steps']

  if FLAGS.checkpoint_dir:
    if FLAGS.eval_training_data:
      name = 'training_data'
      input_fn = eval_on_train_input_fn
    else:
      name = 'validation_data'
      # The first eval input will be evaluated.
      input_fn = eval_input_fns[0]
    if FLAGS.run_once:
      estimator.evaluate(input_fn,
                         steps=None,
                         checkpoint_path=tf.train.latest_checkpoint(
                             FLAGS.checkpoint_dir))
    else:
      model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
                                train_steps, name)
  else:
    train_spec, eval_specs = model_lib.create_train_and_eval_specs(
        train_input_fn,
        eval_input_fns,
        eval_on_train_input_fn,
        predict_input_fn,
        train_steps,
        eval_on_train_data=False)

    train_hooks = [hvd.BroadcastGlobalVariablesHook(0), DLLoggerHook(hvd.size()*train_and_eval_dict['train_batch_size'], hvd.rank())]
    eval_hooks = []

    for x in range(FLAGS.eval_count):
        estimator.train(train_input_fn,
                        hooks=train_hooks,
                        steps=train_steps // FLAGS.eval_count)


        if hvd.rank() == 0:
            eval_input_fn = eval_input_fns[0]
            results = estimator.evaluate(eval_input_fn,
                                         steps=None,
                                         hooks=eval_hooks)
Example #4
def train_retinaface(cfg):

    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    if cfg['distributed']:
        import horovod.tensorflow as hvd
        # Initialize Horovod
        hvd.init()
    else:
        hvd = []
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    reset_random_seeds()

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth(hvd)

    # define network
    model = RetinaFaceModel(cfg, training=True)
    model.summary(line_length=80)

    # define prior box
    priors = prior_box((cfg['input_size'], cfg['input_size']),
                       cfg['min_sizes'],  cfg['steps'], cfg['clip'])

    # load dataset
    train_dataset = load_dataset(cfg, priors, 'train', hvd)
    if cfg['evaluation_during_training']:
        val_dataset = load_dataset(cfg, priors, 'val', [])

    # define optimizer
    if cfg['distributed']:
        init_lr = cfg['init_lr'] * hvd.size()
        min_lr = cfg['min_lr'] * hvd.size()
        steps_per_epoch = cfg['dataset_len'] // (cfg['batch_size'] * hvd.size())
    else:
        init_lr = cfg['init_lr']
        min_lr = cfg['min_lr']
        steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']

    learning_rate = MultiStepWarmUpLR(
        initial_learning_rate=init_lr,
        lr_steps=[e * steps_per_epoch for e in cfg['lr_decay_epoch']],
        lr_rate=cfg['lr_rate'],
        warmup_steps=cfg['warmup_epoch'] * steps_per_epoch,
        min_lr=min_lr)

    optimizer = tf.keras.optimizers.SGD(
        learning_rate=learning_rate, momentum=0.9, nesterov=True)

    # define losses function
    multi_box_loss = MultiBoxLoss(num_class=cfg['num_class'])

    # load checkpoint
    checkpoint_dir = os.path.join(cfg['output_path'], 'checkpoints', cfg['sub_name'])
    checkpoint = tf.train.Checkpoint(epoch=tf.Variable(0, name='epoch'),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)

    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, 'cfg.pickle'), 'wb') as handle:
        pickle.dump(cfg, handle, protocol=pickle.HIGHEST_PROTOCOL)

    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {}'.format(manager.latest_checkpoint))
    else:
        print("[*] training from scratch.")

    # define training step function
    @tf.function
    def train_step(inputs, labels, first_batch, epoch):
        with tf.GradientTape() as tape:
            predictions = model(inputs, training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['loc'], losses['landm'], losses['class'] = \
                multi_box_loss(labels, predictions)
            total_loss = tf.add_n([l for l in losses.values()])

        if cfg['distributed']:
            # Horovod: add Horovod Distributed GradientTape.
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if cfg['distributed'] and first_batch and epoch:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        return total_loss, losses

    def test_step(inputs, img_name):
        _, img_height_raw, img_width_raw, _ = inputs.shape
        # pad input image to avoid unmatched shape problem
        img = inputs[0].numpy()
        # if img_name == '6_Funeral_Funeral_6_618':
        #     resize = 0.5 # this image is too big to avoid OOM problem
        #     img = cv2.resize(img, None, None, fx=resize, fy=resize,
        #                      interpolation=cv2.INTER_LINEAR)
        img, pad_params = pad_input_image(img, max_steps=max(cfg['steps']))
        input_img = img[np.newaxis, ...]
        predictions = model(input_img, training=False)
        outputs = pred_to_outputs(cfg, predictions, input_img.shape).numpy()
        # recover padding effect
        outputs = recover_pad_output(outputs, pad_params)

        bboxs = outputs[:, :4]
        confs = outputs[:, -1]
        pred_boxes = []
        for box, conf in zip(bboxs, confs):
            x = int(box[0] * img_width_raw)
            y = int(box[1] * img_height_raw)
            w = int(box[2] * img_width_raw) - int(box[0] * img_width_raw)
            h = int(box[3] * img_height_raw) - int(box[1] * img_height_raw)
            pred_boxes.append([x, y, w, h, conf])

        pred_boxes = np.array(pred_boxes).astype('float')

        return pred_boxes

    # training loop
    summary_writer = tf.summary.create_file_writer(os.path.join(cfg['output_path'], 'logs', cfg['sub_name']))
    prog_bar = ProgressBar(steps_per_epoch, 0)

    if cfg['evaluation_during_training']:
        widerface_eval_hard = WiderFaceEval(split='hard')

    for epoch in range(cfg['epoch']):
        try:
            actual_epoch = epoch + 1

            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("\nStart of epoch %d" % (actual_epoch,))
            else:
                print("\nStart of epoch %d" % (actual_epoch,))

            checkpoint.epoch.assign_add(1)
            start_time = time.time()

            #Iterate over the batches of the dataset.
            for batch, (x_batch_train, y_batch_train, img_name) in enumerate(train_dataset):
                total_loss, losses = train_step(x_batch_train, y_batch_train, batch == 0, epoch == 0)

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        # prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                        #     checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))
                        if batch % 100 == 0:
                            print("batch={}/{},  epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                                batch, steps_per_epoch, checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))
                else:
                    prog_bar.update("epoch={}/{}, loss={:.4f}, lr={:.1e}".format(
                        checkpoint.epoch.numpy(), cfg['epoch'], total_loss.numpy(), optimizer._decayed_lr(tf.float32)))

            # Display metrics at the end of each epoch.
            # train_acc = train_acc_metric.result()
            # print("\nTraining loss over epoch: %.4f" % (float(total_loss.numpy()),))

            if cfg['distributed']:
                if hvd.rank() == 0:
                    print("Time taken: %.2fs" % (time.time() - start_time))
                    manager.save()
                    print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint))
            else:
                print("Time taken: %.2fs" % (time.time() - start_time))
                manager.save()
                print("\n[*] save ckpt file at {}".format(manager.latest_checkpoint))

            if cfg['evaluation_during_training']:
                # Run a validation loop at the end of each epoch.
                for batch, (x_batch_val, y_batch_val, img_name) in enumerate(val_dataset.take(500)):
                    if '/' in img_name.numpy()[0].decode():
                        img_name = img_name.numpy()[0].decode().split('/')[1].split('.')[0]
                    else:
                        img_name = []
                    pred_boxes = test_step(x_batch_val, img_name)
                    gt_boxes = labels_to_boxes(y_batch_val)
                    widerface_eval_hard.update(pred_boxes, gt_boxes, img_name)

                ap_hard = widerface_eval_hard.calculate_ap()
                widerface_eval_hard.reset()

                if cfg['distributed']:
                    if hvd.rank() == 0:
                        print("Validation acc: %.4f" % (float(ap_hard),))
                else:
                    print("Validation acc: %.4f" % (float(ap_hard),))

            def tensorboard_writer():
                with summary_writer.as_default():
                    tf.summary.scalar('loss/total_loss', total_loss, step=actual_epoch)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l, step=actual_epoch)
                    tf.summary.scalar('learning_rate', optimizer._decayed_lr(tf.float32), step=actual_epoch)
                    if cfg['evaluation_during_training']:
                        tf.summary.scalar('Val AP', ap_hard, step=actual_epoch)

            if cfg['distributed']:
                if hvd.rank() == 0:
                    tensorboard_writer()
            else:
                tensorboard_writer()

        except Exception as E:
            print(E)
            continue

    if cfg['distributed']:
        if hvd.rank() == 0:
            manager.save()
            print("\n[*] training done! save ckpt file at {}".format(
                manager.latest_checkpoint))
    else:
        manager.save()
        print("\n[*] training done! save ckpt file at {}".format(
            manager.latest_checkpoint))
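The set_memory_growth() helper used near the top of this example is not shown. A minimal sketch, assuming it follows the standard TF2 pattern of enabling memory growth on every GPU and, in the distributed case, exposing only the local rank's GPU to the process (the real implementation may differ):

import tensorflow as tf

def set_memory_growth(hvd):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    # In train_retinaface, hvd is the horovod.tensorflow module when
    # cfg['distributed'] is set and an empty list otherwise, so this
    # branch only runs in the distributed case.
    if hvd and gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')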
Example #5
def train(action_set, level_names):
    """Train."""

    local_job_device = '/job:%s/task:%d' % (FLAGS.job_name, FLAGS.task)
    shared_job_device = '/job:learner/task:0'
    is_actor_fn = lambda i: FLAGS.job_name == 'actor' and i == FLAGS.task
    is_learner = FLAGS.job_name == 'learner'
    actor_hosts = FLAGS.actor_hosts.split(',')
    num_actors = len(actor_hosts)
    learner_host = FLAGS.learner_host.split(',')
    assert (len(learner_host) == 1)
    if is_learner:
        assert (FLAGS.task == 0)
        assert (has_horovod == True)
        hvd.init()

    # Placing the variables on the CPU makes it cheaper to send them to all the
    # actors. Continually copying the variables from the GPU is slow.
    global_variable_device = '/job:learner/task:0' + '/cpu'
    filters = [shared_job_device, local_job_device]
    cluster = tf.train.ClusterSpec({
        'actor': actor_hosts,
        'learner': learner_host
    })
    config = tf.ConfigProto(allow_soft_placement=True, device_filters=filters)
    if is_learner:
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task,
                             config=config)

    # Only used to find the actor output structure.
    Agent = agent_factory(FLAGS.agent_name)
    with tf.Graph().as_default():
        agent = Agent(len(action_set))
        env = create_environment(level_names[0], seed=1)
        structure = build_actor(agent, env, level_names[0], action_set)
        flattened_structure = nest.flatten(structure)
        dtypes = [t.dtype for t in flattened_structure]
        shapes = [t.shape.as_list() for t in flattened_structure]

    # build graph for actor or learner
    with tf.Graph().as_default(), \
         tf.device(local_job_device + '/cpu'), \
         pin_global_variables(global_variable_device):
        tf.set_random_seed(FLAGS.seed)  # Makes initialization deterministic.

        # Create Queue and Agent on the learner.
        with tf.device(shared_job_device):
            queue = tf.FIFOQueue(FLAGS.queue_capacity,
                                 dtypes,
                                 shapes,
                                 shared_name='buffer')
            agent = Agent(len(action_set))

        # Build actors and ops to enqueue their output.
        enqueue_ops = []
        for i in range(num_actors):
            if is_actor_fn(i):
                level_name = level_names[i % len(level_names)]
                tf.logging.info('Creating actor %d with level %s', i,
                                level_name)
                env = create_environment(level_name, seed=i + 1)
                actor_output = build_actor(agent, env, level_name, action_set)
                with tf.device(shared_job_device):
                    enqueue_ops.append(
                        queue.enqueue(nest.flatten(actor_output)))

        # Build learner.
        if is_learner:
            # Create global step, which is the number of environment frames
            # processed.
            g_step = tf.get_variable('num_environment_frames',
                                     initializer=tf.zeros_initializer(),
                                     shape=[],
                                     dtype=tf.int64,
                                     trainable=False,
                                     collections=[
                                         tf.GraphKeys.GLOBAL_STEP,
                                         tf.GraphKeys.GLOBAL_VARIABLES
                                     ])
            # Create batch (time major) and recreate structure.
            dequeued = queue.dequeue_many(FLAGS.batch_size)
            dequeued = nest.pack_sequence_as(structure, dequeued)

            def make_time_major(s):
                return nest.map_structure(
                    lambda t: tf.transpose(t, [1, 0] + list(
                        range(t.shape.ndims))[2:]), s)

            dequeued = dequeued._replace(
                env_outputs=make_time_major(dequeued.env_outputs),
                agent_outputs=make_time_major(dequeued.agent_outputs))

            with tf.device("/gpu"):
                # Using StagingArea allows us to prepare the next batch and send it to
                # the GPU while we're performing a training step. This adds up to 1
                # step policy lag.
                flattened_output = nest.flatten(dequeued)
                area = tf.contrib.staging.StagingArea(
                    [t.dtype for t in flattened_output],
                    [t.shape for t in flattened_output])
                stage_op = area.put(flattened_output)
                data_from_actors = nest.pack_sequence_as(structure, area.get())
                # Unroll agent on sequence, create losses and update ops.
                if hasattr(data_from_actors, 'agent_state'):
                    agent_state = data_from_actors.agent_state
                else:
                    agent_state = agent.initial_state(1)
                output, optimizer = build_learner(
                    agent,
                    agent_state=agent_state,
                    env_outputs=data_from_actors.env_outputs,
                    agent_outputs=data_from_actors.agent_outputs,
                    g_step=g_step)

        # Create MonitoredSession (to run the graph, checkpoint and log).
        is_chief = is_learner  # MonitoredTrainingSession inits all global variables
        hooks = [py_process.PyProcessHook()]
        if is_learner:
            # for variable initialization across learners
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        tf.logging.info('Creating MonitoredSession, is_chief %s', is_chief)
        if is_learner:
            tf.logging.info('At rank %d', hvd.rank())
        # rank 0 takes care of ckpt saving
        checkpoint_dir = (FLAGS.logdir
                          if is_learner and hvd.rank() == 0 else None)
        with tf.train.MonitoredTrainingSession(server.target,
                                               is_chief=is_chief,
                                               checkpoint_dir=checkpoint_dir,
                                               save_checkpoint_secs=600,
                                               save_summaries_secs=30,
                                               log_step_count_steps=50000,
                                               config=config,
                                               hooks=hooks) as session:

            if is_learner:
                # tb Logging
                summary_writer = (tf.summary.FileWriterCache.get(FLAGS.logdir)
                                  if hvd.rank() == 0 else None)

                # Prepare data for first run.
                session.run_step_fn(
                    lambda step_context: step_context.session.run(stage_op))

                # Execute learning and track performance.
                num_env_frames_v = 0
                while num_env_frames_v < FLAGS.total_environment_frames:
                    level_names_v, done_v, infos_v, num_env_frames_v, _ = session.run(
                        (data_from_actors.level_name, ) + output +
                        (stage_op, ))
                    level_names_v = np.repeat([level_names_v], done_v.shape[0],
                                              0)

                    for (level_name, episode_return, episode_step,
                         episode_raw_return, episode_raw_step) in zip(
                             level_names_v[done_v],
                             infos_v.episode_return[done_v],
                             infos_v.episode_step[done_v],
                             infos_v.episode_raw_return[done_v],
                             infos_v.episode_raw_step[done_v]):
                        episode_frames = episode_step

                        tf.logging.info(
                            'learner rank: %d, Env: %s Episode return: %f '
                            'Episode raw return: %f', hvd.rank(), level_name,
                            episode_return, episode_raw_return)

                        if hvd.rank() == 0:  # tb Logging
                            summary = tf.summary.Summary()
                            summary.value.add(tag=level_name +
                                              '/episode_return',
                                              simple_value=episode_return)
                            summary.value.add(tag=level_name +
                                              '/episode_frames',
                                              simple_value=episode_frames)
                            summary.value.add(tag=level_name +
                                              '/episode_raw_return',
                                              simple_value=episode_raw_return)
                            summary.value.add(tag=level_name +
                                              '/episode_raw_frames',
                                              simple_value=episode_raw_step)
                            summary_writer.add_summary(summary,
                                                       num_env_frames_v)
            else:
                # Execute actors (they just need to enqueue their output).
                while True:
                    session.run(enqueue_ops)
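pin_global_variables() used when building the graph above is a custom context manager. A minimal sketch of the idea, assuming it routes every global variable created inside the block to the given device through a custom variable getter (the original implementation may differ in detail):

import contextlib
import tensorflow as tf

@contextlib.contextmanager
def pin_global_variables(device):
    # Place all global variables created inside this scope on `device`,
    # leaving local variables where they would normally go.
    def getter(getter_fn, *args, **kwargs):
        var_collections = kwargs.get('collections', None)
        if var_collections is None:
            var_collections = [tf.GraphKeys.GLOBAL_VARIABLES]
        if tf.GraphKeys.GLOBAL_VARIABLES in var_collections:
            with tf.device(device):
                return getter_fn(*args, **kwargs)
        return getter_fn(*args, **kwargs)

    with tf.variable_scope('', custom_getter=getter) as vs:
        yield vs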
Example #6
def main():
    gpu_thread_count = 2
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    hvd.init()

    # random.seed(5 * (1 + hvd.rank()))
    # np.random.seed(7 * (1 + hvd.rank()))
    # tf.set_random_seed(31 * (1 + hvd.rank()))

    cmdline = add_cli_args()
    FLAGS, unknown_args = cmdline.parse_known_args()
    if len(unknown_args) > 0:
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    FLAGS.data_dir = None if FLAGS.data_dir == "" else FLAGS.data_dir
    FLAGS.log_dir = None if FLAGS.log_dir == "" else FLAGS.log_dir  #+ FLAGS.log_dir_suffix
    filename_pattern = os.path.join(FLAGS.data_dir, '%s-*')
    train_filenames = sorted(tf.gfile.Glob(filename_pattern % 'train'))
    eval_filenames = sorted(tf.gfile.Glob(filename_pattern % 'validation'))
    num_training_samples = get_num_records(train_filenames)
    training_samples_per_rank = num_training_samples // hvd.size()
    height, width = 224, 224
    global_batch_size = FLAGS.batch_size * hvd.size()

    if FLAGS.num_epochs is not None:
        if FLAGS.data_dir is None:
            raise ValueError("num_epochs requires --data_dir to be specified")
        nstep = num_training_samples * FLAGS.num_epochs // global_batch_size
        decay_steps = nstep
    else:
        nstep = FLAGS.num_batches
        FLAGS.num_epochs = max(
            nstep * global_batch_size // num_training_samples, 1)
        decay_steps = 90 * num_training_samples // global_batch_size

    nstep_per_epoch = num_training_samples // global_batch_size
    if FLAGS.lr_decay_mode == 'steps':
        steps = [
            int(x) * nstep_per_epoch for x in FLAGS.lr_decay_steps.split(',')
        ]
        lr_steps = [FLAGS.lr]
        for i in range(len(FLAGS.lr_decay_steps.split(','))):
            lr_steps.append(FLAGS.lr * pow(FLAGS.lr_decay_factor, i + 1))
    else:
        steps = []
        lr_steps = []

    if not FLAGS.save_checkpoints_steps:
        # default to save one checkpoint per epoch
        FLAGS.save_checkpoints_steps = nstep_per_epoch
    if not FLAGS.save_summary_steps:
        # default to save summaries once per epoch
        FLAGS.save_summary_steps = nstep_per_epoch

    warmup_it = nstep_per_epoch * FLAGS.warmup_epochs

    if not FLAGS.log_name:
        FLAGS.log_name = "aws_tf_resnet"

    if FLAGS.eval:
        FLAGS.log_name = 'eval' + FLAGS.log_name

    logger = logging.getLogger(FLAGS.log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR
    # file handler which logs debug messages
    if not os.path.isdir(FLAGS.log_dir):
        try:
            os.makedirs(FLAGS.log_dir)
        except FileExistsError:
            # if log_dir is common for multiple ranks like on nfs
            pass

    fh = logging.FileHandler(os.path.join(FLAGS.log_dir, FLAGS.log_name))
    fh.setLevel(logging.DEBUG)
    # console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    # add formatter to the handlers
    # formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    formatter = logging.Formatter('%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    # add handlers to logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    rank0log(logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = 5
    rank0log(logger, "Horovod size: ", hvd.size())

    if FLAGS.local_ckpt:
        do_checkpoint = hvd.local_rank() == 0
    else:
        do_checkpoint = hvd.rank() == 0
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_function,
        model_dir=FLAGS.log_dir,
        params={
            'model': FLAGS.model,
            'decay_steps': decay_steps,
            'n_classes': 1000,
            'dtype': tf.float16 if FLAGS.fp16 else tf.float32,
            'format': 'channels_first',
            'device': '/gpu:0',
            'lr': FLAGS.lr,
            'mom': FLAGS.mom,
            'wdecay': FLAGS.wdecay,
            'steps': steps,
            'lr_steps': lr_steps,
            'lr_decay_mode': FLAGS.lr_decay_mode,
            'warmup_it': warmup_it,
            'warmup_lr': FLAGS.warmup_lr,
            'loss_scale': FLAGS.loss_scale,
            'adv_bn_init': FLAGS.adv_bn_init,
            'conv_init': (tf.variance_scaling_initializer()
                          if FLAGS.adv_conv_init else None),
        },
        config=tf.estimator.RunConfig(
            # tf_random_seed=31 * (1 + hvd.rank()),
            session_config=config,
            save_summary_steps=FLAGS.save_summary_steps
            if do_checkpoint else None,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps
            if do_checkpoint else None,
            keep_checkpoint_max=None))

    if not FLAGS.eval:
        num_preproc_threads = 5
        rank0log(logger, "Preproc threads", num_preproc_threads)
        training_hooks = [
            hvd.BroadcastGlobalVariablesHook(0),
            PrefillStagingAreasHook()
        ]
        if hvd.rank() == 0:
            training_hooks.append(
                LogSessionRunHook(global_batch_size, num_training_samples,
                                  FLAGS.display_every, logger))
        try:
            start_time = time.time()
            classifier.train(
                input_fn=lambda: make_dataset(train_filenames,
                                              training_samples_per_rank,
                                              FLAGS.batch_size,
                                              height,
                                              width,
                                              training=True,
                                              num_threads=num_preproc_threads,
                                              shard=True),
                max_steps=nstep,
                hooks=training_hooks)
            rank0log(logger, "Finished in ", time.time() - start_time)
        except KeyboardInterrupt:
            print("Keyboard interrupt")

    if True:
        rank0log(logger, "Evaluating")
        rank0log(
            logger, "Validation dataset size: {}".format(
                get_num_records(eval_filenames)))
        barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
        tf.Session(config=config).run(barrier)
        time.sleep(5)  # a little extra margin...
        if not FLAGS.eval:
            FLAGS.num_gpus = hvd.size()
        if FLAGS.num_gpus == 1:
            rank0log(
                logger,
                """If you are evaluating checkpoints of a multi-GPU run on a single GPU,
             ensure you set --num_gpus to the number of GPUs it was trained on.
             This will ensure that the epoch number is accurately displayed in the below logs."""
            )
        try:
            ckpts = sort_and_load_ckpts(FLAGS.log_dir)
            for i, c in enumerate(ckpts):
                if i < len(ckpts) - 1:
                    if (not FLAGS.eval_interval) or \
                            (i % FLAGS.eval_interval != 0):
                        continue
                eval_result = classifier.evaluate(
                    input_fn=lambda: make_dataset(eval_filenames,
                                                  get_num_records(
                                                      eval_filenames),
                                                  FLAGS.batch_size,
                                                  height,
                                                  width,
                                                  training=False,
                                                  shard=True),
                    checkpoint_path=c['path'])
                c['epoch'] = (c['step'] * FLAGS.num_gpus) / (nstep_per_epoch *
                                                             hvd.size())
                c['top1'] = eval_result['val-top1acc']
                c['top5'] = eval_result['val-top5acc']
                c['loss'] = eval_result['loss']
            rank0log(logger, ' step  epoch  top1    top5     loss   time(h)')
            barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
            for i, c in enumerate(ckpts):
                tf.Session(config=config).run(barrier)
                if 'top1' not in c:
                    continue
                rank0log(
                    logger,
                    '{:5d}  {:5.1f}  {:5.3f}  {:6.2f}  {:6.2f}  {:10.3f}'.
                    format(c['step'], c['epoch'], c['top1'] * 100,
                           c['top5'] * 100, c['loss'], c['mtime']))
            rank0log(logger, "Finished evaluation")
        except KeyboardInterrupt:
            logger.error("Keyboard interrupt")
Example #7
def parse_args():
    global DEBUG
    global MODE
    global INFERENCE_ONLY
    global INFERENCE_OUT
    global DATA_THREADS
    global RANK
    global RANKS
    global DTYPE
    #global hvd
    global DISTRIBUTED
    global BATCH_SIZE
    global BATCH_SIZE_TEST
    # Parse command line args.
    parser = argparse.ArgumentParser(prog='ML-Dock-GN using Tensorflow + graph_nets Backend', description='Processing input flags for Training Run.')
    parser.add_argument('--batch_size', type=int, default=4, help='The (local) minibatch size used in training.')
    parser.add_argument('--batch_size_test', type=int, default=8, help='The (local) minibatch size used in testing.')
    parser.add_argument('--map_train', type=str, required=True, help='Path to .map file for training set.')
    parser.add_argument('--map_test', type=str, required=True, help='Path to .map file for test set.')
    parser.add_argument('--epochs', type=int, default=100, help='Number of epochs to train.')
    parser.add_argument('--mlp_layers', type=str, default="4,4", help='Number of layers in each MLP.')
    parser.add_argument('--mlp_latent', type=str, default="32,16", help='Number of neurons in each MLP layer.')
    parser.add_argument('--num_features', type=str, default="64,64", help='Number of output protein features, ligand features.')
    parser.add_argument('--gnn_layers', type=str, default="4,8", help='Number of message passing steps.')
    parser.add_argument('--lr_init', type=float, default=0.01, help='Initial learning rate.')
    parser.add_argument('--hvd', type=bool, default=False, help='Enable the use of Horovod.')
    parser.add_argument('--debug', type=bool, default=True, help='Enable debug tests / prints.')
    parser.add_argument('--use_clr', type=bool, default=False, help='Use Cyclic Learning Rate if true else constant.')
    parser.add_argument('--inference_only', type=bool, default=False, help='Ignore training, only use test set.')
    parser.add_argument('--inference_out', type=str, default=None, help='Write test set predictions to file.')
    parser.add_argument('--data_threads', type=int, default=1, help='Number of data loading threads.')
    parser.add_argument('--mode', type=str, default="regression", help='Training mode: "regression" or "classification".')
    parser.add_argument('--restore', type=str, default=None, help='Path to checkpoint file.')
    parser.add_argument('--plot_history', type=bool, default=False, help='Save training/testing history images')
    parser.add_argument('--use_fp16', type=bool, default=False, help='Use half-precision (tf.float16)')
    args = parser.parse_args()
    DEBUG = args.debug
    MODE = args.mode
    INFERENCE_ONLY = args.inference_only
    INFERENCE_OUT = args.inference_out
    DATA_THREADS = args.data_threads
    DTYPE = tf.float16 if args.use_fp16 else tf.float32
    BATCH_SIZE=args.batch_size
    BATCH_SIZE_TEST=args.batch_size_test
    print(args)
    if args.hvd:
        print("Starting horovod...")
        import horovod.tensorflow as hvd
        #hvd = hvd_temp
        hvd.init()
        RANK = hvd.rank()
        RANKS = hvd.size()
        DISTRIBUTED=True
        print("Initialization of horovod complete...")
        # Index the output filenames for inference output data by rank ID
        if args.inference_out is not None:
            INFERENCE_OUT = str(args.inference_out).split(".")[0] + "_%s.map"%str(RANK)
            print("Rank %s"%str(RANK), " is saving inference output to %s"%str(INFERENCE_OUT))

    if RANK != 0:
        #Only rank 0 should print debug info
        DEBUG = False
        #Reduce logging for all ranks other than 0
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    #
    banner_print("PharML.Bind-GNN: Version 1.0.1 - Framework for Open Therapeutics with Graph Neural Networks.")
    banner_print("============================================================================================")
    banner_print("  Developed by")
    banner_print("      Jacob Balma: [email protected]")
    banner_print("      Aaron Vose:  [email protected]")
    banner_print("      Yuri Petersen: [email protected]")
    banner_print("This work is supported by collaboration with Cray, Inc, Medical University of South Carolina (MUSC) and Hewlett Packard Enterprise (HPE). ")
    banner_print("============================================================================================")
    if DEBUG:
        print(args)
    # Return parsed args.
    return args
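Note that argparse's type=bool does not parse flag values the way these options suggest: bool('False') is True, so for example --hvd False still enables Horovod. A common workaround, sketched here and not part of the original script, is a small converter passed as type= instead of bool:

import argparse

def str2bool(value):
    # Map common textual spellings of booleans onto real bools.
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % value)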
Example #8
def main(_):

    # Horovod: initialize Horovod.
    hvd.init()
    hvd_size = hvd.size()
    print("hvd size: {}".format(hvd_size))

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir',
                        type=str,
                        default='/tensorflow/mnist/',
                        help='Directory which contains dataset')
    parser.add_argument('--steps', type=int, default=100, help='steps')

    FLAGS, _ = parser.parse_known_args()

    # Ensure data directory passed to the script contains proper dataset
    dir_content = os.listdir(FLAGS.data_dir)
    for file in FILENAMES:
        if file not in dir_content:
            print(
                "Directory provided by user does not contain the proper dataset")
            FLAGS.data_dir = os.path.join(FLAGS.data_dir,
                                          "input_data_{}".format(hvd.rank()))
            break

    # Read/download local dataset. Different copy for each process.
    mnist = learn.datasets.mnist.read_data_sets(FLAGS.data_dir)

    # Name images placeholder to be able to retrieve it from saved meta graph.
    images_placeholder = tf.placeholder(tf.float32, [None, 784],
                                        name=INPUT_NAME)

    dense_dropout_placeholder = tf.placeholder_with_default(1.0, [])
    labels_placeholder = tf.placeholder(tf.int64, [None])
    logits, scores, predictions = build_net(images_placeholder,
                                            dense_dropout_placeholder)

    # Exporting the meta graph at this point removes the Horovod-specific ops before serving. The graph
    # also does not yet contain any training-specific ops, so it is optimized for serving as well.
    tf.train.export_meta_graph("graph.meta", as_text=True)

    loss = tf.losses.softmax_cross_entropy(tf.one_hot(labels_placeholder, 10),
                                           logits)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(predictions, labels_placeholder), tf.float32))

    # Define summary ops to save summaries for later use in tensorboard.
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("loss", loss)
    summary_op = tf.summary.merge_all()

    # Horovod: adjust learning rate based on number of workers.
    optimizer = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    global_step = tf.contrib.framework.get_or_create_global_step()

    # Wrap standard optimizer in Horovod distributed one.
    train = hvd.DistributedOptimizer(optimizer).minimize(
        loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of workers.
        tf.train.StopAtStepHook(FLAGS.steps // hvd_size),
        tf.train.LoggingTensorHook(tensors={
            'step': global_step,
            'loss': loss
        },
                                   every_n_iter=10),
    ]

    # Only master saves summaries.
    if hvd.rank() == 0:
        hooks += [
            # As previously mentioned summaries are saved to EXPERIMENT_OUTPUT_PATH so that they can be discovered by
            # tensorboard.
            tf.train.SummarySaverHook(save_steps=1,
                                      output_dir=os.path.join(
                                          EXPERIMENT_OUTPUT_PATH,
                                          "tensorboard"),
                                      summary_op=summary_op)
        ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. As previously mentioned,
    # checkpoints are saved to EXPERIMENT_OUTPUT_PATH, which makes them accessible to the user.
    checkpoint_dir = os.path.join(EXPERIMENT_OUTPUT_PATH,
                                  "checkpoints") if hvd.rank() == 0 else None

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when done
    # or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks) as mon_sess:
        while not mon_sess.should_stop():
            images, labels = mnist.train.next_batch(64)
            _, loss_val, accuracy_val, global_step_val = mon_sess.run(
                [train, loss, accuracy, global_step],
                feed_dict={
                    images_placeholder: images,
                    labels_placeholder: labels,
                    dense_dropout_placeholder: 0.5
                })

            # Only master publishes metrics.
            if hvd.rank() == 0:
                # Publish metrics just like in the single node example.
                publish({
                    "loss": str(loss_val),
                    "accuracy": str(accuracy_val),
                    "global_step": str(global_step_val)
                })

    # Save servable model only from Horovod master.
    if hvd.rank() == 0:
        # Create a new graph to import the previously exported one.
        with tf.Graph().as_default():
            # Import previously saved meta graph.
            restorer = tf.train.import_meta_graph("graph.meta")
            with tf.Session() as session:
                checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
                restorer.restore(session, checkpoint_file)

                # Get handlers for images placeholder and scores op with names defined before.
                images_placeholder = tf.get_default_graph().get_tensor_by_name(
                    INPUT_NAME + ":0")
                scores = tf.get_default_graph().get_tensor_by_name(
                    SCORES_NAME + ":0")

                # Save servable model to EXPERIMENT_OUTPUT_PATH to make it accessible to the user.
                builder = tf.saved_model.builder.SavedModelBuilder(
                    os.path.join(EXPERIMENT_OUTPUT_PATH, "1"))

                prediction_signature = (
                    tf.saved_model.signature_def_utils.build_signature_def(
                        inputs={
                            MODEL_INPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(
                                images_placeholder)
                        },
                        outputs={
                            MODEL_OUTPUT_NAME:
                            tf.saved_model.utils.build_tensor_info(scores)
                        },
                        method_name=tf.saved_model.signature_constants.
                        PREDICT_METHOD_NAME))

                builder.add_meta_graph_and_variables(
                    session, [tf.saved_model.tag_constants.SERVING],
                    signature_def_map={
                        MODEL_SIGNATURE_NAME: prediction_signature
                    },
                    main_op=tf.tables_initializer(),
                    strip_default_attrs=True)

                builder.save()
Example #9
def main():
    hvd.init()

    n_epochs = 10
    batch_size = 5
    step = len(im) // batch_size

    params = parse_args(PARSER.parse_args())

    optimizer = tf.keras.optimizers.Adam(learning_rate=params.learning_rate)
    ce_loss = tf.keras.metrics.Mean(name='ce_loss')
    f1_loss = tf.keras.metrics.Mean(name='dice_loss')
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

    pb_i = Progbar(step, stateful_metrics=metrics_names)
    count = 0
    for epoch in range(n_epochs):

        if count >= step:
            count = 0

        features = im[epoch * batch_size:(epoch * batch_size) + batch_size]
        features = np.reshape(features,
                              (len(features), features[0].shape[1],
                               features[0].shape[2], features[0].shape[0]))
        features = features.astype('float32')

        labels = lb[epoch * batch_size:(epoch * batch_size) + batch_size]
        labels = np.reshape(
            labels, (len(labels), labels[0].shape[0], labels[0].shape[1], 1))
        labels = labels.astype('float32')
        print(features.shape, labels.shape)

        print('Epoch {} out of epochs {}'.format(epoch, n_epochs))

        for i, (features_, labels_) in enumerate(zip(features, labels)):

            with tf.GradientTape() as tape:

                output_map = model(features)

                crossentropy_loss, dice_loss = partial_losses(
                    output_map, labels)
                added_losses = tf.add(crossentropy_loss,
                                      dice_loss,
                                      name='total_loss_ref')

                values = [('Xent', crossentropy_loss),
                          ('added_losses', added_losses)]

                pb_i.add(1, values=values)

            # Calculate the gradients using our tape and then update the
            # model weights.
            tape = hvd.DistributedGradientTape(tape)
            gradients = tape.gradient(added_losses, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,
                                          model.trainable_variables))

            # Calculate something wrong here
            # val_total_loss = 0
            # val_total_acc = 0
            # total_val_num = 0
            # for bIdx, (val_X, val_y) in enumerate(val_batch):
            #     if bIdx >= features.shape[0]:
            #         break
            #     y_pred = model(val_X, training=False)

        print('Xen: ', crossentropy_loss, dice_loss, added_losses)
Example #10
    def __init__(
            self,

            # Model Params
            input_format,  # NCHW or NHWC
            compute_format,  # NCHW or NHWC
            n_channels,
            activation_fn,
            weight_init_method,
            model_variant,
            input_shape,
            mask_shape,
            input_normalization_method,

            # Training HParams
            augment_data,
            loss_fn_name,

            #  Runtime HParams
            use_tf_amp,
            use_xla,

            # Directory Params
            model_dir=None,
            log_dir=None,
            sample_dir=None,
            data_dir=None,
            dataset_name=None,
            dataset_hparams=None,

            # Debug Params
            log_every_n_steps=1,
            debug_verbosity=0,
            seed=None):

        if dataset_hparams is None:
            dataset_hparams = dict()

        if compute_format not in ["NHWC", 'NCHW']:
            raise ValueError(
                "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % compute_format)

        if input_format not in ["NHWC", 'NCHW']:
            raise ValueError(
                "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])"
                % input_format)

        if n_channels not in [1, 3]:
            raise ValueError(
                "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))"
                % n_channels)

        if data_dir is not None and not os.path.exists(data_dir):
            raise ValueError("The `data_dir` received does not exists: %s" %
                             data_dir)

        LOGGER.set_model_name('UNet_TF')

        LOGGER.set_backends([
            dllg.JsonBackend(log_file=os.path.join(model_dir,
                                                   'dlloger_out.json'),
                             logging_scope=dllg.Scope.TRAIN_ITER,
                             iteration_interval=log_every_n_steps),
            dllg.StdOutBackend(log_file=None,
                               logging_scope=dllg.Scope.TRAIN_ITER,
                               iteration_interval=log_every_n_steps)
        ])

        if hvd_utils.is_using_hvd():
            hvd.init()

            if hvd.rank() != 0:
                os.environ['WANDB_MODE'] = 'dryrun'
            wandb_id = os.environ.get('WANDB_ID', None)
            if wandb_id is None:
                wandb.init(config={
                    'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None)
                })
            else:
                wandb.init(config={
                    'SLURM_JOB_ID': os.environ.get('SLURM_JOB_ID', None)
                },
                           id=f"{wandb_id}{hvd.rank()}")
                wandb.tensorboard.patch(save=False)

            if hvd.local_rank() == 0:
                LOGGER.log("Horovod successfully initialized ...")

            tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None

        else:
            tf_seed = 2 * seed if seed is not None else None

        # ============================================
        # Optimisation Flags - Do not remove
        # ============================================

        os.environ['CUDA_CACHE_DISABLE'] = '0'

        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'

        # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_GPU_THREAD_COUNT'] = \
            '1' if not hvd_utils.is_using_hvd() else str(hvd.size())

        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

        os.environ['TF_SYNC_ON_FINISH'] = '0'
        os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
        # os.environ['TF_DISABLE_NVTX_RANGES'] = '1'

        # =================================================

        self.use_xla = use_xla

        if use_tf_amp:

            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("TF AMP is activated - Experimental Feature")

            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

        # =================================================

        model_hparams = tf.contrib.training.HParams(
            # Model Params
            input_format=input_format,
            compute_format=compute_format,
            input_shape=input_shape,
            mask_shape=mask_shape,
            n_channels=n_channels,
            activation_fn=activation_fn,
            weight_init_method=weight_init_method,
            model_variant=model_variant,
            input_normalization_method=input_normalization_method,

            # Training HParams
            augment_data=augment_data,
            loss_fn_name=loss_fn_name,

            # Runtime Params
            use_tf_amp=use_tf_amp,

            # Debug Params
            log_every_n_steps=log_every_n_steps,
            debug_verbosity=debug_verbosity,
            seed=tf_seed)

        run_config_additional = tf.contrib.training.HParams(
            dataset_hparams=dataset_hparams,
            model_dir=model_dir
            if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
            log_dir=log_dir
            if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
            sample_dir=sample_dir
            if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
            data_dir=data_dir,
            num_preprocessing_threads=32,
        )

        if not hvd_utils.is_using_hvd() or hvd.rank() == 0:
            try:
                os.makedirs(sample_dir)
            except FileExistsError:
                pass

        self.run_hparams = Runner._build_hparams(model_hparams,
                                                 run_config_additional)

        if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
            LOGGER.log('Defining Model Estimator ...\n')

        self._model = UNet_v1(
            model_name="UNet_v1",
            input_format=self.run_hparams.input_format,
            compute_format=self.run_hparams.compute_format,
            n_output_channels=1,
            unet_variant=self.run_hparams.model_variant,
            weight_init_method=self.run_hparams.weight_init_method,
            activation_fn=self.run_hparams.activation_fn)

        if self.run_hparams.seed is not None:

            if not hvd_utils.is_using_hvd() or hvd.local_rank() == 0:
                LOGGER.log("Deterministic Run - Seed: %d\n" % seed)

            tf.set_random_seed(self.run_hparams.seed)
            np.random.seed(self.run_hparams.seed)
            random.seed(self.run_hparams.seed)

        if dataset_name not in known_datasets.keys():
            raise RuntimeError(
                "The dataset `%s` is unknown, allowed values: %s ..." %
                (dataset_name, list(known_datasets.keys())))

        self.dataset = known_datasets[dataset_name](
            data_dir=data_dir, **self.run_hparams.dataset_hparams)

        self.num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
Example #11
def main(_):
    """
    Starting point of the application
    """
    hvd.init()
    set_flags()
    params = parse_args(PARSER.parse_args())
    model_dir = prepare_model_dir(params)
    logger = get_logger(params)

    estimator = build_estimator(params, model_dir)

    dataset = Dataset(data_dir=params.data_dir,
                      batch_size=params.batch_size,
                      fold=params.crossvalidation_idx,
                      augment=params.augment,
                      gpu_id=hvd.rank(),
                      num_gpus=hvd.size(),
                      seed=params.seed)

    if 'train' in params.exec_mode:
        max_steps = params.max_steps // (1 if params.benchmark else hvd.size())
        hooks = [hvd.BroadcastGlobalVariablesHook(0),
                 TrainingHook(logger,
                              max_steps=max_steps,
                              log_every=params.log_every)]

        if params.benchmark and hvd.rank() == 0:
            hooks.append(ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode='train'))

        estimator.train(
            input_fn=dataset.train_fn,
            steps=max_steps,
            hooks=hooks)

    if 'evaluate' in params.exec_mode:
        if hvd.rank() == 0:
            results = estimator.evaluate(input_fn=dataset.eval_fn, steps=dataset.eval_size)
            logger.log(step=(),
                       data={"eval_ce_loss": float(results["eval_ce_loss"]),
                             "eval_dice_loss": float(results["eval_dice_loss"]),
                             "eval_total_loss": float(results["eval_total_loss"]),
                             "eval_dice_score": float(results["eval_dice_score"])})

    if 'predict' in params.exec_mode:
        if hvd.rank() == 0:
            predict_steps = dataset.test_size
            hooks = None
            if params.benchmark:
                hooks = [ProfilingHook(logger,
                                       batch_size=params.batch_size,
                                       log_every=params.log_every,
                                       warmup_steps=params.warmup_steps,
                                       mode="test")]
                predict_steps = params.warmup_steps * 2 * params.batch_size

            predictions = estimator.predict(
                input_fn=lambda: dataset.test_fn(count=math.ceil(predict_steps / dataset.test_size)),
                hooks=hooks)
            binary_masks = [np.argmax(p['logits'], axis=-1).astype(np.uint8) * 255 for p in predictions]

            if not params.benchmark:
                multipage_tif = [Image.fromarray(mask).resize(size=(512, 512), resample=Image.BILINEAR)
                                 for mask in binary_masks]

                output_dir = os.path.join(params.model_dir, 'pred')

                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                multipage_tif[0].save(os.path.join(output_dir, 'test-masks.tif'),
                                      compression="tiff_deflate",
                                      save_all=True,
                                      append_images=multipage_tif[1:])
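# A minimal, hedged sketch of the tf.estimator + Horovod wiring used above:
# broadcast the initial weights from rank 0 and divide the global step budget
# across workers. `estimator` and `input_fn` are placeholders, not real APIs here.
import horovod.tensorflow as hvd

def train_distributed(estimator, input_fn, total_steps):
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # sync initial variables from rank 0
    steps_per_worker = total_steps // hvd.size()   # each worker runs its share of steps
    estimator.train(input_fn=input_fn, steps=steps_per_worker, hooks=hooks)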
Beispiel #12
0
def main(args):
    # Initialize horovod
    hvd.init()

    gpus = tf.config.list_physical_devices("GPU")
    tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # Generate local filename
    # Assume the dataset has been split in advance
    local_file = args.data_filename_prefix + str(hvd.local_rank()) + ".file"

    # generate local batch size
    assert (args.global_batch_size % hvd.size() == 0)
    local_batch_size = args.global_batch_size // hvd.size()

    dataset = utility.TFDataset(filename=local_file,
                                batchsize=local_batch_size,
                                as_sparse_tensor=False,
                                repeat=1)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    # Because there is no tensorflow distribute strategy, sok.Init() will call horovod to
    # broadcast the NCCL id and random seed, so it must be called after hvd.init()
    sok.Init(global_batch_size=args.global_batch_size)

    model = SOKDenseDemo(
        max_vocabulary_size_per_gpu=args.max_vocabulary_size_per_gpu,
        embedding_vec_size=args.embedding_vec_size,
        slot_num=args.slot_num,
        nnz_per_slot=args.nnz_per_slot,
        num_dense_layers=args.num_dense_layers)

    embedding_optimizer = utility.get_embedding_optimizer(
        args.optimizer)(learning_rate=0.1)
    dense_optimizer = utility.get_dense_optimizer(
        args.optimizer)(learning_rate=0.1)

    loss_fn = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    def _replica_loss(labels, logits):
        loss = loss_fn(labels, logits)
        return tf.nn.compute_average_loss(
            loss, global_batch_size=args.global_batch_size)

    @tf.function
    def _train_step(inputs, labels, first_batch):
        with tf.GradientTape() as tape, tf.GradientTape() as emb_tape:
            logit = model(inputs, training=True)
            replica_loss = _replica_loss(labels, logit)

        # Horovod: wrap tf.GradientTape with Horovod DistributedGradientTape
        tape = hvd.DistributedGradientTape(tape)

        # There is no need to wrap the emb_tape because the communication is done by sok
        # emb_tape = hvd.DistributedGradientTape(emb_tape)

        emb_variable, other_variable = sok.split_embedding_variable_from_others(
            model.trainable_variables)

        # type(tape) here is hvd.DistributedGradientTape
        # type(emb_tape) here is tf.GradientTape
        emb_grads = emb_tape.gradient(replica_loss, emb_variable)
        grads = tape.gradient(replica_loss, other_variable)

        if "plugin" not in args.optimizer:
            with sok.OptimizerScope(emb_variable):
                embedding_optimizer.apply_gradients(
                    zip(emb_grads, emb_variable),
                    experimental_aggregate_gradients=False)
        else:
            embedding_optimizer.apply_gradients(
                zip(emb_grads, emb_variable),
                experimental_aggregate_gradients=False)
        dense_optimizer.apply_gradients(zip(grads, other_variable))

        # Note: broadcast should be done after the first gradient step to ensure optimizer has been initialized.
        # There is no need to broadcast emb_variable and embedding_optimizer, because the parallel mode inside
        # sok is model parallel and the communication is done by sok itself.
        if first_batch:
            hvd.broadcast_variables(other_variable, root_rank=0)
            hvd.broadcast_variables(dense_optimizer.variables(), root_rank=0)

        return replica_loss

    for i, (inputs, labels) in enumerate(dataset):
        if args.stop_at_iter > 0 and i >= args.stop_at_iter:
            break

        rng = nvtx.start_range(message="Iteration_" + str(i), color="blue")

        total_loss = _train_step(inputs, labels, i == 0)

        nvtx.end_range(rng)
        print("[INFO]: Iteration: {}, loss={}".format(i, total_loss))
def main(input_path,blocks,weights,image_dir,checkpoint_dir,trn_sz,learning_rate):
    #init horovod
    comm_rank = 0 
    comm_local_rank = 0
    comm_size = 1
    if horovod:
        hvd.init()
        comm_rank = hvd.rank() 
        comm_local_rank = hvd.local_rank()
        comm_size = hvd.size()
        if comm_rank == 0:
            print("Using distributed computation with Horovod: {} total ranks".format(comm_size,comm_rank))
        
    #parameters
    batch = 1
    channels = [0,1,2,10]
    num_epochs = 150
    dtype = tf.float32
    
    #session config
    sess_config=tf.ConfigProto(inter_op_parallelism_threads=2, #1
                               intra_op_parallelism_threads=33, #6
                               log_device_placement=False,
                               allow_soft_placement=True)
    sess_config.gpu_options.visible_device_list = str(comm_local_rank)
    
    #get data
    training_graph = tf.Graph()
    if comm_rank == 0:
        print("Loading data...")
    path, trn_data, val_data, tst_data = load_data(input_path,comm_size,comm_rank,trn_sz)
    if comm_rank == 0:
        print("Shape of trn_data is {}".format(trn_data.shape[0]))
        print("done.")
    
    with training_graph.as_default():
        #create datasets
        #files = tf.placeholder(tf.string, shape=[None])
        trn_manager = h5_input_manager(path, trn_data, channels, update_on_read=True)
        trn_dataset = create_dataset(trn_manager, batch, num_epochs, shuffle=True)
        val_manager = h5_input_manager(path, val_data, channels, update_on_read=False)
        val_dataset = create_dataset(val_manager, batch, 1, shuffle=False)
        
        #create iterators
        handle = tf.placeholder(tf.string, shape=[], name="iterator-placeholder")
        iterator = tf.data.Iterator.from_string_handle(handle, (tf.float32, tf.int32), 
                                                       ((batch, len(channels), image_height, image_width),
                                                       (batch, image_height, image_width))
                                                       )
        next_elem = iterator.get_next()
        
        #create init handles
        #trn
        trn_iterator = trn_dataset.make_initializable_iterator()
        trn_handle_string = trn_iterator.string_handle()
        trn_init_op = iterator.make_initializer(trn_dataset)
        #val
        val_iterator = val_dataset.make_initializable_iterator()
        val_handle_string = val_iterator.string_handle()
        val_init_op = iterator.make_initializer(val_dataset)

        #set up model
        logit, prediction, weight = create_tiramisu(3, next_elem[0], image_height, image_width, len(channels), loss_weights=weights, nb_layers_per_block=blocks, p=0.2, wd=1e-4, dtype=dtype)
        
        #set up loss
        labels_one_hot = tf.contrib.layers.one_hot_encoding(next_elem[1], 3)
        weighted_labels_one_hot = tf.multiply(labels_one_hot, weight)
        loss = tf.losses.softmax_cross_entropy(onehot_labels=weighted_labels_one_hot,logits=logit)
        #loss = tf.losses.sparse_softmax_cross_entropy(labels=next_elem[1],logits=logit,weights=weight)
        
        #set up global step
        global_step = tf.train.get_or_create_global_step()
        
        #set up optimizer
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        if horovod:
            opt = hvd.DistributedOptimizer(opt)
        train_op = opt.minimize(loss, global_step=global_step)
        #set up streaming metrics
        iou_op, iou_update_op = tf.metrics.mean_iou(prediction,labels_one_hot,3,weights=None,metrics_collections=None,updates_collections=None,name="iou_score")
        
        #compute epochs and stuff:
        num_samples = trn_data.shape[0] // comm_size
        num_steps_per_epoch = num_samples // batch
        num_steps = num_epochs*num_steps_per_epoch
        
        #hooks
        #these hooks are essential; give the stop-at-step hook one additional step at the end so the final epoch completes
        hooks = [tf.train.StopAtStepHook(last_step=num_steps+1)]
        if horovod:
            hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        #initializers:
        init_op =  tf.global_variables_initializer()
        init_local_op = tf.local_variables_initializer()
        
        #checkpointing
        if comm_rank == 0:
            checkpoint_save_freq = num_steps_per_epoch * 10
            checkpoint_saver = tf.train.Saver(max_to_keep = 1000)
            hooks.append(tf.train.CheckpointSaverHook(checkpoint_dir=checkpoint_dir, save_steps=checkpoint_save_freq, saver=checkpoint_saver))
            #create image dir if not exists
            if not os.path.isdir(image_dir):
                os.makedirs(image_dir)
        
        ##DEBUG
        ##summary
        #if comm_rank == 0:
        #    print("write graph for debugging")
        #    tf.summary.scalar("loss",loss)
        #    summary_op = tf.summary.merge_all()
        #    #hooks.append(tf.train.SummarySaverHook(save_steps=num_steps_per_epoch, summary_writer=summary_writer, summary_op=summary_op))
        #    with tf.Session(config=sess_config) as sess:
        #        sess.run([init_op, init_local_op])
        #        #create iterator handles
        #        trn_handle = sess.run(trn_handle_string)
        #        #init iterators
        #        sess.run(trn_init_op, feed_dict={handle: trn_handle, datafiles: trn_data, labelfiles: trn_labels})
        #        #summary:
        #        sess.run(summary_op, feed_dict={handle: trn_handle})
        #        #summary file writer
        #        summary_writer = tf.summary.FileWriter('./logs', sess.graph)
        ##DEBUG
        

        #start session
        with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess:
            #initialize
            sess.run([init_op, init_local_op])
            #create iterator handles
            trn_handle, val_handle = sess.run([trn_handle_string, val_handle_string])
            #init iterators
            sess.run(trn_init_op, feed_dict={handle: trn_handle})
            sess.run(val_init_op, feed_dict={handle: val_handle})

            #do the training
            epoch = 1
            train_loss = 0.
            start_time = time.time()
            while not sess.should_stop():
                
                #training loop
                try:
                    #construct feed dict
                    _, _, train_steps, tmp_loss = sess.run([train_op, iou_update_op, global_step, loss], feed_dict={handle: trn_handle})
                    train_steps_in_epoch = train_steps%num_steps_per_epoch
                    train_loss += tmp_loss
                    
                    if train_steps_in_epoch > 0:
                        #print step report
                        print("REPORT: rank {}, training loss for step {} (of {}) is {}".format(comm_rank, train_steps, num_steps, train_loss/train_steps_in_epoch))
                    else:
                        end_time = time.time()
                        #print epoch report
                        train_loss /= num_steps_per_epoch
                        print("COMPLETED: rank {}, training loss for epoch {} (of {}) is {}, epoch duration {} s".format(comm_rank, epoch, num_epochs, train_loss, end_time - start_time))
                        iou_score = sess.run(iou_op)
                        print("COMPLETED: rank {}, training IoU for epoch {} (of {}) is {}, epoch duration {} s".format(comm_rank, epoch, num_epochs, iou_score, end_time - start_time))
                        start_time = time.time()
                        
                        #evaluation loop
                        eval_loss = 0.
                        eval_steps = 0
                        #update the input reader
                        val_manager.minvals = trn_manager.minvals
                        val_manager.maxvals = trn_manager.maxvals
                        while True:
                            try:
                                #construct feed dict
                                _, tmp_loss, val_model_predictions, val_model_labels = sess.run([iou_update_op, loss, prediction, next_elem[1]], feed_dict={handle: val_handle})
                                if use_scipy:
                                    imsave(image_dir+'/test_pred_epoch'+str(epoch)+'_estep'
                                            +str(eval_steps)+'_rank'+str(comm_rank)+'.png',np.argmax(val_model_predictions[0,...],axis=2)*100)
                                    imsave(image_dir+'/test_label_epoch'+str(epoch)+'_estep'
                                            +str(eval_steps)+'_rank'+str(comm_rank)+'.png',val_model_labels[0,...]*100)
                                else:
                                    np.save(image_dir+'/test_pred_epoch'+str(epoch)+'_estep'
                                            +str(eval_steps)+'_rank'+str(comm_rank)+'.npy',np.argmax(val_model_predictions[0,...],axis=2)*100)
                                    np.save(image_dir+'/test_label_epoch'+str(epoch)+'_estep'
                                            +str(eval_steps)+'_rank'+str(comm_rank)+'.npy',val_model_labels[0,...]*100)
                                eval_loss += tmp_loss
                                eval_steps += 1
                            except tf.errors.OutOfRangeError:
                                eval_steps = np.max([eval_steps,1])
                                eval_loss /= eval_steps
                                print("COMPLETED: rank {}, evaluation loss for epoch {} (of {}) is {}".format(comm_rank, epoch-1, num_epochs, eval_loss))
                                iou_score = sess.run(iou_op)
                                print("COMPLETED: rank {}, evaluation IoU for epoch {} (of {}) is {}".format(comm_rank, epoch-1, num_epochs, iou_score))
                                sess.run(val_init_op, feed_dict={handle: val_handle})
                                break
                                
                        #reset counters
                        epoch += 1
                        train_loss = 0.
                    
                except tf.errors.OutOfRangeError:
                    break
def horovod():
    hvd.init()
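# A minimal, hedged sketch (assuming TF 1.x graph mode) of the Horovod wiring used
# in the example above: pin one GPU per process, wrap the optimizer, and add the
# broadcast hook so all workers start from the same initial variables.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

opt = hvd.DistributedOptimizer(tf.train.AdamOptimizer(learning_rate=1e-4))
hooks = [hvd.BroadcastGlobalVariablesHook(0),
         tf.train.StopAtStepHook(last_step=1000)]
# train_op = opt.minimize(loss, global_step=tf.train.get_or_create_global_step())
# with tf.train.MonitoredTrainingSession(config=sess_config, hooks=hooks) as sess:
#     while not sess.should_stop():
#         sess.run(train_op)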
Beispiel #15
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            'At least one of `do_train` or `do_eval` must be True.')
    if FLAGS.horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        tmp_output_dir = '{0}-tmp-{1}'.format(FLAGS.output_dir, hvd.local_rank())
        FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else tmp_output_dir
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            'Cannot use sequence length %d because the BERT model was only trained up to sequence length %d'
            % (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tf.gfile.MakeDirs(FLAGS.output_dir)
    input_files = []
    for input_pattern in FLAGS.input_file.split(','):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info(
        ('*** Input Files ***: {0} files').format(len(input_files)))
    if len(input_files) == 0:
        raise ValueError('No input file is found')
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    config = tf.ConfigProto()

    #    from tensorflow.core.protobuf import rewriter_config_pb2
    #    rewrite_config = rewriter_config_pb2.RewriterConfig(
    #          auto_mixed_precision=rewriter_config_pb2.RewriterConfig.ON,
    #          # do not remove duplicated nodes
    #          arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF)
    #    rewrite_config.min_graph_nodes = -1
    #    graph_options = tf.GraphOptions(
    #        rewrite_options=rewrite_config, build_cost_model=1)
    #
    #    config = tf.ConfigProto(graph_options=graph_options)
    #
    #    config.graph_options.optimizer_options.opt_level = -1

    #    if not FLAGS.use_fp16:
    #      config.graph_options.rewrite_options.auto_mixed_precision = (
    #        rewriter_config_pb2.RewriterConfig.ON)

    if FLAGS.horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.do_common_subexpression_elimination = True
        config.graph_options.optimizer_options.do_constant_folding = True
        config.graph_options.optimizer_options.do_function_inlining = True
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
        config.gpu_options.allow_growth = True
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        keep_checkpoint_max=1000
        if not FLAGS.horovod or hvd.rank() == 0 else 1,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if not FLAGS.horovod or hvd.rank() == 0 else None,
        session_config=config,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=10000 *
        FLAGS.iterations_per_loop if FLAGS.report_loss else 100 *
        FLAGS.iterations_per_loop)
    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu,
                                hvd=None if not FLAGS.horovod else hvd)
    training_hooks = []
    if FLAGS.horovod and hvd.size() > 1:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
    if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
        global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size(
        )
        training_hooks.append(
            _LogSessionRunHook(global_batch_size, 100,
                               -1 if not FLAGS.horovod else hvd.rank()))
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)
    if FLAGS.do_train:
        tf.logging.info('***** Running training *****')
        tf.logging.info('  Batch size = %d', FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.horovod else hvd)
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)
    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
        tf.logging.info('***** Running evaluation *****')
        tf.logging.info('  Batch size = %d', FLAGS.eval_batch_size)
        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.horovod else hvd)
        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)
        output_eval_file = os.path.join(FLAGS.output_dir, 'eval_results.txt')
        with tf.gfile.GFile(output_eval_file, 'w') as (writer):
            tf.logging.info('***** Eval results *****')
            for key in sorted(result.keys()):
                tf.logging.info('  %s = %s', key, str(result[key]))
                writer.write('%s = %s\n' % (key, str(result[key])))

    return
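# A minimal, hedged sketch of the linear learning-rate scaling convention used in
# the example above when running with Horovod: the base LR is multiplied by the
# number of workers, matching the larger effective (global) batch size.
import horovod.tensorflow as hvd

def effective_learning_rate(base_lr, use_horovod):
    return base_lr * hvd.size() if use_horovod else base_lr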
Beispiel #16
0
def main(e2e_start_time):
    # Parse essential arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--model_size",
                        default="base",
                        type=str,
                        help="base or large")
    parser.add_argument("--pretrain_tfrecords", type=str)
    parser.add_argument("--phase2", action='store_true')
    parser.add_argument("--fp16_compression", action='store_true')
    parser.add_argument("--amp",
                        action='store_true',
                        help="Whether to use fp16.")
    parser.add_argument("--xla",
                        action='store_true',
                        help="Whether to use xla.")
    parser.add_argument("--seed", default=42, type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)

    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)

    parser.add_argument("--log_freq",
                        type=int,
                        default=10,
                        help="Training metrics logging frequency")
    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", default=None, type=str)
    parser.add_argument("--load_weights", action='store_true')
    parser.add_argument("--weights_dir")

    parser.add_argument("--optimizer",
                        default="adam",
                        type=str,
                        help="adam or lamb")
    parser.add_argument(
        "--skip_adaptive",
        action='store_true',
        help="Whether to apply adaptive LR on LayerNorm and biases")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Number of Gradient Accumulation steps")
    parser.add_argument("--lr_decay_power",
                        type=float,
                        default=0.5,
                        help="LR decay power")
    parser.add_argument("--opt_beta_1",
                        type=float,
                        default=0.878,
                        help="Optimizer beta1")
    parser.add_argument("--opt_beta_2",
                        type=float,
                        default=0.974,
                        help="Optimizer beta2")
    parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR")
    parser.add_argument("--log_dir",
                        type=str,
                        default=None,
                        help="Path to store logs")
    parser.add_argument("--results_dir",
                        type=str,
                        default=None,
                        help="Path to store all model results")
    parser.add_argument("--skip_checkpoint",
                        action='store_true',
                        default=False,
                        help="Path to store logs")
    parser.add_argument(
        '--json-summary',
        type=str,
        default=None,
        help=
        'If provided, the json summary will be written to the specified file.')
    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)
    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    # Set up tensorflow
    hvd.init()

    args.log_dir = config.log_dir
    # DLLogger
    setup_logger(args)

    set_affinity(hvd.local_rank())
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(config.xla)
    #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp})

    if config.amp:
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' %
              policy.compute_dtype)  # Compute dtype: float16
        print('Variable dtype: %s' %
              policy.variable_dtype)  # Variable dtype: float32

    #tf.random.set_seed(config.seed)

    # Set up config (cont'd)
    if config.load_weights and config.restore_checkpoint:
        raise ValueError(
            "`load_weights` and `restore_checkpoint` should not be on at the same time."
        )
    if config.phase2 and not config.restore_checkpoint:
        raise ValueError(
            "`phase2` cannot be used without `restore_checkpoint`.")
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir,
                                        'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    metrics = dict()
    metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")
    metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
        name="masked_lm_accuracy")
    metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss")
    if config.electra_objective:
        metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(
                name="disc_accuracy")
            metrics["disc_precision"] = tf.keras.metrics.Accuracy(
                name="disc_precision")
            metrics["disc_recall"] = tf.keras.metrics.Accuracy(
                name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(config,
                                         config.train_batch_size,
                                         world_size=get_world_size(),
                                         rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer,
                                 skip_adaptive=config.skip_adaptive,
                                 power=config.lr_decay_power,
                                 beta_1=config.opt_beta_1,
                                 beta_2=config.opt_beta_2,
                                 end_lr=config.end_lr)

    accumulator = GradientAccumulator()
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    # Set up model checkpoint
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0),
                                     phase2=tf.Variable(False),
                                     optimizer=optimizer,
                                     model=model)
    manager = tf.train.CheckpointManager(
        checkpoint,
        config.checkpoints_dir,
        max_to_keep=config.keep_checkpoint_max)
    if config.restore_checkpoint and config.restore_checkpoint != "latest":
        checkpoint.restore(config.restore_checkpoint)
        log(" ** Restored model checkpoint from {}".format(
            config.restore_checkpoint))
    elif config.restore_checkpoint and config.restore_checkpoint == "latest" and manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        log(" ** Restored model checkpoint from {}".format(
            manager.latest_checkpoint))
    elif config.load_weights:
        model.generator(model.generator.dummy_inputs)
        model.discriminator(model.discriminator.dummy_inputs)
        model.generator.load_weights(
            os.path.join(config.weights_dir, 'generator', 'tf_model.h5'))
        model.discriminator.load_weights(
            os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5'))
    else:
        log(" ** Initializing from scratch.")

    restore_iterator = bool(
        config.restore_checkpoint) and config.restore_checkpoint == "latest"
    # Initialize global step for phase2
    if config.phase2 and not bool(checkpoint.phase2):
        optimizer.iterations.assign(0)
        checkpoint.step.assign(0)
        checkpoint.phase2.assign(True)
        restore_iterator = False
    if bool(checkpoint.phase2):
        manager = tf.train.CheckpointManager(
            checkpoint,
            config.checkpoints_dir,
            checkpoint_name='ckpt-p2',
            max_to_keep=config.keep_checkpoint_max)

    # Set up iterator checkpoint
    iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator,
                                          world_size=tf.Variable(
                                              get_world_size()),
                                          rank=tf.Variable(get_rank()))
    iter_manager = tf.train.CheckpointManager(
        iter_checkpoint,
        os.path.join(config.checkpoints_dir,
                     'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
        checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
        max_to_keep=config.keep_checkpoint_max)
    if restore_iterator and iter_manager.latest_checkpoint:
        ckpt_world_size = tf.train.load_variable(
            iter_manager.latest_checkpoint,
            'world_size/.ATTRIBUTES/VARIABLE_VALUE')
        if ckpt_world_size == get_world_size():
            iter_checkpoint.restore(iter_manager.latest_checkpoint)
            log(" ** Restored iterator checkpoint from {}".format(
                iter_manager.latest_checkpoint),
                all_rank=True)

    utils.heading("Running training")
    accumulator.reset()
    train_start, start_step = time.time(), int(checkpoint.step) - 1
    local_step = 0
    saved_ckpt = False
    while int(checkpoint.step) <= config.num_train_steps:
        saved_ckpt = False
        step = int(checkpoint.step)
        features = next(train_iterator)
        iter_start = time.time()

        # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
        total_loss, eval_fn_inputs = train_one_step(
            config,
            model,
            optimizer,
            features,
            accumulator,
            local_step == 1,
            take_step=local_step % args.gradient_accumulation_steps == 0)
        # if step == 300: tf.profiler.experimental.stop()

        metrics["train_perf"].update_state(config.train_batch_size *
                                           get_world_size() /
                                           (time.time() - iter_start))
        metrics["total_loss"].update_state(values=total_loss)
        metric_fn(config, metrics, eval_fn_inputs)

        if (step % args.log_freq
                == 0) and (local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(v.result().numpy() *
                         100) if "accuracy" in k else float(v.result().numpy())
                for k, v in metrics.items()
            }
            dllogger.log(step=(step, ), data=log_info_dict, verbosity=0)
            log('Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, '
                .format(step=step,
                        **log_info_dict,
                        loss_scale=optimizer.loss_scale if config.amp else 1,
                        elapsed=utils.get_readable_time(time.time() -
                                                        train_start),
                        eta=utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                all_rank=True)

            with train_summary_writer.as_default():
                for key, m in metrics.items():
                    tf.summary.scalar(key, m.result(), step=step)

            if int(checkpoint.step) < config.num_train_steps:
                for m in metrics.values():
                    m.reset_states()

        #Print allreduced metrics on the last step
        if int(checkpoint.step) == config.num_train_steps and (
                local_step % args.gradient_accumulation_steps == 0):
            log_info_dict = {
                k: float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy"
                in k else float(hvd.allreduce(v.result()).numpy())
                for k, v in metrics.items()
            }
            log_info_dict["training_sequences_per_second"] = log_info_dict[
                "train_perf"]
            log_info_dict["final_loss"] = log_info_dict["total_loss"]
            log_info_dict["e2e_train_time"] = time.time() - e2e_start_time
            dllogger.log(step=(), data=log_info_dict, verbosity=0)
            log('<FINAL STEP METRICS> Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, '
                'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'.
                format(step=step, **log_info_dict),
                all_rank=False)

        if local_step % args.gradient_accumulation_steps == 0:
            checkpoint.step.assign(int(optimizer.iterations))

        local_step += 1
        if not config.skip_checkpoint and (
                local_step %
            (config.save_checkpoints_steps * args.gradient_accumulation_steps)
                == 0):
            saved_ckpt = True
            if is_main_process():
                save_path = manager.save(checkpoint_number=step)
                log(" ** Saved model checkpoint for step {}: {}".format(
                    step, save_path))
            iter_save_path = iter_manager.save(checkpoint_number=step)
            log(" ** Saved iterator checkpoint for step {}: {}".format(
                step, iter_save_path),
                all_rank=True)

    step = (int(checkpoint.step) - 1)
    dllogger.flush()
    if not config.skip_checkpoint and not saved_ckpt:
        if is_main_process():
            save_path = manager.save(checkpoint_number=step)
            log(" ** Saved model checkpoint for step {}: {}".format(
                step, save_path))
        iter_save_path = iter_manager.save(checkpoint_number=step)
        log(" ** Saved iterator checkpoint for step {}: {}".format(
            step, iter_save_path),
            all_rank=True)

    return args
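# A minimal, hedged sketch of the step cadence used above: an optimizer step is
# taken every `accum_steps` micro-batches, and a checkpoint is written (model on
# rank 0 only) every `save_steps` optimizer steps. Names are illustrative only.
def take_optimizer_step(local_step, accum_steps):
    return local_step % accum_steps == 0

def save_checkpoint_now(local_step, accum_steps, save_steps):
    # local_step counts micro-batches, so scale save_steps by the accumulation factor
    return local_step % (save_steps * accum_steps) == 0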
Beispiel #17
0
def train(model,
          loss_fn,
          Dataset=None,
          dataset=None,
          valid_dataset=None,
          valid_dataset2=None,
          test_dataset=None,
          evaluate_fn=None,
          inference_fn=None,
          eval_fn=None,
          write_valid=True,
          valid_names=None,
          infer_names=None,
          infer_debug_names=None,
          valid_write_fn=None,
          infer_write_fn=None,
          valid_suffix='.valid',
          infer_suffix='.infer',
          write_streaming=False,
          optimizer=None,
          param_groups=None,
          init_fn=None,
          sep=','):
    use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ

    if Dataset is None:
        assert dataset
    logging.info('Dataset', Dataset, 'dataset', dataset, 'valid_dataset',
                 valid_dataset, 'test_dataset', test_dataset, loss_fn)

    if FLAGS.torch:
        torch.manual_seed(FLAGS.seed or 0)
        if torch.cuda.device_count():
            torch.cuda.manual_seed(FLAGS.seed or 0)
        if use_horovod:
            import horovod.torch as hvd
            hvd.init()
            #print('-----------------', hvd, hvd.size())
            assert hvd.mpi_threads_supported()
            assert hvd.size() == comm.Get_size()
            # hvd.init already done on apps.train.py init
            torch.cuda.set_device(hvd.local_rank())
        # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
        else:
            if torch.cuda.device_count() > 1:
                model = torch.nn.DataParallel(model)
        model.to(device)

    input_ = FLAGS.train_input
    inputs = gezi.list_files(input_)
    inputs.sort()

    all_inputs = inputs

    #batch_size = FLAGS.batch_size
    batch_size = melt.batch_size()

    num_gpus = melt.num_gpus()

    #batch_size = max(batch_size, 1)
    #batch_size_ = batch_size if not FLAGS.batch_sizes else int(FLAGS.batch_sizes.split(',')[-1])
    batch_size_ = FLAGS.eval_batch_size or batch_size

    if dataset is None:
        if FLAGS.fold is not None:
            inputs = [
                x for x in inputs if not x.endswith('%d.record' % FLAGS.fold)
                and not x.endswith('%d.tfrecord' % FLAGS.fold)
            ]
            # if FLAGS.valid_input:
            #   inputs += [x for x in gezi.list_files(FLAGS.valid_input) if not x.endswith('%d.record' % FLAGS.fold)]
        logging.info('inputs', len(inputs), inputs[:100])
    num_folds = FLAGS.num_folds or len(inputs) + 1

    if dataset is None:
        dataset = Dataset('train')
        assert len(inputs) > 0
        train_dataset = dataset.make_batch(batch_size,
                                           inputs,
                                           simple_parse=FLAGS.simple_parse)
        num_examples = dataset.num_examples_per_epoch('train')
    else:
        assert FLAGS.torch_only, 'only torch_only mode currently supports passing a dataset instance instead of a Dataset class, because len() is unavailable otherwise'
        train_dataset = dataset
        num_examples = len(train_dataset)

    num_all_examples = num_examples

    if valid_dataset is None:
        valid_inputs = None
        if FLAGS.valid_input:
            valid_inputs = gezi.list_files(FLAGS.valid_input)
        else:
            if FLAGS.fold is not None:
                #valid_inputs = [x for x in all_inputs if x not in inputs]
                if not FLAGS.test_aug:
                    valid_inputs = [
                        x for x in all_inputs
                        if not 'aug' in x and x not in inputs
                    ]
                else:
                    valid_inputs = [
                        x for x in all_inputs if 'aug' in x and x not in inputs
                    ]

        logging.info('valid_inputs', valid_inputs)

    num_valid_examples = None
    if valid_dataset is not None:
        num_valid_examples = len(valid_dataset)
    else:
        if valid_inputs:
            valid_dataset = dataset.make_batch(batch_size_,
                                               valid_inputs,
                                               subset='valid',
                                               hvd_shard=FLAGS.horovod_eval)
            valid_dataset2 = dataset.make_batch(batch_size,
                                                valid_inputs,
                                                subset='valid',
                                                repeat=True,
                                                initializable=False,
                                                hvd_shard=False)
            valid_dataset2_iter = iter(valid_dataset2)
        else:
            valid_dataset = None
            valid_dataset2 = None

    if num_examples:
        if FLAGS.fold is not None:
            num_examples = int(num_examples * (num_folds - 1) / num_folds)
        num_steps_per_epoch = -(-num_examples // batch_size)
    else:
        num_steps_per_epoch = None
    logging.info('num_train_examples:', num_examples)
    if use_horovod and num_examples:
        num_steps_per_epoch = -(-num_examples // (batch_size * hvd.size()))

    if num_valid_examples is None:
        if FLAGS.valid_input:
            num_valid_examples = dataset.num_examples_per_epoch('valid')
            num_valid_steps_per_epoch = -(-num_valid_examples // batch_size_
                                          ) if num_valid_examples else None
        else:
            if FLAGS.fold is not None:
                if num_examples:
                    num_valid_examples = int(num_all_examples *
                                             (1 / num_folds))
                    num_valid_steps_per_epoch = -(-num_valid_examples //
                                                  batch_size_)
                else:
                    num_valid_steps_per_epoch = None
    if use_horovod and FLAGS.horovod_eval and num_valid_examples:
        num_valid_steps_per_epoch = -(-num_valid_examples //
                                      (batch_size_ * hvd.size()))
    logging.info('num_valid_examples:', num_valid_examples)

    if test_dataset is None:
        if FLAGS.test_input:
            test_inputs = gezi.list_files(FLAGS.test_input)
            #test_inputs = [x for x in test_inputs if not 'aug' in x]
            logging.info('test_inputs', test_inputs)
        else:
            test_inputs = None

    num_test_examples = None
    if test_dataset is not None:
        num_test_examples = len(test_dataset)
    else:
        if test_inputs:
            test_dataset = dataset.make_batch(batch_size_,
                                              test_inputs,
                                              subset='test')
            num_test_examples = dataset.num_examples_per_epoch('test')
        else:
            test_dataset = None
    num_test_steps_per_epoch = -(-num_test_examples //
                                 batch_size_) if num_test_examples else None
    if use_horovod and FLAGS.horovod_eval and num_test_examples:
        num_test_steps_per_epoch = -(-num_test_examples //
                                     (batch_size_ * hvd.size()))
    logging.info('num_test_examples:', num_test_examples)

    summary = tf.contrib.summary
    # writer = summary.create_file_writer(FLAGS.log_dir + '/epoch')
    # writer_train = summary.create_file_writer(FLAGS.log_dir + '/train')
    # writer_valid = summary.create_file_writer(FLAGS.log_dir + '/valid')
    writer = summary.create_file_writer(FLAGS.log_dir)
    writer_train = summary.create_file_writer(FLAGS.log_dir)
    writer_valid = summary.create_file_writer(FLAGS.log_dir)
    global_step = tf.train.get_or_create_global_step()
    ## RuntimeError: tf.summary.FileWriter is not compatible with eager execution. Use tf.contrib.summary instead.
    #logger = gezi.SummaryWriter(FLAGS.log_dir)

    learning_rate = tfe.Variable(FLAGS.learning_rate, name="learning_rate")

    tf.add_to_collection('learning_rate', learning_rate)

    learning_rate_weight = tf.get_collection('learning_rate_weight')[-1]
    try:
        learning_rate_weights = tf.get_collection('learning_rate_weights')[-1]
    except Exception:
        learning_rate_weights = None

    # ckpt dir save models one per epoch
    ckpt_dir = os.path.join(FLAGS.model_dir, 'ckpt')
    os.system('mkdir -p %s' % ckpt_dir)
    # HACK this ckpt dir actually saves mini-epoch checkpoints, e.g. when save_interval_epochs=0.1; this is useful when training on a large dataset
    ckpt_dir2 = os.path.join(FLAGS.model_dir, 'ckpt2')
    os.system('mkdir -p %s' % ckpt_dir2)

    #TODO FIXME for now the tf code was changed so it does not, by default, keep only the latest 5 checkpoints
    # refer to https://github.com/tensorflow/tensorflow/issues/22036
    # manager = tf.contrib.checkpoint.CheckpointManager(
    #     checkpoint, directory=ckpt_dir, max_to_keep=5)
    # latest_checkpoint = manager.latest_checkpoint

    latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir)
    if latest_checkpoint:
        logging.info('Latest checkpoint:', latest_checkpoint)
    else:
        latest_checkpoint = tf.train.latest_checkpoint(ckpt_dir2)
        logging.info('Latest checkpoint:', latest_checkpoint)

    if os.path.exists(FLAGS.model_dir + '.index'):
        latest_checkpoint = FLAGS.model_dir

    if 'test' in FLAGS.work_mode or 'valid' in FLAGS.work_mode:
        #assert not os.path.isdir(FLAGS.model_dir), FLAGS.model_dir
        latest_checkpoint = FLAGS.model_dir
        #assert os.path.exists(latest_checkpoint) and os.path.isfile(latest_checkpoint)

    checkpoint_prefix = os.path.join(ckpt_dir, 'ckpt')
    checkpoint_prefix2 = os.path.join(ckpt_dir2, 'ckpt')

    if not FLAGS.torch:
        try:
            optimizer = optimizer or melt.get_optimizer(
                FLAGS.optimizer)(learning_rate)
        except Exception:
            logging.warning(
                f'Failed to use {FLAGS.optimizer}, using adam instead')
            optimizer = melt.get_optimizer('adam')(learning_rate)

        # TODO...
        if learning_rate_weights is None:
            checkpoint = tf.train.Checkpoint(
                learning_rate=learning_rate,
                learning_rate_weight=learning_rate_weight,
                model=model,
                optimizer=optimizer,
                global_step=global_step)
        else:
            checkpoint = tf.train.Checkpoint(
                learning_rate=learning_rate,
                learning_rate_weight=learning_rate_weight,
                learning_rate_weights=learning_rate_weights,
                model=model,
                optimizer=optimizer,
                global_step=global_step)

        checkpoint.restore(latest_checkpoint)
        checkpoint2 = copy.deepcopy(checkpoint)

        start_epoch = int(
            latest_checkpoint.split('-')
            [-1]) if latest_checkpoint and 'ckpt' in latest_checkpoint else 0
        start_step = 0  # TODO
    else:
        # TODO torch with learning rate adjust
        # https://github.com/horovod/horovod/blob/master/examples/pytorch_mnist.py
        # TODO full support for pytorch does not work yet

        if optimizer is None:
            import lele
            is_dynamic_opt = True
            if FLAGS.optimizer == 'noam':
                optimizer_ = torch.optim.Adamax(model.parameters(), lr=0)
                if use_horovod:
                    optimizer_ = hvd.DistributedOptimizer(optimizer_)
                optimizer = lele.training.optimizers.NoamOpt(
                    128, 2, 4000, optimizer_)
            elif FLAGS.optimizer == 'bert':
                num_train_steps = int(
                    num_steps_per_epoch *
                    (FLAGS.num_decay_epochs or FLAGS.num_epochs))
                if FLAGS.warmup_steps and use_horovod:
                    FLAGS.warmup_steps = max(
                        int(FLAGS.warmup_steps / hvd.size()), 1)
                num_warmup_steps = FLAGS.warmup_steps or int(
                    num_steps_per_epoch * FLAGS.warmup_epochs) or int(
                        num_train_steps * FLAGS.warmup_proportion)
                logging.info('num_train_steps', num_train_steps,
                             'num_warmup_steps', num_warmup_steps,
                             'warmup_proportion', FLAGS.warmup_proportion)
                optimizer_ = torch.optim.Adamax(model.parameters(), lr=0)
                if use_horovod:
                    optimizer_ = hvd.DistributedOptimizer(optimizer_)
                optimizer = lele.training.optimizers.BertOpt(
                    FLAGS.learning_rate, FLAGS.min_learning_rate,
                    num_train_steps, num_warmup_steps, optimizer_)
            else:
                is_dynamic_opt = False
                optimizer = torch.optim.Adamax(
                    param_groups if param_groups else model.parameters(),
                    lr=FLAGS.learning_rate)
                if use_horovod:
                    optimizer = hvd.DistributedOptimizer(optimizer)

        start_epoch = 0
        latest_path = latest_checkpoint + '.pyt' if latest_checkpoint else os.path.join(
            FLAGS.model_dir, 'latest.pyt')
        if not os.path.exists(latest_path):
            latest_path = os.path.join(FLAGS.model_dir, 'latest.pyt')
        if os.path.exists(latest_path):
            logging.info('loading torch model from', latest_path)
            checkpoint = torch.load(latest_path)
            if not FLAGS.torch_finetune:
                start_epoch = checkpoint['epoch']
                step = checkpoint['step']
                global_step.assign(step + 1)
            load_torch_model(model, latest_path)
            if FLAGS.torch_load_optimizer:
                optimizer.load_state_dict(checkpoint['optimizer'])

        # TODO with this approach a restart cannot change the learning rate..
        if learning_rate_weights is None:
            checkpoint = tf.train.Checkpoint(
                learning_rate=learning_rate,
                learning_rate_weight=learning_rate_weight,
                global_step=global_step)
        else:
            checkpoint = tf.train.Checkpoint(
                learning_rate=learning_rate,
                learning_rate_weight=learning_rate_weight,
                learning_rate_weights=learning_rate_weights,
                global_step=global_step)

        try:
            checkpoint.restore(latest_checkpoint)
            checkpoint2 = copy.deepcopy(checkpoint)
        except Exception:
            pass

    if FLAGS.torch and is_dynamic_opt:
        optimizer._step = global_step.numpy()

    #model.load_weights(os.path.join(ckpt_dir, 'ckpt-1'))
    #model.save('./weight3.hd5')
    logging.info('optimizer:', optimizer)

    if FLAGS.torch_lr:
        learning_rate.assign(optimizer.rate(1))
    if FLAGS.torch:
        learning_rate.assign(optimizer.param_groups[0]['lr'])
        logging.info('learning rate restored from pytorch latest.pyt as',
                     learning_rate.numpy())

    learning_rate.assign(learning_rate * FLAGS.learning_rate_start_factor)
    if learning_rate_weights is not None:
        learning_rate_weights.assign(learning_rate_weights *
                                     FLAGS.learning_rate_start_factor)

    # TODO fractional epoch counts like 0.1 are currently not supported
    num_epochs = FLAGS.num_epochs if FLAGS.num_epochs != 0 else 1024

    will_valid = valid_dataset and not FLAGS.work_mode == 'test' and not 'SHOW' in os.environ and not 'QUICK' in os.environ
    if global_step.numpy() == 0:
        will_valid = False

    if gezi.get_env('EVFIRST') == '1':
        will_valid = True

    if gezi.get_env('EVFIRST') == '0':
        will_valid = False

    if will_valid:
        logging.info('----------valid')
        if hasattr(model, 'eval'):
            model.eval()
        names = None
        if evaluate_fn is not None:
            vals, names = evaluate_fn(model, valid_dataset,
                                      tf.train.latest_checkpoint(ckpt_dir),
                                      num_valid_steps_per_epoch)
        elif eval_fn:
            model_path = None if not write_valid else latest_checkpoint
            names = valid_names if valid_names is not None else [
                infer_names[0]
            ] + [x + '_y' for x in infer_names[1:]
                 ] + infer_names[1:] if infer_names else None

            logging.info('model_path:', model_path, 'model_dir:',
                         FLAGS.model_dir)
            vals, names = evaluate(model,
                                   valid_dataset,
                                   eval_fn,
                                   model_path,
                                   names,
                                   valid_write_fn,
                                   write_streaming,
                                   num_valid_steps_per_epoch,
                                   num_valid_examples,
                                   suffix=valid_suffix,
                                   sep=sep)
        if names:
            logging.info2(
                'epoch:%.2f/%d step:%d' %
                (global_step.numpy() / num_steps_per_epoch, num_epochs,
                 global_step.numpy()),
                ['%s:%.4f' % (name, val) for name, val in zip(names, vals)])

        if FLAGS.work_mode == 'valid' or gezi.get_env('METRIC') == '1':
            exit(0)

    if 'test' in FLAGS.work_mode or gezi.get_env(
            'TEST') == '1' or gezi.get_env('INFER') == '1':
        logging.info('--------test/inference')
        if test_dataset:
            if hasattr(model, 'eval'):
                model.eval()
            if inference_fn is None:
                # model_path = FLAGS.model_dir + '.pyt' if not latest_checkpoint else latest_checkpoint
                # logging.info('model_path', model_path)
                assert latest_checkpoint
                inference(model,
                          test_dataset,
                          latest_checkpoint,
                          infer_names,
                          infer_debug_names,
                          infer_write_fn,
                          write_streaming,
                          num_test_steps_per_epoch,
                          num_test_examples,
                          suffix=infer_suffix)
            else:
                inference_fn(model, test_dataset,
                             tf.train.latest_checkpoint(ckpt_dir),
                             num_test_steps_per_epoch)
        exit(0)

    if 'SHOW' in os.environ:
        num_epochs = start_epoch + 1

    class PytObj(object):
        def __init__(self, x):
            self.x = x

        def numpy(self):
            return self.x

    class PytMean(object):
        def __init__(self):
            self._val = 0.
            self.count = 0

            self.is_call = True

        def clear(self):
            self._val = 0
            self.count = 0

        def __call__(self, val):
            if not self.is_call:
                self.clear()
                self.is_call = True
            self._val += val.item()
            self.count += 1

        def result(self):
            if self.is_call:
                self.is_call = False
            if not self.count:
                val = 0
            else:
                val = self._val / self.count
            # TODO: returned as PytObj just for compatibility with the tf metric interface
            return PytObj(val)

    Mean = tfe.metrics.Mean if not FLAGS.torch else PytMean

    num_insts = 0

    if FLAGS.learning_rate_decay_factor > 0:
        #assert FLAGS.learning_rate_values is None, 'use exponential_decay or piecewise_constant?'
        # NOTICE: if you finetune or otherwise change the batch size, set num_steps_per_decay directly,
        # since global_step / decay_steps no longer maps to the intended epoch once num_steps_per_epoch changes;
        # in that case you also have to reset global_step to the corresponding fixed step.
        assert FLAGS.num_steps_per_decay or (
            FLAGS.num_epochs_per_decay and num_steps_per_epoch
        ), 'must set num_steps_per_decay or (num_epochs_per_decay and num_steps_per_epoch)'
        decay_steps = FLAGS.num_steps_per_decay or int(
            num_steps_per_epoch * FLAGS.num_epochs_per_decay)
        decay_start_step = FLAGS.decay_start_step or int(
            num_steps_per_epoch * FLAGS.decay_start_epoch)
        # decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
        logging.info(
            'learning_rate_decay_factor:{} decay_epochs:{} decay_steps:{} decay_start_epoch:{} decay_start_step:{}'
            .format(FLAGS.learning_rate_decay_factor,
                    FLAGS.num_epochs_per_decay, decay_steps,
                    FLAGS.decay_start_epoch, decay_start_step))

    #-------------------------start training
    if hasattr(model, 'train'):
        model.train()

    timer = gezi.Timer()
    loss_avg = Mean()
    valid_loss_avg = Mean()

    num_epochs = num_epochs if num_epochs else 0
    loops = min(num_epochs, 1) if FLAGS.torch_only else 1
    for _ in range(loops):
        for i, (x, y) in enumerate(train_dataset):
            # Debug-only inspection of the first few batches; the original
            # unconditional `continue` (and `exit(0)` at i == 3) would skip
            # training entirely, so it is left commented out here.
            # print(len(x['index']), len(x['value']), len(x['id']))
            # print(x['index'][0].size(), x['index'][1].size(), y.size())
            # print(x['value'][0].size(), x['value'][1].size(), y.size())
            # print(x['id'][0], x['id'][1], y.size())
            # if i == 3:
            #     exit(0)
            # continue

            if FLAGS.torch:
                x, y = to_torch(x, y)
                if is_dynamic_opt:
                    learning_rate.assign(optimizer.rate())

            def loss_fn_(x, y):
                if not FLAGS.torch and 'training' in inspect.getfullargspec(
                        model.call).args:
                    y_ = model(x, training=True)
                else:
                    y_ = model(x)
                if not FLAGS.torch:
                    return loss_fn(y, y_)
                else:
                    return loss_fn(y_, y)

            if not FLAGS.torch:
                loss, grads = melt.eager.grad(model, x, y, loss_fn)
                grads, _ = tf.clip_by_global_norm(grads, FLAGS.clip_gradients)
                #optimizer.apply_gradients(zip(grads, model.variables))
                optimizer.apply_gradients(zip(grads,
                                              model.trainable_variables))
                # https://github.com/horovod/horovod/blob/master/examples/tensorflow_mnist_eager.py
                # Horovod: broadcast initial variable states from rank 0 to all other processes.
                # This is necessary to ensure consistent initialization of all workers when
                # training is started with random weights or restored from a checkpoint.
                # Note: broadcast should be done after the first gradient step to ensure optimizer
                # initialization.
                # TODO check eager mode
                if use_horovod and epoch == start_epoch and i == 0:
                    hvd.broadcast_variables(model.variables, root_rank=0)
                    hvd.broadcast_variables(optimizer.variables(),
                                            root_rank=0)
            else:
                optimizer.zero_grad()
                loss = loss_fn_(x, y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               FLAGS.clip_gradients)
                optimizer.step()

            global_step.assign_add(1)
            loss_avg(loss)

            ## https://discuss.pytorch.org/t/calling-loss-backward-reduce-memory-usage/2735
            # if FLAGS.torch:
            #   del loss

            batch_size_ = list(
                x.values())[0].shape[FLAGS.batch_size_dim] if isinstance(
                    x, dict) else x.shape[FLAGS.batch_size_dim]
            num_insts += int(batch_size_)
            if global_step.numpy() % FLAGS.interval_steps == 0:
                #checkpoint.save(checkpoint_prefix)
                elapsed = timer.elapsed()
                steps_per_second = FLAGS.interval_steps / elapsed
                instances_per_second = num_insts / elapsed
                num_insts = 0

                if num_steps_per_epoch is None:
                    epoch_time_info = ''
                else:
                    hours_per_epoch = num_steps_per_epoch / FLAGS.interval_steps * elapsed / 3600
                    epoch_time_info = '1epoch:[{:.2f}h]'.format(
                        hours_per_epoch)

                if valid_dataset2:
                    # try:
                    #   x, y = next(iter(valid_dataset2))
                    # except Exception:
                    #   # TODO FIXME how.. iterate stop restart.., here hack for my iterator see projects/lm/dataset
                    #   x, y = next(iter(valid_dataset2))
                    # valid_dataset2 is repeated; note that iter(valid_dataset2)
                    # would always return the first batch, hence the persistent
                    # iterator used below
                    #x, y = next(iter(valid_dataset2))
                    x, y = next(valid_dataset2_iter)
                    #print(x['id'][0])
                    if FLAGS.torch:
                        x, y = to_torch(x, y)
                    if hasattr(model, 'eval'):
                        model.eval()
                    valid_loss = loss_fn_(x, y)
                    valid_loss = valid_loss.numpy(
                    ) if not FLAGS.torch else valid_loss.item()
                    if hasattr(model, 'train'):
                        model.train()

                    if not use_horovod or hvd.rank() == 0:
                        # 'train_loss:[%.4f]' % loss_avg.result().numpy(),
                        # 'valid_loss:[%.4f]' % valid_loss_avg.result().numpy()
                        logging.info2(
                            'epoch:%.2f/%d' %
                            ((global_step.numpy() / num_steps_per_epoch),
                             num_epochs), 'step:%d' % global_step.numpy(),
                            'elapsed:[%.2f]' % elapsed, 'batch_size:[%d]' %
                            batch_size_, 'gpus:[%d]' % num_gpus,
                            'batches/s:[%.2f]' % steps_per_second,
                            'insts/s:[%d]' % instances_per_second,
                            '%s' % epoch_time_info,
                            'lr:[%.6f]' % learning_rate.numpy(),
                            'train_loss:[%.4f]' % loss_avg.result().numpy(),
                            'valid_loss:[%.4f]' % valid_loss)
                        if global_step.numpy(
                        ) % FLAGS.valid_interval_steps == 0:
                            with writer_valid.as_default(
                            ), summary.always_record_summaries():
                                summary.scalar('loss/valid', valid_loss)
                                writer_valid.flush()
                else:
                    if not use_horovod or hvd.rank() == 0:
                        #'train_loss:[%.4f]' % loss_avg.result().numpy()
                        logging.info2(
                            'epoch:%.2f/%d' %
                            ((epoch + i / num_steps_per_epoch), num_epochs),
                            'step:%d' % global_step.numpy(), 'elapsed:[%.2f]' %
                            elapsed, 'batch_size:[%d]' % batch_size_,
                            'gpus:[%d]' % num_gpus,
                            'batches/s:[%.2f]' % steps_per_second,
                            'insts/s:[%d]' % instances_per_second,
                            '%s' % epoch_time_info,
                            'lr:[%.6f]' % learning_rate.numpy(),
                            'train_loss:[%.4f]' % loss_avg.result().numpy())

                if not use_horovod or hvd.rank() == 0:
                    if global_step.numpy() % FLAGS.valid_interval_steps == 0:
                        with writer_train.as_default(
                        ), summary.always_record_summaries():
                            summary.scalar('loss/train_avg',
                                           loss_avg.result().numpy())
                            summary.scalar('learning_rate',
                                           learning_rate.numpy())
                            summary.scalar('other/batch_size', batch_size_)
                            summary.scalar('other/epoch', melt.epoch())
                            summary.scalar('perf/steps_per_second',
                                           steps_per_second)
                            summary.scalar('perf/instances_per_second',
                                           instances_per_second)
                            writer_train.flush()

            if valid_dataset and FLAGS.metric_eval_interval_steps and global_step.numpy(
            ) and global_step.numpy() % FLAGS.metric_eval_interval_steps == 0:
                if hasattr(model, 'eval'):
                    model.eval()
                vals, names = None, None
                if evaluate_fn is not None:
                    vals, names = evaluate_fn(model, valid_dataset, None,
                                              num_valid_steps_per_epoch)
                elif eval_fn:
                    names = valid_names if valid_names is not None else [
                        infer_names[0]
                    ] + [x + '_y' for x in infer_names[1:]
                         ] + infer_names[1:] if infer_names else None
                    vals, names = evaluate(model,
                                           valid_dataset,
                                           eval_fn,
                                           None,
                                           names,
                                           valid_write_fn,
                                           write_streaming,
                                           num_valid_steps_per_epoch,
                                           num_valid_examples,
                                           sep=sep)
                if not use_horovod or hvd.rank() == 0:
                    if vals and names:
                        with writer_valid.as_default(
                        ), summary.always_record_summaries():
                            for name, val in zip(names, vals):
                                summary.scalar(f'step_eval/{name}', val)
                            writer_valid.flush()

                if FLAGS.torch:
                    if not FLAGS.torch_lr:
                        # control learning rate by tensorflow learning rate
                        for param_group in optimizer.param_groups:
                            # important learning rate decay
                            param_group['lr'] = learning_rate.numpy()
                if hasattr(model, 'train'):
                    model.train()
                if not use_horovod or hvd.rank() == 0:
                    if names and vals:
                        logging.info2(
                            'epoch:%.2f/%d' %
                            ((global_step.numpy() / num_steps_per_epoch),
                             num_epochs),
                            'valid_step:%d' % global_step.numpy(),
                            'valid_metrics', [
                                '%s:%.5f' % (name, val)
                                for name, val in zip(names, vals)
                            ])

            if not use_horovod or hvd.rank() == 0:
                # TODO save ok ?
                if global_step.numpy() % FLAGS.save_interval_steps == 0:
                    if FLAGS.torch:
                        state = {
                            'epoch':
                            int(global_step.numpy() / num_steps_per_epoch),
                            'step':
                            global_step.numpy(),
                            'state_dict':
                            model.state_dict() if not hasattr(model, 'module')
                            else model.module.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                        }
                        torch.save(state,
                                   os.path.join(FLAGS.model_dir, 'latest.pyt'))

                # TODO: fix why using both checkpoint2 and checkpoint together does not work..
                if FLAGS.save_interval_epochs and global_step.numpy() % int(
                        num_steps_per_epoch * FLAGS.save_interval_epochs) == 0:
                    checkpoint2.save(checkpoint_prefix2)
                    if FLAGS.torch:
                        state = {
                            'epoch':
                            int(global_step.numpy() / num_steps_per_epoch),
                            'step':
                            global_step.numpy(),
                            'state_dict':
                            model.state_dict() if not hasattr(model, 'module')
                            else model.module.state_dict(),
                            'optimizer':
                            optimizer.state_dict(),
                        }
                        torch.save(
                            state,
                            tf.train.latest_checkpoint(ckpt_dir2) + '.pyt')

            if FLAGS.learning_rate_decay_factor > 0:
                if global_step.numpy(
                ) >= decay_start_step and global_step.numpy(
                ) % decay_steps == 0:
                    lr = max(
                        learning_rate.numpy() *
                        FLAGS.learning_rate_decay_factor,
                        FLAGS.min_learning_rate)
                    if lr < learning_rate.numpy():
                        learning_rate.assign(lr)
                        if FLAGS.torch:
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = learning_rate.numpy()

            if i == 0:
                try:
                    if not FLAGS.torch:
                        logging.info(model.summary())
                        # #tf.keras.utils.plot_model(model, to_file='/home/gezi/model.png', show_shapes=False, show_layer_names=True, rankdir='TB')
                        # import keras
                        # keras.utils.plot_model(model, to_file='/home/gezi/model.png', show_shapes=False, show_layer_names=True, rankdir='LR', expand_nested=True, dpi=96)
                    else:
                        logging.info(model)
                except Exception:
                    traceback.print_exc()
                    logging.info(
                        'Failed to run model.summary(); maybe a layer is defined in __init__ but not used in call()'
                    )
                if 'SHOW' in os.environ:
                    exit(0)

            if valid_dataset and global_step.numpy() % int(
                    num_steps_per_epoch * FLAGS.valid_interval_epochs) == 0:
                if hasattr(model, 'eval'):
                    model.eval()

                vals, names = None, None
                if evaluate_fn is not None:
                    vals, names = evaluate_fn(
                        model, valid_dataset,
                        tf.train.latest_checkpoint(ckpt_dir),
                        num_valid_steps_per_epoch)
                elif eval_fn:
                    model_path = None if not write_valid else tf.train.latest_checkpoint(
                        ckpt_dir)
                    print('---------metric evaluate step', global_step.numpy(),
                          'model_path:', model_path)
                    names = valid_names if valid_names is not None else [
                        infer_names[0]
                    ] + [x + '_y' for x in infer_names[1:]
                         ] + infer_names[1:] if infer_names else None

                    vals, names = evaluate(model,
                                           valid_dataset,
                                           eval_fn,
                                           model_path,
                                           names,
                                           valid_write_fn,
                                           write_streaming,
                                           num_valid_steps_per_epoch,
                                           num_valid_examples,
                                           suffix=valid_suffix,
                                           sep=sep)

                if not use_horovod or hvd.rank() == 0:
                    if vals and names:
                        logging.info2(
                            'epoch:%.2f/%d' %
                            (global_step.numpy() / num_steps_per_epoch,
                             num_epochs), 'step:%d' % global_step.numpy(),
                            'valid_metrics', [
                                '%s:%.5f' % (name, val)
                                for name, val in zip(names, vals)
                            ])

                if not use_horovod or hvd.rank() == 0:
                    with writer.as_default(), summary.always_record_summaries(
                    ):
                        # temporarily reindex global_step by eval round so the
                        # eval/* summaries are plotted per validation interval
                        temp = global_step.value()
                        global_step.assign(
                            int(global_step.numpy() /
                                int(num_steps_per_epoch *
                                    FLAGS.valid_interval_epochs)))
                        if valid_dataset:
                            if hasattr(model, 'eval'):
                                model.eval()
                            if vals and names:
                                for name, val in zip(names, vals):
                                    summary.scalar(f'eval/{name}', val)
                        writer.flush()
                        global_step.assign(temp)

            if test_dataset and global_step.numpy() % int(
                    num_steps_per_epoch *
                    FLAGS.inference_interval_epochs) == 0:
                if hasattr(model, 'eval'):
                    model.eval()
                if inference_fn is None:
                    inference(model,
                              test_dataset,
                              tf.train.latest_checkpoint(ckpt_dir),
                              infer_names,
                              infer_debug_names,
                              infer_write_fn,
                              write_streaming,
                              num_test_steps_per_epoch,
                              num_test_examples,
                              suffix=infer_suffix,
                              sep=sep)
                else:
                    inference_fn(model, test_dataset,
                                 tf.train.latest_checkpoint(ckpt_dir),
                                 num_test_steps_per_epoch)

            if num_epochs and (global_step.numpy() %
                               num_steps_per_epoch) == 0 and int(
                                   global_step.numpy() /
                                   num_steps_per_epoch) == num_epochs:
                logging.info(f'Finished training of {num_epochs} epochs')
                exit(0)
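
# A minimal standalone sketch (not part of the example above) of the stepwise
# learning-rate decay the training loop applies: every `decay_steps` steps after
# `decay_start_step`, the rate is multiplied by `learning_rate_decay_factor`
# but is floored at `min_learning_rate`. Names below are illustrative only.
def decayed_lr(current_lr, step, decay_start_step, decay_steps, decay_factor,
               min_lr):
    """Return the learning rate to use after `step`, mirroring the loop above."""
    if step >= decay_start_step and step % decay_steps == 0:
        return max(current_lr * decay_factor, min_lr)
    return current_lr

# Example: decaying by 0.5 every 1000 steps starting at step 2000 turns an
# initial 0.01 into 0.005 at step 2000 and 0.0025 at step 3000.
lr = 0.01
for step in (1000, 2000, 3000):
    lr = decayed_lr(lr, step, decay_start_step=2000, decay_steps=1000,
                    decay_factor=0.5, min_lr=1e-6)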
Example #18
def main(_):
    hvd.init()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.rank())
    if hvd.rank() == 0:
        tf.logging.set_verbosity(tf.logging.INFO)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    tf.gfile.MakeDirs(FLAGS.output_dir)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file)
    label_list = tokenizer.get_labels()

    model_dir = FLAGS.output_dir if hvd.rank() == 0 else None
    run_config = tf.estimator.RunConfig(
        model_dir=model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        log_step_count_steps=10)

    train_examples = None
    num_train_steps = 0
    num_warmup_steps = 0

    if FLAGS.do_train:
        train_examples = read_datagrand_examples(input_file=FLAGS.data_dir +
                                                 FLAGS.train_file,
                                                 tokenizer=tokenizer,
                                                 has_label=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_epochs_per_eval)
        num_train_steps = num_train_steps // hvd.size()
        num_warmup_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_epochs_warmup)
        num_warmup_steps = num_warmup_steps // hvd.size()
        # different gpu has different shuffled data
        rng = random.Random(hvd.rank())
        rng.shuffle(train_examples)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                model_dir=model_dir,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir,
                                  "train_shard{}.tf_record".format(hvd.rank()))
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            batch_size=FLAGS.train_batch_size,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # prepare eval dataset
        if hvd.rank() == 0:
            eval_gold_examples = read_datagrand_examples(
                input_file=FLAGS.data_dir + FLAGS.eval_gold_file,
                tokenizer=tokenizer,
                has_label=True)
            num_actual_eval_examples = len(eval_gold_examples)

            eval_gold_file = os.path.join(FLAGS.output_dir,
                                          "eval_gold.tf_record")
            file_based_convert_examples_to_features(eval_gold_examples,
                                                    label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, eval_gold_file)

            eval_examples = read_datagrand_examples(input_file=FLAGS.data_dir +
                                                    FLAGS.eval_file,
                                                    tokenizer=tokenizer,
                                                    has_label=False)

            eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
            file_based_convert_examples_to_features(eval_examples,
                                                    label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer,
                                                    eval_file,
                                                    has_label=False)

        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        f1_history = []
        max_train_iter = 3
        train_iter = 0
        while train_iter < max_train_iter:
            # early stopping condition:
            # f1 score doesn't increase within 5 iterations
            estimator.train(input_fn=train_input_fn,
                            steps=num_train_steps,
                            hooks=[bcast_hook])
            train_iter += 1
            # evaluation
            if hvd.rank() == 0:
                tf.logging.info("***** Running evaluation *****")
                tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                                len(eval_examples), num_actual_eval_examples,
                                len(eval_examples) - num_actual_eval_examples)
                tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
                # This tells the estimator to run through the entire set.
                eval_steps = None
                eval_input_fn = file_based_input_fn_builder(
                    input_file=eval_gold_file,
                    batch_size=FLAGS.eval_batch_size,
                    seq_length=FLAGS.max_seq_length,
                    is_training=False,
                    drop_remainder=False)

                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps)
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))

                eval_input_fn = file_based_input_fn_builder(
                    input_file=eval_file,
                    batch_size=FLAGS.eval_batch_size,
                    seq_length=FLAGS.max_seq_length,
                    is_training=False,
                    drop_remainder=False)

                results = estimator.predict(input_fn=eval_input_fn)

                output_predict_file = os.path.join(FLAGS.output_dir,
                                                   "eval_predictions.txt")
                from_predictions_to_file(eval_examples, results,
                                         output_predict_file, tokenizer)

                metrics, metrics_val = compute_metrics(
                    output_predict_file, FLAGS.data_dir + FLAGS.eval_gold_file)

                tf.logging.info("eval predictions result: ")
                tf.logging.info("{}: {:.2f}".format(metrics, metrics_val))
                f1_history.append(metrics_val)
                if len(f1_history) >= 5 and f1_history[-1] <= f1_history[-5]:
                    tf.logging.info("***** training converges ******")
                    break
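
# A small hedged sketch of the early-stopping rule used above: training stops
# once the latest F1 score is no better than the one recorded five evaluations
# earlier. `should_stop` and `window` are illustrative names, not part of the
# example's API.
def should_stop(f1_history, window=5):
    return len(f1_history) >= window and f1_history[-1] <= f1_history[-window]

assert not should_stop([0.70, 0.72, 0.74, 0.75, 0.76])    # still improving
assert should_stop([0.74, 0.73, 0.73, 0.73, 0.73, 0.73])  # no gain over 5 evals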
    """  
Example #19
def main(argv):

    # read args from command `mpirun ... python hvd_run_mnist_training.py inputFilePath outputFilePath`
    inputFilePath = argv[1]
    #outputFilePath = argv[2]
    exportModelDir = argv[2]

    # Horovod: initialize Horovod.
    hvd.init()

    # Load training and eval data
    table = load_pyarrow_table(inputFilePath)

    # later I will change code to avoid using `to_pandas`
    pdf = table.to_pandas()
    train_data = np.reshape(
        np.array(np.concatenate(pdf['features']), dtype=np.float32), (-1, 784))
    train_labels = np.array(pdf['label'], dtype=np.float32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir = './mnist_convnet_model_' + str(random.randint(0, 2<<30))\
        if hvd.rank() == 0 else None

    # Create the Estimator
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=5 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])
    """
    feature_x = tf.feature_column.numeric_column("x", [784])
    feature_columns = [feature_x]
    feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
    serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    """
    """
    def serving_input_receiver_fn():
        serialized_tf_example = tf.placeholder(dtype=tf.string,
                                               shape=[None],
                                               name='input_tensors')
        receiver_tensors = {'inputs': serialized_tf_example}
        feature_spec = {'x': tf.FixedLenFeature([784],tf.float32)}
        features = tf.parse_example(serialized_tf_example, feature_spec)
        return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
    """
    """
    with open(outputFilePath, "w") as f:
        varlist = mnist_classifier.get_variable_names()
        vars = {}
        for var in varlist:
            vars[var] = mnist_classifier.get_variable_value(var).tolist()
        # the result is large (135MB). only store keys to output file for now.
        f.write(str(varlist))
    """
    def serving_input_receiver_fn():
        # The outer dimension (None) allows us to batch up inputs for
        # efficiency. However, it also means that if we want a prediction
        # for a single instance, we'll need to wrap it in an outer list.
        inputs = {"x": tf.placeholder(shape=[None, 784], dtype=tf.float32)}
        return tf.estimator.export.ServingInputReceiver(inputs, inputs)

    mnist_classifier.export_savedmodel(exportModelDir,
                                       serving_input_receiver_fn)
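
# A tiny illustration of the comment in serving_input_receiver_fn above: the
# exported signature takes a batched [None, 784] float input named "x", so a
# single instance has to be wrapped in an outer list. Plain Python only; the
# request layout is an assumption for illustration.
single_example = [0.0] * 784
request_feed = {"x": [single_example]}  # batch of one, shape [1, 784]
assert len(request_feed["x"]) == 1 and len(request_feed["x"][0]) == 784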
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()
    filename_train = get_filenames(True, data_dir)
    filename_test = get_filenames(False, data_dir)

    # Load training and eval data
    #mnist = learn.datasets.mnist.read_data_sets('MNIST-data-%d' % hvd.rank())
    #train_data = mnist.train.images  # Returns np.array
    #train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    #eval_data = mnist.test.images  # Returns np.array
    #eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    model_dir1 = model_dir if hvd.rank() == 0 else None

    # Create the Estimator
    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir1,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=500)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
    # rank 0 to all other processes. This is necessary to ensure consistent
    # initialization of all workers when training is started with random weights or
    # restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
    input_function = input_fn

    # Train the model
    def input_fn_train():
        return input_function(is_training=True,
                              data_dir=data_dir,
                              batch_size=batch_size,
                              filenames=filename_train,
                              num_epochs=epochs_between_evals)

    # Horovod: adjust number of steps based on number of GPUs.
    classifier.train(input_fn=input_fn_train,
                     steps=20000 // hvd.size(),
                     hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results

    def input_fn_eval():
        return input_function(is_training=False,
                              data_dir=flags_obj.data_dir,
                              batch_size=batch_size,
                              filenames=filename_test,
                              num_epochs=1)

    eval_results = classifier.evaluate(input_fn=input_fn_eval)
    print(eval_results)
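
# A minimal sketch of the two Horovod scaling rules these examples rely on:
# each worker runs total_steps // world_size steps so the whole job still sees
# roughly the same number of examples, and the learning rate is multiplied by
# the world size to compensate for the larger effective batch. Plain arithmetic;
# `world_size` stands in for hvd.size().
def scale_for_workers(total_steps, base_lr, world_size):
    per_worker_steps = total_steps // world_size
    scaled_lr = base_lr * world_size
    return per_worker_steps, scaled_lr

steps, lr = scale_for_workers(total_steps=20000, base_lr=0.001, world_size=4)
assert steps == 5000 and abs(lr - 0.004) < 1e-12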
Example #21
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if FLAGS.use_fp16:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"

    if FLAGS.horovod:
        import horovod.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    config = tf.ConfigProto()
    if FLAGS.horovod:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        config.gpu_options.allow_growth = True
        if len(input_files) < hvd.size():
            raise ValueError("Input Files must be sharded")
    #    config.gpu_options.per_process_gpu_memory_fraction = 0.7
    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if not FLAGS.horovod or hvd.rank() == 0 else None,
        # This variable controls how often estimator reports examples/sec.
        # Default value is every 100 steps.
        # When --report_loss is True, we set to very large value to prevent
        # default info reporting from estimator.
        # Ideally we should set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate
                                if not FLAGS.horovod else FLAGS.learning_rate *
                                hvd.size(),
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_one_hot_embeddings=False,
                                hvd=None if not FLAGS.horovod else hvd)

    training_hooks = []
    if FLAGS.report_loss and (not FLAGS.horovod or hvd.rank() == 0):
        global_batch_size = FLAGS.train_batch_size if not FLAGS.horovod else FLAGS.train_batch_size * hvd.size(
        )
        training_hooks.append(_LogSessionRunHook(global_batch_size, 100))
    if FLAGS.horovod:
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.horovod else hvd)
        estimator.train(input_fn=train_input_fn,
                        hooks=training_hooks,
                        max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval and (not FLAGS.horovod or hvd.rank() == 0):
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.horovod else hvd)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #22
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    os.environ['KMP_SETTINGS'] = str(1)
    os.environ['KMP_BLOCKTIME'] = str(0)
    os.environ['OMP_NUM_THREADS'] = str(threads)
    os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'

    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = threads
    config.inter_op_parallelism_threads = pools
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    (mnist_images, mnist_labels), _ = \
        tf.keras.datasets.mnist.load_data(path=os.path.join(args.datadir, 'mnist.npz'))

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
                 tf.float32), tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(args.batch_size)

    checkpoint_dir = os.path.join(args.modeldir, 'checkpoints')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model,
                                     optimizer=opt,
                                     step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images,
                 labels)) in enumerate(dataset.take(2000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)
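
# A minimal sketch of restoring the objects saved by checkpoint.save() above.
# Checkpoint.save() treats its argument as a file prefix, so the matching
# restore looks up the latest checkpoint in the containing directory. The
# model/optimizer construction and `model_dir` are assumptions for illustration.
import tensorflow as tf

def restore_latest(model, opt, model_dir):
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model,
                                     optimizer=opt,
                                     step_counter=step_counter)
    latest = tf.train.latest_checkpoint(model_dir)
    if latest is not None:
        checkpoint.restore(latest)
    return checkpoint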
Example #23
def main():
    # Parse essential args
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name",
                        required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--pretrain_tfrecords", type=str)

    parser.add_argument("--seed", type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)

    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)

    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", action='store_true')

    parser.add_argument("--optimizer",
                        default="adam",
                        type=str,
                        help="adam or lamb")

    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Set up tensorflow
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(config.xla)
    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": config.amp})
    tf.random.set_seed(config.seed)

    # Set up config
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir,
                                        'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    perf_metrics = dict()
    perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")

    eval_metrics = dict()
    eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
        name="masked_lm_accuracy")
    eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean(
        name="masked_lm_loss")
    if config.electra_objective:
        eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(
                name="disc_accuracy")
            eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy(
                name="disc_precision")
            eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy(
                name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(config,
                                         config.train_batch_size,
                                         world_size=get_world_size(),
                                         rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer)
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    if config.do_train:
        # Set up checkpoint manager
        checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(
            checkpoint,
            config.checkpoints_dir,
            max_to_keep=config.keep_checkpoint_max)
        iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
        iter_manager = tf.train.CheckpointManager(
            iter_checkpoint,
            os.path.join(config.checkpoints_dir,
                         'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
            checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
            max_to_keep=config.keep_checkpoint_max)
        if config.restore_checkpoint and manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            log(" ** Restored model checkpoint from {}".format(
                manager.latest_checkpoint))
            if iter_manager.latest_checkpoint:
                iter_checkpoint.restore(iter_manager.latest_checkpoint)
                log(" ** Restored iterator checkpoint from {}".format(
                    iter_manager.latest_checkpoint),
                    all_rank=True)
        else:
            log(" ** Initializing from scratch.")

        utils.heading("Running training")
        train_start, start_step = time.time(), int(checkpoint.step) - 1
        while int(checkpoint.step) <= config.num_train_steps:
            step = int(checkpoint.step)
            features = next(train_iterator)
            iter_start = time.time()

            # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
            total_loss, eval_fn_inputs = train_one_step(
                config, model, optimizer, features, step <= 1)
            # if step == 300: tf.profiler.experimental.stop()

            perf_metrics["train_perf"].update_state(config.train_batch_size *
                                                    get_world_size() /
                                                    (time.time() - iter_start))
            eval_metrics["total_loss"].update_state(values=total_loss)
            metric_fn(config, eval_metrics, eval_fn_inputs)

            if step % 100 == 0:
                log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, '
                    'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '.
                    format(
                        step, total_loss,
                        eval_metrics["masked_lm_loss"].result().numpy(),
                        eval_metrics["disc_loss"].result().numpy(),
                        eval_metrics["masked_lm_accuracy"].result().numpy() *
                        100,
                        eval_metrics["disc_accuracy"].result().numpy() * 100,
                        perf_metrics["train_perf"].result().numpy(),
                        utils.get_readable_time(time.time() - train_start),
                        utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                    all_rank=True)

                with train_summary_writer.as_default():
                    for key, m in eval_metrics.items():
                        tf.summary.scalar(key, m.result(), step=step)

                for m in eval_metrics.values():
                    m.reset_states()

            checkpoint.step.assign_add(1)
            if step % config.save_checkpoints_steps == 0:
                if is_main_process():
                    save_path = manager.save()
                    log(" ** Saved model checkpoint for step {}: {}".format(
                        step, save_path))
                iter_save_path = iter_manager.save()
                log(" ** Saved iterator checkpoint for step {}: {}".format(
                    step, iter_save_path),
                    all_rank=True)

    if config.do_eval:
        pass
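
# A small sketch of the ETA arithmetic in the logging block above: average
# seconds per step so far, multiplied by the steps remaining. Pure Python with
# illustrative names.
def eta_seconds(elapsed_seconds, step, start_step, num_train_steps):
    steps_done = max(step - start_step, 1)
    seconds_per_step = elapsed_seconds / steps_done
    return seconds_per_step * (num_train_steps - step)

# e.g. 600s spent on steps 1..100 of 1000 -> about 5400s (~1.5h) remaining
assert eta_seconds(600.0, step=100, start_step=0, num_train_steps=1000) == 5400.0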
Example #24
def main():
  # Parse args and create config
  args, base_config, base_model, config_module = get_base_config(sys.argv[1:])

  if args.mode == "interactive_infer":
    raise ValueError(
        "Interactive infer is meant to be run from an IPython",
        "notebook not from run.py."
    )

#   restore_best_checkpoint = base_config.get('restore_best_checkpoint', False)
#   # Check logdir and create it if necessary
#   checkpoint = check_logdir(args, base_config, restore_best_checkpoint)

  load_model = base_config.get('load_model', None)
  restore_best_checkpoint = base_config.get('restore_best_checkpoint', False)
  base_ckpt_dir = check_base_model_logdir(load_model, args, restore_best_checkpoint)
  base_config['load_model'] = base_ckpt_dir

  # Check logdir and create it if necessary
  checkpoint = check_logdir(args, base_config, restore_best_checkpoint)

  # Initialize Horovod
  if base_config['use_horovod']:
    import horovod.tensorflow as hvd
    hvd.init()
    if hvd.rank() == 0:
      deco_print("Using horovod")
    from mpi4py import MPI
    MPI.COMM_WORLD.Barrier()
  else:
    hvd = None

  if args.enable_logs:
    if hvd is None or hvd.rank() == 0:
      old_stdout, old_stderr, stdout_log, stderr_log = create_logdir(
          args,
          base_config
      )
    base_config['logdir'] = os.path.join(base_config['logdir'], 'logs')

  if args.mode == 'train' or args.mode == 'train_eval' or args.benchmark:
    if hvd is None or hvd.rank() == 0:
      if checkpoint is None or args.benchmark:
        if base_ckpt_dir:
          deco_print("Starting training from the base model")
        else:
          deco_print("Starting training from scratch")
      else:
        deco_print(
            "Restored checkpoint from {}. Resuming training".format(checkpoint),
        )
  elif args.mode == 'eval' or args.mode == 'infer':
    if hvd is None or hvd.rank() == 0:
      deco_print("Loading model from {}".format(checkpoint))

  # Create model and train/eval/infer
  with tf.Graph().as_default():
    model = create_model(
        args, base_config, config_module, base_model, hvd, checkpoint)
    if args.mode == "train_eval":
      train(model[0], model[1], debug_port=args.debug_port)
    elif args.mode == "train":
      train(model, None, debug_port=args.debug_port)
    elif args.mode == "eval":
      evaluate(model, checkpoint)
    elif args.mode == "infer":
      infer(model, checkpoint, args.infer_output_file)

  if args.enable_logs and (hvd is None or hvd.rank() == 0):
    sys.stdout = old_stdout
    sys.stderr = old_stderr
    stdout_log.close()
    stderr_log.close()
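
# A hedged sketch of the log-redirection pattern that the block above undoes:
# point sys.stdout/sys.stderr at log files for the duration of the run, then
# restore the originals and close the files. File names here are hypothetical,
# not what create_logdir actually uses.
import os
import sys

def redirect_logs(logdir):
    old_stdout, old_stderr = sys.stdout, sys.stderr
    stdout_log = open(os.path.join(logdir, 'stdout.log'), 'a')  # hypothetical name
    stderr_log = open(os.path.join(logdir, 'stderr.log'), 'a')  # hypothetical name
    sys.stdout, sys.stderr = stdout_log, stderr_log
    return old_stdout, old_stderr, stdout_log, stderr_log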
Example #25
def main(_):

    print(FLAGS)
    print(tf.__version__, "==tensorflow version==")

    hvd.init()

    init_checkpoint = os.path.join(FLAGS.buckets, FLAGS.init_checkpoint)
    train_file = os.path.join(FLAGS.buckets, FLAGS.train_file)
    dev_file = os.path.join(FLAGS.buckets, FLAGS.dev_file)
    checkpoint_dir = os.path.join(FLAGS.buckets, FLAGS.model_output)

    print(init_checkpoint, train_file, dev_file, checkpoint_dir)

    worker_count = hvd.size()
    task_index = hvd.local_rank()

    is_chief = task_index == 0

    print("==worker_count==", worker_count, "==local_rank==", task_index,
          "==is is_chief==", is_chief)
    cluster = ""
    target = ""

    # FLAGS.config_file = os.path.join(FLAGS.buckets, FLAGS.config_file)
    FLAGS.label_id = os.path.join(FLAGS.buckets, FLAGS.label_id)

    if FLAGS.mode == "single_task":
        train_eval_api = hvd_train_eval
    elif FLAGS.mode == "multi_task":
        train_eval_api = multitask_hvd_train_eval

    if FLAGS.run_type == "sess":
        train_eval_api.monitored_sess(
            FLAGS=FLAGS,
            worker_count=worker_count,
            task_index=task_index,
            cluster=cluster,
            is_chief=is_chief,
            target=target,
            init_checkpoint=init_checkpoint,
            train_file=train_file,
            dev_file=dev_file,
            checkpoint_dir=checkpoint_dir,
            distribution_strategy=FLAGS.distribution_strategy,
            rule_model=FLAGS.rule_model,
            parse_type=FLAGS.parse_type,
            train_op=FLAGS.train_op,
            running_type=FLAGS.running_type,
            input_target=FLAGS.input_target,
            decay=FLAGS.decay,
            warmup=FLAGS.warmup,
            distillation=FLAGS.distillation,
            temperature=FLAGS.temperature,
            distillation_ratio=FLAGS.distillation_ratio)

    elif FLAGS.run_type == "estimator":
        train_eval_api.monitored_estimator(
            FLAGS=FLAGS,
            worker_count=worker_count,
            task_index=task_index,
            cluster=cluster,
            is_chief=is_chief,
            target=target,
            init_checkpoint=init_checkpoint,
            train_file=train_file,
            dev_file=dev_file,
            checkpoint_dir=checkpoint_dir,
            distribution_strategy=FLAGS.distribution_strategy,
            rule_model=FLAGS.rule_model,
            parse_type=FLAGS.parse_type,
            train_op=FLAGS.train_op,
            running_type=FLAGS.running_type,
            input_target=FLAGS.input_target,
            decay=FLAGS.decay,
            warmup=FLAGS.warmup,
            distillation=FLAGS.distillation,
            temperature=FLAGS.temperature,
            distillation_ratio=FLAGS.distillation_ratio)
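Note that the example above derives is_chief from hvd.local_rank(), which is 0 on every node; if a single global chief is intended, hvd.rank() is the usual choice. A minimal sketch of the distinction, assuming only that Horovod is installed:

import horovod.tensorflow as hvd

hvd.init()
worker_count = hvd.size()       # total number of workers across all nodes
global_rank = hvd.rank()        # unique id in [0, size)
local_rank = hvd.local_rank()   # id within the current node only

is_global_chief = global_rank == 0   # exactly one such process per job
is_node_chief = local_rank == 0      # one such process per node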
Beispiel #26
0
def main():
    gpu_thread_count = 2
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    hvd.init()

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.force_gpu_compatible = True  # Force pinned memory
    config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads
    config.inter_op_parallelism_threads = 5
    #config.gpu_options.allow_growth = True
    log_name = 'hvd_train.txt'
    # training strategy
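    # NOTE: the training_strategy list defined directly below is immediately
    # overwritten by the second assignment that follows it; it has no effect
    # at runtime and is kept only as an alternative schedule.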

    training_strategy = [{
        'epoch': [0, 4],
        'lr': [1.0, 3.0],
        'lr_method': 'linear',
        'batch_size': 768,
        'image_size': (128, 128),
        'data_dir': '160',
        'prefix': 'train'
    }, {
        'epoch': [4, 15],
        'lr': [3.0, 0.01],
        'lr_method': 'linear',
        'batch_size': 768,
        'image_size': (128, 128),
        'data_dir': '160',
        'prefix': 'train'
    }, {
        'epoch': [15, 32],
        'lr': [0.2, 0.002],
        'lr_method': 'exp',
        'batch_size': 256,
        'image_size': (224, 224),
        'data_dir': '320',
        'prefix': 'train'
    }, {
        'epoch': [32, 37],
        'lr': [0.003, 0.0005],
        'lr_method': 'linear',
        'batch_size': 128,
        'image_size': (288, 288),
        'data_dir': '320',
        'prefix': 'train'
    }]

    training_strategy = [{
        'epoch': [0, 6],
        'lr': [1.0, 2.0],
        'lr_method': 'linear',
        'batch_size': 740,
        'image_size': (128, 128),
        'data_dir': '160',
        'prefix': 'train'
    }, {
        'epoch': [6, 21],
        'lr': [2.0, 0.45],
        'lr_method': 'linear',
        'batch_size': 740,
        'image_size': (128, 128),
        'data_dir': '160',
        'prefix': 'train'
    }, {
        'epoch': [21, 32],
        'lr': [0.45, 0.02],
        'lr_method': 'exp',
        'batch_size': 256,
        'image_size': (224, 224),
        'data_dir': '320',
        'prefix': 'train'
    }, {
        'epoch': [32, 36],
        'lr': [0.02, 0.004],
        'lr_method': 'exp',
        'batch_size': 196,
        'image_size': (224, 224),
        'data_dir': '320',
        'prefix': 'train'
    }, {
        'epoch': [36, 40],
        'lr': [0.004, 0.002],
        'lr_method': 'exp',
        'batch_size': 128,
        'image_size': (288, 288),
        'data_dir': '320',
        'prefix': 'train'
    }]

    num_training_samples = 1281167
    num_eval_samples = 50000

    cmdline = add_cli_args()
    FLAGS, unknown_args = cmdline.parse_known_args()

    do_checkpoint = hvd.rank() == 0

    barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
    tf.Session(config=config).run(barrier)

    if hvd.local_rank() == 0 and not os.path.isdir(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    barrier = hvd.allreduce(tf.constant(0, dtype=tf.float32))
    tf.Session(config=config).run(barrier)
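    # the allreduce ops above act as barriers; the second one ensures the log
    # directory created by local rank 0 exists before any rank continues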

    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR

    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler(os.path.join(FLAGS.log_dir, log_name))
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    # add handlers to logger
    logger.addHandler(fh)

    if not FLAGS.save_checkpoints_steps:
        # default to save one checkpoint per epoch
        FLAGS.save_checkpoints_steps = 625
    if not FLAGS.save_summary_steps:
        # default to save summaries once per epoch
        FLAGS.save_summary_steps = 625
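    # With these defaults, 625 steps corresponds to roughly one epoch; e.g.
    # with 1,281,167 training images and a global batch of 256 images * 8 GPUs,
    # 1281167 / (256 * 8) ~= 625 steps (illustrative numbers; the actual
    # per-GPU batch size comes from the training strategy above).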

    data_strategy, lr_strategy = dynamicpipe.lr_strategy_parsing(
        training_strategy, num_training_samples, FLAGS.num_gpus)

    num_steps = lr_strategy[-1]['steps'][-1] + FLAGS.display_every

    rank0log(logger, 'Data strategy: ' + str(data_strategy))
    rank0log(logger, 'Learning rate strategy:' + str(lr_strategy))
    rank0log(logger, 'Total Max Training Steps: ' + str(num_steps))
    rank0log(
        logger,
        'Checkpointing every ' + str(FLAGS.save_checkpoints_steps) + ' steps')
    rank0log(
        logger,
        'Saving summary every ' + str(FLAGS.save_summary_steps) + ' steps')

    rank0log(logger, 'PY: ' + str(sys.version) + ' TF: ' + str(tf.__version__))
    rank0log(logger, "Horovod size: ", hvd.size())

    classifier = tf.estimator.Estimator(
        model_fn=cnn_model_function,
        model_dir=FLAGS.log_dir,
        params={
            'n_classes': 1000,
            'mom': FLAGS.mom,
            'num_steps': num_steps,
            'wdecay': FLAGS.wdecay,
            'loss_scale': FLAGS.loss_scale,
            'num_training_samples': num_training_samples,
            'lr_strategy': lr_strategy
        },
        config=tf.estimator.RunConfig(
            session_config=config,
            save_summary_steps=FLAGS.save_summary_steps
            if do_checkpoint else None,
            save_checkpoints_steps=FLAGS.save_checkpoints_steps
            if do_checkpoint else None,
            keep_checkpoint_max=None))

    num_preproc_threads = 6
    rank0log(logger, "Using preprocessing threads per GPU: ",
             num_preproc_threads)
    training_hooks = [
        hvd.BroadcastGlobalVariablesHook(0),
        PrefillStagingAreasHook()
    ]
    if hvd.rank() == 0:
        training_hooks.append(
            LogSessionRunHook(num_training_samples, FLAGS.num_gpus,
                              FLAGS.display_every, logger))
    start_time = time.time()
    classifier.train(
        input_fn=lambda: dynamicpipe.data_pipeline(num_training_samples,
                                                   FLAGS.num_gpus,
                                                   data_strategy,
                                                   FLAGS.data_dir,
                                                   mode="TRAINING"),
        max_steps=num_steps,
        hooks=training_hooks)
    rank0log(logger, "Log: Finished in ", time.time() - start_time)

    rank0log(logger, "Log: Evaluating")
    rank0log(logger, "Log: Validation dataset size: 50000")
    eval_strategy = [{
        'epoch': 1,
        'batch_size': 128,
        'image_size': (288, 288),
        'data_dir': '320',
        'prefix': 'validation'
    }]

    #evaluation on single GPU
    #if hvd.rank() == 0:
    rank0log(logger, ' step  top1    top5     loss   checkpoint_time(UTC)')
    ckpts = sort_and_load_ckpts(FLAGS.log_dir)
    for i, c in enumerate(ckpts):
        if hvd.rank() == i % FLAGS.num_gpus:
            eval_result = classifier.evaluate(
                input_fn=lambda: dynamicpipe.data_pipeline(num_eval_samples,
                                                           1,
                                                           eval_strategy,
                                                           FLAGS.data_dir,
                                                           mode="EVAL"),
                checkpoint_path=c['path'])
            c['epoch'] = i
            c['top1'] = eval_result['val-top1acc']
            c['top5'] = eval_result['val-top5acc']
            c['loss'] = eval_result['loss']
            logger.info(
                'Log @eval: count@{:5d} step@{:5d} top1@{:5.3f} top5@{:6.2f} loss@{:6.2f} time@{time}'
                .format(c['epoch'],
                        c['step'],
                        c['top1'] * 100,
                        c['top5'] * 100,
                        c['loss'],
                        time=time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(c['mtime']))))

    rank0log(logger, "Log Finished evaluation")
Beispiel #27
0
def main(_):

    hvd.init()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

    graph = tf.Graph()
    with graph.as_default():
        import json

        config = json.load(open(FLAGS.config_file, "r"))
        init_checkpoint = FLAGS.init_checkpoint

        config = Bunch(config)
        config.use_one_hot_embeddings = True
        config.scope = "bert"
        config.dropout_prob = 0.1
        config.label_type = "single_label"

        if FLAGS.if_shard == "0":
            train_size = FLAGS.train_size
            epoch = int(FLAGS.epoch / hvd.size())
        elif FLAGS.if_shard == "1":
            train_size = int(FLAGS.train_size / hvd.size())
            epoch = FLAGS.epoch
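        # Both branches target the same amount of work per worker: either keep
        # the full train_size and shrink the epoch count by hvd.size(), or
        # shard train_size across workers and keep the epoch count. With e.g.
        # train_size=100000, epoch=8 and 4 workers, both give
        # 100000 * 8 / 4 = 200000 examples per worker (illustrative numbers).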

        init_lr = 2e-5

        num_train_steps = int(train_size / FLAGS.batch_size * epoch)
        num_warmup_steps = int(num_train_steps * 0.1)

        num_storage_steps = int(train_size / FLAGS.batch_size)

        print(" model type {}".format(FLAGS.model_type))

        print(num_train_steps, num_warmup_steps, "=============")

        opt_config = Bunch({
            "init_lr": init_lr / hvd.size(),
            "num_train_steps": num_train_steps,
            "num_warmup_steps": num_warmup_steps
        })
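        # Note: this script divides the base learning rate by hvd.size(),
        # whereas the more common Horovod recipe scales the learning rate up
        # with the number of workers; which is appropriate depends on how the
        # optimizer aggregates the allreduced gradients.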

        sess = tf.Session(config=sess_config)

        model_io_config = Bunch({"fix_lm": False})

        model_io_fn = model_io.ModelIO(model_io_config)

        optimizer_fn = optimizer.Optimizer(opt_config)

        num_classes = FLAGS.num_classes

        model_train_fn = bert_classifier.classifier_model_fn_builder(
            config,
            num_classes,
            init_checkpoint,
            reuse=None,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            optimizer_fn=optimizer_fn,
            model_io_config=model_io_config,
            opt_config=opt_config)

        model_eval_fn = bert_classifier.classifier_model_fn_builder(
            config,
            num_classes,
            init_checkpoint,
            reuse=True,
            load_pretrained=True,
            model_io_fn=model_io_fn,
            optimizer_fn=optimizer_fn,
            model_io_config=model_io_config,
            opt_config=opt_config)

        def metric_fn(features, logits, loss):
            print(logits.get_shape(), "===logits shape===")
            pred_label = tf.argmax(logits, axis=-1, output_type=tf.int32)
            prob = tf.nn.softmax(logits)
            correct = tf.equal(
                tf.cast(pred_label, tf.int32),
                tf.cast(features["label_ids"], tf.int32))
            accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
            return {
                "accuracy": accuracy,
                "loss": loss,
                "pred_label": pred_label,
                "label_ids": features["label_ids"]
            }

        name_to_features = {
            "input_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "input_mask": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "segment_ids": tf.FixedLenFeature([FLAGS.max_length], tf.int64),
            "label_ids": tf.FixedLenFeature([], tf.int64),
        }

        def _decode_record(record, name_to_features):
            """Decodes a record to a TensorFlow example.
			"""
            example = tf.parse_single_example(record, name_to_features)

            # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
            # So cast all int64 to int32.
            for name in list(example.keys()):
                t = example[name]
                if t.dtype == tf.int64:
                    t = tf.to_int32(t)
                example[name] = t
            return example

        params = Bunch({})
        params.epoch = FLAGS.epoch
        params.batch_size = FLAGS.batch_size

        train_features = tf_data_utils.train_input_fn(FLAGS.train_file,
                                                      _decode_record,
                                                      name_to_features,
                                                      params,
                                                      if_shard=FLAGS.if_shard)
        eval_features = tf_data_utils.eval_input_fn(FLAGS.dev_file,
                                                    _decode_record,
                                                    name_to_features,
                                                    params,
                                                    if_shard=FLAGS.if_shard)

        [train_op, train_loss, train_per_example_loss,
         train_logits] = model_train_fn(train_features, [],
                                        tf.estimator.ModeKeys.TRAIN)
        [_, eval_loss, eval_per_example_loss,
         eval_logits] = model_eval_fn(eval_features, [],
                                      tf.estimator.ModeKeys.EVAL)
        result = metric_fn(eval_features, eval_logits, eval_loss)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)

        sess.run(hvd.broadcast_global_variables(0))

        model_io_fn.set_saver()

        print("===horovod rank==={}".format(hvd.rank()))

        def eval_fn(result):
            i = 0
            total_accuracy = 0
            label, label_id = [], []
            while True:
                try:
                    eval_result = sess.run(result)
                    total_accuracy += eval_result["accuracy"]
                    label_id.extend(eval_result["label_ids"])
                    label.extend(eval_result["pred_label"])
                    i += 1
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break
            macro_f1 = f1_score(label_id, label, average="macro")
            micro_f1 = f1_score(label_id, label, average="micro")
            macro_precision = precision_score(label_id, label, average="macro")
            micro_precision = precision_score(label_id, label, average="micro")
            macro_recall = recall_score(label_id, label, average="macro")
            micro_recall = recall_score(label_id, label, average="micro")
            accuracy = accuracy_score(label_id, label)
            print("test accuracy {} macro_f1 score {} micro_f1 {} accuracy {}".
                  format(total_accuracy / i, macro_f1, micro_f1, accuracy))
            return total_accuracy / i, label_id, label

        def train_fn(op, loss):
            i = 0
            total_loss = 0
            cnt = 0
            while True:
                try:
                    [_, train_loss] = sess.run([op, loss])
                    i += 1
                    cnt += 1
                    total_loss += train_loss
                    # print("==device id {} global step {}".format(hvd.rank(), step))
                    if np.mod(i, num_storage_steps) == 0:
                        print(total_loss / cnt)
                        if hvd.rank() == 0:
                            model_io_fn.save_model(
                                sess,
                                FLAGS.model_output + "/oqmrc_{}.ckpt".format(
                                    int(i / num_storage_steps)))
                        cnt = 0
                        total_loss = 0
                except tf.errors.OutOfRangeError:
                    print("End of dataset")
                    break

        import time
        start = time.time()
        train_fn(train_op, train_loss)
        acc, true_label, pred_label = eval_fn(result)
        end = time.time()
        print("==total time {} numbers of devices {}".format(
            end - start, hvd.size()))
        if hvd.rank() == 0:
            model_io_fn.save_model(sess, FLAGS.model_output + "/oqmrc.ckpt")
            import _pickle as pkl
            pkl.dump({
                "true_label": true_label,
                "pred_label": pred_label
            }, open(FLAGS.model_output + "/eval_result.json", "wb"))
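The train_input_fn above takes an if_shard flag whose implementation is not shown; when sharding is enabled, a per-worker TFRecord split with tf.data typically looks something like the sketch below (TF 1.x style; function and argument names are ours, not the repository's tf_data_utils API):

import tensorflow as tf
import horovod.tensorflow as hvd

def sharded_input_fn(file_pattern, parse_fn, batch_size):
    # each rank keeps a disjoint 1/size slice of the records
    dataset = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    dataset = dataset.shard(hvd.size(), hvd.rank())
    dataset = dataset.shuffle(10000).repeat()
    dataset = dataset.map(parse_fn).batch(batch_size)
    return dataset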
Beispiel #28
0
        else:
            pred = OfflinePredictor(PredictConfig(
                model=MODEL,
                session_init=get_model_loader(args.load),
                input_names=MODEL.get_inference_tensor_names()[0],
                output_names=MODEL.get_inference_tensor_names()[1]))
            if args.evaluate:
                assert args.evaluate.endswith('.json'), args.evaluate
                offline_evaluate(pred, args.evaluate)
            elif args.predict:
                COCODetection(cfg.DATA.BASEDIR, 'val2014')   # Only to load the class names into caches
                predict(pred, args.predict)
    else:
        is_horovod = cfg.TRAINER == 'horovod'
        if is_horovod:
            hvd.init()
            logger.info("Horovod Rank={}, Size={}".format(hvd.rank(), hvd.size()))

        if not is_horovod or hvd.rank() == 0:
            logger.set_logger_dir(args.logdir, 'd')

        finalize_configs(is_training=True)
        stepnum = cfg.TRAIN.STEPS_PER_EPOCH

        # warmup is step based, lr is epoch based
        init_lr = cfg.TRAIN.BASE_LR * 0.33 * min(8. / cfg.TRAIN.NUM_GPUS, 1.)
        warmup_schedule = [(0, init_lr), (cfg.TRAIN.WARMUP, cfg.TRAIN.BASE_LR)]
        warmup_end_epoch = cfg.TRAIN.WARMUP * 1. / stepnum
        lr_schedule = [(int(warmup_end_epoch + 0.5), cfg.TRAIN.BASE_LR)]

        factor = 8. / cfg.TRAIN.NUM_GPUS
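For a concrete feel of the warmup arithmetic above, take BASE_LR = 0.02, NUM_GPUS = 8, WARMUP = 1000 steps and STEPS_PER_EPOCH = 500 (illustrative values only; the real ones come from cfg.TRAIN):

# Worked example of the schedule computation with assumed values.
BASE_LR, NUM_GPUS, WARMUP, STEPS_PER_EPOCH = 0.02, 8, 1000, 500

init_lr = BASE_LR * 0.33 * min(8. / NUM_GPUS, 1.)       # 0.02 * 0.33 * 1.0 = 0.0066
warmup_schedule = [(0, init_lr), (WARMUP, BASE_LR)]     # linear ramp over 1000 steps
warmup_end_epoch = WARMUP * 1. / STEPS_PER_EPOCH        # 1000 / 500 = 2.0 epochs
lr_schedule = [(int(warmup_end_epoch + 0.5), BASE_LR)]  # full base LR from epoch 2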
Beispiel #29
0
def main(_):

	hvd.init()

	sess_config = tf.ConfigProto()
	sess_config.gpu_options.visible_device_list = str(hvd.local_rank())

	graph = tf.Graph()
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
	with graph.as_default():
		import json
		
		# config = json.load(open("/data/xuht/bert/chinese_L-12_H-768_A-12/bert_config.json", "r"))
		
		config = json.load(open(FLAGS.config_file, "r"))

		init_checkpoint = FLAGS.init_checkpoint
		print("===init checkoutpoint==={}".format(init_checkpoint))

		config = Bunch(config)
		config.use_one_hot_embeddings = True
		config.scope = "bert"
		config.dropout_prob = 0.1
		config.label_type = "single_label"
		config.lm_ratio = 0.1
		config.task_ratio = 1.0

		json.dump(config, open(FLAGS.model_output+"/config.json", "w"))

		init_lr = 2e-5

		if FLAGS.if_shard == "0":
			train_size = FLAGS.train_size
			epoch = int(FLAGS.epoch / hvd.size())
		elif FLAGS.if_shard == "1":
			train_size = int(FLAGS.train_size/hvd.size())
			epoch = FLAGS.epoch

		sess = tf.Session(config=sess_config)

		num_train_steps = int(
			train_size / FLAGS.batch_size * epoch)
		num_warmup_steps = int(num_train_steps * 0.1)

		num_storage_steps = int(train_size / FLAGS.batch_size)

		print(num_train_steps, num_warmup_steps, "=============")
		
		opt_config = Bunch({"init_lr":init_lr/(hvd.size()), 
							"num_train_steps":num_train_steps,
							"num_warmup_steps":num_warmup_steps})

		model_io_config = Bunch({"fix_lm":False})
		
		model_io_fn = model_io.ModelIO(model_io_config)

		optimizer_fn = optimizer.Optimizer(opt_config)
		
		num_choice = FLAGS.num_classes
		max_seq_length = FLAGS.max_length
		max_predictions_per_seq = FLAGS.max_predictions_per_seq

		model_train_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=None, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)


		model_eval_fn = classifier_fn.classifier_model_fn_builder(config, 
												num_choice, init_checkpoint, 
												reuse=True, 
												load_pretrained=True,
												model_io_fn=model_io_fn,
												optimizer_fn=optimizer_fn,
												model_io_config=model_io_config, 
												opt_config=opt_config)
		
		name_to_features = {
				"input_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"input_mask":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"segment_ids":
					tf.FixedLenFeature([max_seq_length], tf.int64),
				"masked_lm_positions":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_ids":
					tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
				"masked_lm_weights":
					tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
				"label_ids":
					tf.FixedLenFeature([], tf.int64),
				}

		def _decode_record(record, name_to_features):
			"""Decodes a record to a TensorFlow example.
			"""
			example = tf.parse_single_example(record, name_to_features)

			# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
			# So cast all int64 to int32.
			for name in list(example.keys()):
				t = example[name]
				if t.dtype == tf.int64:
					t = tf.to_int32(t)
				example[name] = t
			return example 

		params = Bunch({})
		params.epoch = epoch
		params.batch_size = FLAGS.batch_size

		def parse_folder(path):
			files = os.listdir(path)
			output = []
			for file_name in files:
				output.append(os.path.join(path, file_name))
			random.shuffle(output)
			return output

		train_features = tf_data_utils.train_input_fn(
									parse_folder(FLAGS.train_file),
									_decode_record, name_to_features, params)
		train_dict = model_train_fn(train_features, [], tf.estimator.ModeKeys.TRAIN)

		eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
		eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)

		model_io_fn.set_saver()
		
		init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
		sess.run(init_op)

		sess.run(hvd.broadcast_global_variables(0))
		
		def eval_fn(op_dict):
			i = 0
			eval_total_dict = {}
			while True:
				try:
					eval_result = sess.run(op_dict)
					for key in eval_result:
						if key in ["probabilities", "label_ids"]:
							if key in eval_total_dict:
								eval_total_dict[key].extend(eval_result[key])
							else:
								eval_total_dict[key] = []
								eval_total_dict[key].extend(eval_result[key])
					i += 1
				except tf.errors.OutOfRangeError:
					print("End of dataset")
					break

			for key in eval_result:
				if key not in ["probabilities", "label_ids"]:
					eval_total_dict[key] = eval_result[key]

			label_id = eval_total_dict["label_ids"]
			label = np.argmax(np.array(eval_total_dict["probabilities"]), axis=-1)

			macro_f1 = f1_score(label_id, label, average="macro")
			micro_f1 = f1_score(label_id, label, average="micro")
			accuracy = accuracy_score(label_id, label)
			print("test accuracy {} macro_f1 score {} micro_f1 {} masked_lm_accuracy {} sentence_f {}".format(accuracy, 
																		macro_f1,  micro_f1, 
																		eval_total_dict["masked_lm_accuracy"],
																		eval_total_dict["sentence_f"]))
			
			return eval_total_dict

		def run_eval(steps):
			import _pickle as pkl
			eval_features = tf_data_utils.eval_input_fn(
										parse_folder(FLAGS.dev_file),
										_decode_record, name_to_features, params)
			eval_dict = model_eval_fn(eval_features, [], tf.estimator.ModeKeys.EVAL)
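			# re-running the local variable initializer presumably resets
			# tf.metrics accumulators (e.g. masked_lm_accuracy) before a
			# fresh evaluation pass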
			sess.run(tf.local_variables_initializer())
			eval_finial_dict = eval_fn(eval_dict)
			if hvd.rank() == 0:
				pkl.dump(eval_finial_dict, open(FLAGS.model_output+"/eval_dict_{}.pkl".format(steps), "wb"))
			return eval_finial_dict
		
		def train_fn(op_dict):
			i = 0
			cnt = 0
			loss_dict = {}
			monitoring_train = []
			monitoring_eval = []
			while True:
				try:
					train_result = sess.run(op_dict)
					for key in train_result:
						if key == "train_op":
							continue
						else:
							if np.isnan(train_result[key]):
								print(key, train_result[key], "got nan loss")
								break
							else:
								if key in loss_dict:
									loss_dict[key] += train_result[key]
								else:
									loss_dict[key] = train_result[key]
					i += 1
					cnt += 1
					if np.mod(i, num_storage_steps) == 0:
						string = ""
						for key in loss_dict:
							tmp = key + " " + str(loss_dict[key]/cnt) + "\t"
							string += tmp
						print(string)
						monitoring_train.append(loss_dict)

						eval_finial_dict = run_eval(int(i/num_storage_steps))
						monitoring_eval.append(eval_finial_dict)

						for key in loss_dict:
							loss_dict[key] = 0.0
						if hvd.rank() == 0:
							model_io_fn.save_model(sess, FLAGS.model_output+"/model_{}.ckpt".format(int(i/num_storage_steps)))
							print("==successful storing model=={}".format(int(i/num_storage_steps)))
						cnt = 0
				except tf.errors.OutOfRangeError:
					if hvd.rank() == 0:
						import _pickle as pkl
						pkl.dump({"train":monitoring_train,
							"eval":monitoring_eval}, open(FLAGS.model_output+"/monitoring.pkl", "wb"))
					break
		print("===========begin to train============")        
		train_fn(train_dict)
		if hvd.rank() == 0:
			import _pickle as pkl
			model_io_fn.save_model(sess, FLAGS.model_output+"/model.ckpt")
			print("===========begin to eval============")
			eval_finial_dict = run_eval("final")

if __name__ == "__main__":
	tf.app.run()
Beispiel #30
0
def main(_):
    hvd.init()
    FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else os.path.join(
        FLAGS.output_dir, str(hvd.rank()))
    FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
    FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()

    tf.logging.set_verbosity(tf.logging.INFO)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(bert_config_file.name)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25,
        session_config=config)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)

        hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
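The example above keeps the base learning rate unchanged and instead divides num_train_steps and num_warmup_steps by hvd.size(); a minimal sketch of that arithmetic with assumed values:

# Illustrative arithmetic for the step scaling used above (values assumed).
num_train_steps = 100000
num_warmup_steps = 10000
world_size = 8  # hvd.size()

# Each synchronous step now consumes world_size mini-batches, so running
# 1/world_size of the original steps keeps the total number of examples
# seen roughly constant.
per_worker_train_steps = num_train_steps // world_size    # 12500
per_worker_warmup_steps = num_warmup_steps // world_size  # 1250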
Beispiel #31
0
def main() -> None:

    # Horovod Init
    hvd.init()
    size = hvd.size()

    # Config GPUs
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    # get optimizer & loss function
    loss_function = tf.keras.losses.SparseCategoricalCrossentropy()
    opt = tf.keras.optimizers.Adam(lr=Config.LEARNING_RATE * size)

    # Data
    imagenet = ImageNet(take=20)
    train_ds, val_ds = imagenet.train_ds, imagenet.val_ds
    n_train_batches = train_ds.cardinality().numpy()
    n_val_batches = val_ds.cardinality().numpy()

    # Callbacks
    callbacks = []
    callbacks.append(hvdK.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvdK.callbacks.MetricAverageCallback())
    callbacks.append(
        hvdK.callbacks.LearningRateWarmupCallback(
            warmup_epochs=5, initial_lr=Config.LEARNING_RATE))
    callbacks.append(
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))

    if hvd.rank() == 0:

        ckpt_dir = Config.SAVED_WEIGHTS_DIR + "/" + Config.RUN_NAME
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)

        ckpt = tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_dir+ \
                                                    "/epoch-{epoch:02d}-loss={val_loss:.2f}.h5",
                                                    monitor='val_loss', save_best_only=True, mode='min')

        log_dir = Config.LOG_DIR + "/" + Config.RUN_NAME
        tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

        callbacks.append(ckpt)
        callbacks.append(tensorboard)
        callbacks.append(tfa.callbacks.TQDMProgressBar())

    # Model
    model = ResNet50()
    model.loss_function = loss_function
    model.train_step = types.MethodType(distributed_train_step, model)
    model.compile(optimizer=opt, loss=loss_function)

    # Train
    model.fit(train_ds,
              steps_per_epoch=n_train_batches // size,
              validation_data=val_ds,
              validation_steps=n_val_batches // size,
              epochs=Config.EPOCHS,
              verbose=0,
              callbacks=callbacks)
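The example above patches model.train_step with a distributed_train_step that is not shown in this listing. Below is a minimal sketch of what such an override might look like with hvd.DistributedGradientTape; this is a guess at the missing function, not the author's code:

import tensorflow as tf
import horovod.tensorflow as hvd

def distributed_train_step(self, data):
    # Custom Keras train_step: wrap the tape so gradients are allreduced
    # across workers before being applied (sketch only).
    images, labels = data
    with tf.GradientTape() as tape:
        predictions = self(images, training=True)
        loss_value = self.loss_function(labels, predictions)
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss_value, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
    return {"loss": loss_value}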