Example #1
print 'Done.\n'

try:
    # This metadata is attached to all instances to allow cleanup to find
    # stale instances made by this utility
    instance_metadata = {
        'pulp_instance': 'True',
        'build_time': str(time.time()),
    }
    print 'Deploying instances...'
    os1.build_instances(config, instance_metadata)

    # Save the configuration for cleanup immediately since the configuration can fail
    if args.deployed_config is None:
        args.deployed_config = args.config[0] + '.json'
    config_utils.save_config(config, args.deployed_config)

    print 'Applying role-specific configurations...'
    setup_utils.configure_instances(config)

    # Print out machine information and configuration
    print '\nThe following instances have been built:'
    for instance in config_utils.config_generator(config):
        print """
            Instance name: %(instance_name)s
            Role: %(role)s
            SSH: %(host_string)s
        """ % instance
    print 'The configuration file has been written to ' + args.deployed_config
except (Exception, KeyboardInterrupt) as e:
    # Print exception message and quit
    print 'Error: %s' % str(e)
    sys.exit(1)
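Example #1 above hands a plain configuration dict and an explicit .json path to config_utils.save_config; the later examples instead pass TensorFlow FLAGS or a ParamsDict plus a model directory. The helper itself is not shown on this page, so, purely as an illustration of what the Example #1 call shape appears to assume, a minimal JSON-dump sketch might look like the following (hypothetical code, not any project's actual implementation):

import json


def save_config(config, path):
    """Illustrative sketch only: persist a configuration dict as JSON."""
    # The real config_utils.save_config in each project may serialize FLAGS
    # or ParamsDict objects differently; this only covers the dict-plus-path
    # call used in Example #1.
    with open(path, 'w') as f:
        json.dump(config, f, indent=4, sort_keys=True)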
Example #2
def main(_):
    log_dir = FLAGS.model_dir
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    path_prefix = log_dir + "/"
    log_file_path = path_prefix + "log.txt"
    print('Log file path: {}'.format(log_file_path))
    log_file = open(log_file_path, 'wt')
    log_file.write("{}\n".format(FLAGS))
    log_file.flush()

    # save configuration
    config_utils.save_config(FLAGS, path_prefix + "config.json")

    print('Loading train set.')
    if FLAGS.data_split == 1:
        train_set, train_question_len = read_data_split_1(
            FLAGS.s1_train_path, isLower=FLAGS.isLower)
    else:
        train_set, train_question_len = read_data_split_2(
            FLAGS.s2_train_path, isLower=FLAGS.isLower)
    print('Number of training samples: {}'.format(len(train_set)))

    print('Loading test set.')
    if FLAGS.data_split == 1:
        dev_set, dev_question_len = read_data_split_1(FLAGS.s1_dev_path,
                                                      isLower=FLAGS.isLower)
    else:
        dev_set, dev_question_len = read_data_split_2(FLAGS.s2_dev_path,
                                                      isLower=FLAGS.isLower)
    print('Number of test samples: {}'.format(len(dev_set)))

    max_actual_len = max(train_question_len, dev_question_len)
    print('Max question length: {}, truncated to {}'.format(
        max_actual_len, FLAGS.max_question_len))

    word_vocab = None
    POS_vocab = None
    NER_vocab = None
    has_pretrained_model = False
    best_path = path_prefix + "best.model"
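    # A previous run leaves a TensorFlow checkpoint at best.model; if its
    # index file exists, reload the dumped vocabularies instead of rebuilding
    # them from the training data.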
    if os.path.exists(best_path + ".index"):
        has_pretrained_model = True
        print('There is an existing pretrained model. Loading vocabs:')
        if FLAGS.with_word:
            word_vocab = Vocab(embedding_path=FLAGS.word_vec_path)
            print('word_vocab: {}'.format(word_vocab.word_vecs.shape))
        if FLAGS.with_POS:
            POS_vocab = Vocab(
                embedding_path=os.path.join(path_prefix, "POS_vocab"))
            print('POS_vocab: {}'.format(POS_vocab.word_vecs.shape))
        if FLAGS.with_NER:
            NER_vocab = Vocab(
                embedding_path=os.path.join(path_prefix, "NER_vocab"))
            print('NER_vocab: {}'.format(NER_vocab.word_vecs.shape))

    else:
        print('Collecting vocabs.')
        (allWords, allPOSs, allNERs) = collect_vocabs(train_set)
        print('Number of words: {}'.format(len(allWords)))
        print('Number of allPOSs: {}'.format(len(allPOSs)))
        print('Number of allNERs: {}'.format(len(allNERs)))

        if FLAGS.with_word:
            word_vocab = Vocab(embedding_path=FLAGS.word_vec_path)
        if FLAGS.with_POS:
            POS_vocab = Vocab(vocab=allPOSs, dim=FLAGS.POS_dim)
            POS_vocab.dump_to_txt(os.path.join(path_prefix, "POS_vocab"))
        if FLAGS.with_NER:
            NER_vocab = Vocab(vocab=allNERs, dim=FLAGS.NER_dim)
            NER_vocab.dump_to_txt(os.path.join(path_prefix, "NER_vocab"))

    print('word vocab size {}'.format(word_vocab.vocab_size))
    sys.stdout.flush()

    print('Build data loaders ... ')
    train_data_loader = QGDataLoader(train_set,
                                     word_vocab,
                                     POS_vocab,
                                     NER_vocab,
                                     flags=FLAGS,
                                     isShuffle=True,
                                     isLoop=True,
                                     isSort=True)

    dev_data_loader = QGDataLoader(dev_set,
                                   word_vocab,
                                   POS_vocab,
                                   NER_vocab,
                                   flags=FLAGS,
                                   isShuffle=False,
                                   isLoop=False,
                                   isSort=True)
    print('Number of instances in train data loader: {}'.format(
        train_data_loader.get_num_instance()))
    print('Number of instances in dev data loader: {}'.format(
        dev_data_loader.get_num_instance()))
    sys.stdout.flush()

    # initialize the best bleu and accu scores for current training session
    best_accu = FLAGS.best_accu if 'best_accu' in FLAGS.__dict__ else 0.0
    best_bleu = FLAGS.best_bleu if 'best_bleu' in FLAGS.__dict__ else 0.0
    if best_accu > 0.0:
        print('With initial dev accuracy {}'.format(best_accu))
    if best_bleu > 0.0:
        print('With initial dev BLEU score {}'.format(best_bleu))

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-0.01, 0.01)
        with tf.name_scope("Train"):
            with tf.variable_scope("Model",
                                   reuse=None,
                                   initializer=initializer):
                train_graph = ModelGraph(word_vocab=word_vocab,
                                         POS_vocab=POS_vocab,
                                         NER_vocab=NER_vocab,
                                         flags=FLAGS,
                                         mode=FLAGS.mode)

        assert FLAGS.mode in ('ce_train', 'rl_train', 'rl_ce_train')
        valid_mode = 'evaluate' if FLAGS.mode == 'ce_train' else 'evaluate_bleu'
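        # ce_train is validated with accuracy ('evaluate'); the RL modes are
        # validated with BLEU ('evaluate_bleu').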

        with tf.name_scope("Valid"):
            with tf.variable_scope("Model",
                                   reuse=True,
                                   initializer=initializer):
                valid_graph = ModelGraph(word_vocab=word_vocab,
                                         POS_vocab=POS_vocab,
                                         NER_vocab=NER_vocab,
                                         flags=FLAGS,
                                         mode=valid_mode)

        initializer = tf.global_variables_initializer()

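        # Checkpoint only the model's own variables, skipping the pretrained
        # word embedding matrix and anything outside the "Model" scope.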
        _vars = {}
        for var in tf.all_variables():
            if "word_embedding" in var.name: continue
            if not var.name.startswith("Model"): continue
            _vars[var.name.split(":")[0]] = var
        saver = tf.train.Saver(_vars)

        config = tf.ConfigProto(inter_op_parallelism_threads=1,
                                intra_op_parallelism_threads=1)
        sess = tf.Session(config=config)
        sess.run(initializer)

        if has_pretrained_model:
            print("Restoring model from " + best_path)
            saver.restore(sess, best_path)
            print("DONE!")

            if FLAGS.mode in ('rl_train',
                              'rl_ce_train') and abs(best_bleu) < 0.00001:
                print("Getting BLEU score for the model")
                best_bleu = evaluate(sess,
                                     valid_graph,
                                     dev_data_loader,
                                     flags=FLAGS)['dev_bleu']
                FLAGS.best_bleu = best_bleu
                config_utils.save_config(FLAGS, path_prefix + "config.json")
                print('BLEU = %.4f' % best_bleu)
                log_file.write('BLEU = %.4f\n' % best_bleu)
            if FLAGS.mode == 'ce_train' and abs(best_accu) < 0.00001:
                print("Getting ACCU score for the model")
                best_accu = evaluate(sess,
                                     valid_graph,
                                     dev_data_loader,
                                     flags=FLAGS)['dev_accu']
                FLAGS.best_accu = best_accu
                config_utils.save_config(FLAGS, path_prefix + "config.json")
                print('ACCU = %.4f' % best_accu)
                log_file.write('ACCU = %.4f\n' % best_accu)

        print('Start the training loop.')
        train_size = train_data_loader.get_num_batch()
        max_steps = train_size * FLAGS.n_epochs
        total_loss = 0.0
        start_time = time.time()
        for step in xrange(max_steps):
            cur_batch = train_data_loader.nextBatch()
            if FLAGS.mode == 'rl_train':
                loss_value = train_graph.rl_train(sess,
                                                  cur_batch,
                                                  with_ce=False)
            elif FLAGS.mode == 'rl_ce_train':
                loss_value = train_graph.rl_train(sess,
                                                  cur_batch,
                                                  with_ce=True)
            elif FLAGS.mode == 'ce_train':
                loss_value = train_graph.ce_train(sess, cur_batch)
            total_loss += loss_value

            if step % 100 == 0:
                print('{} '.format(step), end="")
                sys.stdout.flush()

            # Save a checkpoint and evaluate the model periodically.
            if (step + 1) % train_data_loader.get_num_batch() == 0 or (
                    step + 1) == max_steps:
                duration = time.time() - start_time
                print('Step %d: loss = %.2f (%.3f sec)' %
                      (step, total_loss, duration))
                log_file.write('Step %d: loss = %.2f (%.3f sec)\n' %
                               (step, total_loss, duration))
                log_file.flush()
                sys.stdout.flush()
                total_loss = 0.0

                # Evaluate against the validation set.
                start_time = time.time()
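                # Evaluate with the EMA-averaged weights swapped in; the raw
                # training weights are restored afterwards via
                # restore_backup_vars_op.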
                sess.run(train_graph.ema_to_vars_op)
                res_dict = evaluate(sess,
                                    valid_graph,
                                    dev_data_loader,
                                    flags=FLAGS,
                                    suffix=str(step))
                if valid_graph.mode == 'evaluate':
                    dev_loss = res_dict['dev_loss']
                    dev_accu = res_dict['dev_accu']
                    dev_right = int(res_dict['dev_right'])
                    dev_total = int(res_dict['dev_total'])
                    print('Dev loss = %.4f' % dev_loss)
                    log_file.write('Dev loss = %.4f\n' % dev_loss)
                    print('Dev accu = %.4f %d/%d' %
                          (dev_accu, dev_right, dev_total))
                    log_file.write('Dev accu = %.4f %d/%d\n' %
                                   (dev_accu, dev_right, dev_total))
                    log_file.flush()
                    if best_accu < dev_accu:
                        print('Saving weights, ACCU {} (prev_best) < {} (cur)'.
                              format(best_accu, dev_accu))
                        saver.save(sess, best_path)
                        best_accu = dev_accu
                        FLAGS.best_accu = dev_accu
                        config_utils.save_config(FLAGS,
                                                 path_prefix + "config.json")
                else:
                    dev_bleu = res_dict['dev_bleu']
                    print('Dev bleu = %.4f' % dev_bleu)
                    log_file.write('Dev bleu = %.4f\n' % dev_bleu)
                    log_file.flush()
                    if best_bleu < dev_bleu:
                        print('Saving weights, BLEU {} (prev_best) < {} (cur)'.
                              format(best_bleu, dev_bleu))
                        saver.save(sess, best_path)
                        best_bleu = dev_bleu
                        FLAGS.best_bleu = dev_bleu
                        config_utils.save_config(FLAGS,
                                                 path_prefix + "config.json")
                sess.run(train_graph.restore_backup_vars_op)
                duration = time.time() - start_time
                print('Duration %.3f sec' % (duration))
                sys.stdout.flush()

                log_file.write('Duration %.3f sec\n' % (duration))
                log_file.flush()

    log_file.close()
Example #3
def main(argv):
  del argv  # Unused.

  params = factory.config_generator(FLAGS.model)

  if FLAGS.config_file:
    params = params_dict.override_params_dict(
        params, FLAGS.config_file, is_strict=True)

  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  if not FLAGS.use_tpu:
    params.override({
        'architecture': {
            'use_bfloat16': False,
        },
        'batch_norm_activation': {
            'use_sync_bn': False,
        },
    }, is_strict=True)
  params.override({
      'platform': {
          'eval_master': FLAGS.eval_master,
          'tpu': FLAGS.tpu,
          'tpu_zone': FLAGS.tpu_zone,
          'gcp_project': FLAGS.gcp_project,
      },
      'tpu_job_name': FLAGS.tpu_job_name,
      'use_tpu': FLAGS.use_tpu,
      'model_dir': FLAGS.model_dir,
      'train': {
          'num_shards': FLAGS.num_cores,
      },
  }, is_strict=False)
  # Only run spatial partitioning in training mode.
  if FLAGS.mode != 'train':
    params.train.input_partition_dims = None
    params.train.num_cores_per_replica = None

  params.validate()
  params.lock()
  pp = pprint.PrettyPrinter()
  params_str = pp.pformat(params.as_dict())
  logging.info('Model Parameters: %s', params_str)

  # Builds detection model on TPUs.
  model_fn = model_builder.ModelFn(params)
  executor = tpu_executor.TpuExecutor(model_fn, params)

  # Prepares input functions for train and eval.
  train_input_fn = input_reader.InputFn(
      params.train.train_file_pattern, params, mode=ModeKeys.TRAIN,
      dataset_type=params.train.train_dataset_type)
  if params.eval.type == 'customized':
    eval_input_fn = input_reader.InputFn(
        params.eval.eval_file_pattern, params, mode=ModeKeys.EVAL,
        dataset_type=params.eval.eval_dataset_type)
  else:
    eval_input_fn = input_reader.InputFn(
        params.eval.eval_file_pattern, params, mode=ModeKeys.PREDICT_WITH_GT,
        dataset_type=params.eval.eval_dataset_type)

  # Runs the model.
  if FLAGS.mode == 'train':
    config_utils.save_config(params, params.model_dir)
    executor.train(train_input_fn, params.train.total_steps)
    if FLAGS.eval_after_training:
      executor.evaluate(
          eval_input_fn,
          params.eval.eval_samples // params.eval.eval_batch_size)

  elif FLAGS.mode == 'eval':
    def terminate_eval():
      logging.info('Terminating eval after %d seconds of no checkpoints',
                   params.eval.eval_timeout)
      return True
    # Runs evaluation when there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        params.model_dir,
        min_interval_secs=params.eval.min_eval_interval,
        timeout=params.eval.eval_timeout,
        timeout_fn=terminate_eval):
      # Terminates eval job when final checkpoint is reached.
      current_step = int(os.path.basename(ckpt).split('-')[1])

      logging.info('Starting to evaluate.')
      try:
        executor.evaluate(
            eval_input_fn,
            params.eval.eval_samples // params.eval.eval_batch_size, ckpt)

        if current_step >= params.train.total_steps:
          logging.info('Evaluation finished after training step %d',
                       current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                     ckpt)

  elif FLAGS.mode == 'train_and_eval':
    config_utils.save_config(params, params.model_dir)
    num_cycles = int(params.train.total_steps / params.eval.num_steps_per_eval)
    for cycle in range(num_cycles):
      logging.info('Start training cycle %d.', cycle)
      current_cycle_last_train_step = ((cycle + 1)
                                       * params.eval.num_steps_per_eval)
      executor.train(train_input_fn, current_cycle_last_train_step)
      executor.evaluate(
          eval_input_fn,
          params.eval.eval_samples // params.eval.eval_batch_size)

  elif FLAGS.mode == 'predict':
    file_pattern = FLAGS.predict_file_pattern
    if not file_pattern:
        raise ValueError('"predict_file_pattern" parameter is required.')

    output_dir = FLAGS.predict_output_dir
    if not output_dir:
        raise ValueError('"predict_output_dir" parameter is required.')

    test_input_fn = input_reader.InputFn(
        file_pattern, params, mode=ModeKeys.PREDICT_WITH_GT,
        dataset_type=params.eval.eval_dataset_type)

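    # Resolve the requested checkpoint, falling back to the 'best_checkpoints'
    # subdirectory of the model directory if it is not found at the top level.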
    checkpoint_prefix = 'model.ckpt-' + FLAGS.predict_checkpoint_step
    checkpoint_path = os.path.join(FLAGS.model_dir, checkpoint_prefix)
    if not tf.train.checkpoint_exists(checkpoint_path):
      checkpoint_path = os.path.join(FLAGS.model_dir, 'best_checkpoints',
                                     checkpoint_prefix)
      if not tf.train.checkpoint_exists(checkpoint_path):
        raise ValueError('Checkpoint not found: %s/%s' %
                         (FLAGS.model_dir, checkpoint_prefix))

    executor.predict(test_input_fn, checkpoint_path, output_dir=output_dir)

  else:
    logging.info('Mode not found.')
Example #4
def main(argv):
    del argv  # Unused.

    params = factory.config_generator(FLAGS.model)

    if FLAGS.config_file:
        params = params_dict.override_params_dict(params,
                                                  FLAGS.config_file,
                                                  is_strict=True)

    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)
    params.override({
        'use_tpu': FLAGS.use_tpu,
        'model_dir': FLAGS.model_dir,
    },
                    is_strict=True)
    if not FLAGS.use_tpu:
        params.override(
            {
                'architecture': {
                    'use_bfloat16': False,
                },
                'batch_norm_activation': {
                    'use_sync_bn': False,
                },
            },
            is_strict=True)
    # Only run spatial partitioning in training mode.
    if FLAGS.mode != 'train':
        params.train.input_partition_dims = None
        params.train.num_cores_per_replica = None
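    # Snapshot the params before the platform/TPU-specific overrides below, so
    # the config written to model_dir does not carry those overrides.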
    params_to_save = params_dict.ParamsDict(params)
    params.override(
        {
            'platform': {
                'eval_master': FLAGS.eval_master,
                'tpu': FLAGS.tpu,
                'tpu_zone': FLAGS.tpu_zone,
                'gcp_project': FLAGS.gcp_project,
            },
            'tpu_job_name': FLAGS.tpu_job_name,
            'train': {
                'num_shards': FLAGS.num_cores,
            },
        },
        is_strict=False)

    params.validate()
    params.lock()
    pp = pprint.PrettyPrinter()
    params_str = pp.pformat(params.as_dict())
    logging.info('Model Parameters: %s', params_str)

    # Builds detection model on TPUs.
    model_fn = model_builder.ModelFn(params)
    executor = tpu_executor.TpuExecutor(model_fn, params)

    # Prepares input functions for train and eval.
    train_input_fn = input_reader.InputFn(
        params.train.train_file_pattern,
        params,
        mode=ModeKeys.TRAIN,
        dataset_type=params.train.train_dataset_type)
    if params.eval.type == 'customized':
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern,
            params,
            mode=ModeKeys.EVAL,
            dataset_type=params.eval.eval_dataset_type)
    else:
        eval_input_fn = input_reader.InputFn(
            params.eval.eval_file_pattern,
            params,
            mode=ModeKeys.PREDICT_WITH_GT,
            dataset_type=params.eval.eval_dataset_type)

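    # Derive the number of evaluation steps from eval_samples; left as None
    # when eval_samples is unset.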
    if params.eval.eval_samples:
        eval_times = params.eval.eval_samples // params.eval.eval_batch_size
    else:
        eval_times = None

    # Runs the model.
    if FLAGS.mode == 'train':
        config_utils.save_config(params_to_save, params.model_dir)
        executor.train(train_input_fn, params.train.total_steps)
        if FLAGS.eval_after_training:
            executor.evaluate(eval_input_fn, eval_times)

    elif FLAGS.mode == 'eval':

        def terminate_eval():
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         params.eval.eval_timeout)
            return True

        # Runs evaluation when there's a new checkpoint.
        for ckpt in tf.train.checkpoints_iterator(
                params.model_dir,
                min_interval_secs=params.eval.min_eval_interval,
                timeout=params.eval.eval_timeout,
                timeout_fn=terminate_eval):
            # Terminates eval job when final checkpoint is reached.
            current_step = int(
                six.ensure_str(os.path.basename(ckpt)).split('-')[1])

            logging.info('Starting to evaluate.')
            try:
                executor.evaluate(eval_input_fn, eval_times, ckpt)

                if current_step >= params.train.total_steps:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break
            except tf.errors.NotFoundError as e:
                logging.info(
                    'Error occurred during evaluation: NotFoundError: %s', e)

    elif FLAGS.mode == 'train_and_eval':
        config_utils.save_config(params_to_save, params.model_dir)
        num_cycles = int(params.train.total_steps /
                         params.eval.num_steps_per_eval)
        for cycle in range(num_cycles):
            logging.info('Start training cycle %d.', cycle)
            current_cycle_last_train_step = ((cycle + 1) *
                                             params.eval.num_steps_per_eval)
            executor.train(train_input_fn, current_cycle_last_train_step)
            executor.evaluate(eval_input_fn, eval_times)
    else:
        logging.info('Mode not found.')