Example #1
def train(sess, loss, x, y, X_train, Y_train, save=False,
          init_all=True, evaluate=None, feed=None, args=None,
          rng=None, var_list=None, fprop_args=None, optimizer=None):
  """
  Train a TF graph.
  This function is not yet deprecated, but is likely to become deprecated
  soon. Prefer cleverhans.train.train when possible.
  cleverhans.train.train supports multiple GPUs but this function is still
  needed to support legacy models that do not support calling fprop more
  than once.

  :param sess: TF session to use when training the graph
  :param loss: loss object exposing fprop(x, y); the model training loss.
  :param x: input placeholder
  :param y: output placeholder (for labels)
  :param X_train: numpy array with training inputs
  :param Y_train: numpy array with training outputs
  :param save: boolean controlling the save operation
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training epoch
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
               If save is True, should also contain 'train_dir'
               and 'filename'
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :return: True if model trained
  """
  args = _ArgsWrapper(args or {})
  fprop_args = fprop_args or {}

  # Check that necessary arguments were given (see doc above)
  assert args.nb_epochs, "Number of epochs was not given in args dict"
  if optimizer is None:
    assert args.learning_rate is not None, ("Learning rate was not given "
                                            "in args dict")
  assert args.batch_size, "Batch size was not given in args dict"

  if save:
    assert args.train_dir, "Directory for save was not given in args dict"
    assert args.filename, "Filename for save was not given in args dict"

  if rng is None:
    rng = np.random.RandomState()

  # Define optimizer
  loss_value = loss.fprop(x, y, **fprop_args)
  if optimizer is None:
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  else:
    if not isinstance(optimizer, tf.train.Optimizer):
      raise ValueError("optimizer object must be from a child class of "
                       "tf.train.Optimizer")
  # Trigger update operations within the default graph (such as batch_norm).
  with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    train_step = optimizer.minimize(loss_value, var_list=var_list)

  with sess.as_default():
    if hasattr(tf, "global_variables_initializer"):
      if init_all:
        tf.global_variables_initializer().run()
      else:
        initialize_uninitialized_global_variables(sess)
    else:
      warnings.warn("Update your copy of tensorflow; future versions of "
                    "CleverHans may drop support for this version.")
      sess.run(tf.initialize_all_variables())

    for epoch in xrange(args.nb_epochs):
      # Compute number of batches
      nb_batches = int(math.ceil(float(len(X_train)) / args.batch_size))
      assert nb_batches * args.batch_size >= len(X_train)

      # Indices to shuffle training set
      index_shuf = list(range(len(X_train)))
      rng.shuffle(index_shuf)

      prev = time.time()
      for batch in range(nb_batches):

        # Compute batch start and end indices
        start, end = batch_indices(
            batch, len(X_train), args.batch_size)

        # Perform one training step
        feed_dict = {x: X_train[index_shuf[start:end]],
                     y: Y_train[index_shuf[start:end]]}
        if feed is not None:
          feed_dict.update(feed)
        train_step.run(feed_dict=feed_dict)
      assert end >= len(X_train)  # Check that all examples were used
      cur = time.time()
      _logger.info("Epoch " + str(epoch) + " took " +
                   str(cur - prev) + " seconds")
      if evaluate is not None:
        evaluate()

    if save:
      save_path = os.path.join(args.train_dir, args.filename)
      saver = tf.train.Saver()
      saver.save(sess, save_path)
      _logger.info("Completed model training and saved at: " +
                   str(save_path))
    else:
      _logger.info("Completed model training.")

  return True
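
A minimal usage sketch for this function follows (assuming a CleverHans `Model` subclass instance named `model`; the MNIST-shaped placeholders, the `CrossEntropy` loss wrapper, and the hyperparameter values are illustrative, not required by the function):

import numpy as np
import tensorflow as tf
from cleverhans.loss import CrossEntropy

# Illustrative data; in practice X_train / Y_train come from a real dataset.
X_train = np.random.rand(256, 28, 28, 1).astype('float32')
Y_train = np.eye(10)[np.random.randint(0, 10, 256)].astype('float32')

sess = tf.Session()
x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # input placeholder
y = tf.placeholder(tf.float32, shape=(None, 10))         # label placeholder

# `model` is assumed to be a cleverhans.model.Model subclass instance
# (e.g. a small CNN); it is not defined in this snippet.
loss = CrossEntropy(model)  # exposes fprop(x, y), as train() expects

train_params = {'nb_epochs': 2, 'learning_rate': 0.001, 'batch_size': 128}
train(sess, loss, x, y, X_train, Y_train, args=train_params,
      rng=np.random.RandomState([2017, 8, 30]))
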
Example #2
def model_eval(sess, x, y, predictions, X_test=None, Y_test=None,
               feed=None, args=None):
  """
  Compute the accuracy of a TF model on some data
  :param sess: TF session to use
  :param x: input placeholder
  :param y: output placeholder (for labels)
  :param predictions: model output predictions
  :param X_test: numpy array with test inputs
  :param Y_test: numpy array with test outputs
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `batch_size`
  :return: a float with the accuracy value
  """
  args = _ArgsWrapper(args or {})

  assert args.batch_size, "Batch size was not given in args dict"
  if X_test is None or Y_test is None:
    raise ValueError("X_test argument and Y_test argument "
                     "must be supplied.")

  # Define accuracy symbolically
  if LooseVersion(tf.__version__) >= LooseVersion('1.0.0'):
    correct_preds = tf.equal(tf.argmax(y, axis=-1),
                             tf.argmax(predictions, axis=-1))
  else:
    correct_preds = tf.equal(tf.argmax(y, axis=tf.rank(y) - 1),
                             tf.argmax(predictions,
                                       axis=tf.rank(predictions) - 1))

  # Init result var
  accuracy = 0.0

  with sess.as_default():
    # Compute number of batches
    nb_batches = int(math.ceil(float(len(X_test)) / args.batch_size))
    assert nb_batches * args.batch_size >= len(X_test)

    X_cur = np.zeros((args.batch_size,) + X_test.shape[1:],
                     dtype=X_test.dtype)
    Y_cur = np.zeros((args.batch_size,) + Y_test.shape[1:],
                     dtype=Y_test.dtype)
    for batch in range(nb_batches):
      if batch % 100 == 0 and batch > 0:
        _logger.debug("Batch " + str(batch))

      # Must not use the `batch_indices` function here, because it
      # repeats some examples.
      # It's acceptable to repeat during training, but not eval.
      start = batch * args.batch_size
      end = min(len(X_test), start + args.batch_size)

      # The last batch may be smaller than all others. This should not
      # affect the accuracy disproportionately.
      cur_batch_size = end - start
      X_cur[:cur_batch_size] = X_test[start:end]
      Y_cur[:cur_batch_size] = Y_test[start:end]
      feed_dict = {x: X_cur, y: Y_cur}
      if feed is not None:
        feed_dict.update(feed)
      cur_corr_preds = correct_preds.eval(feed_dict=feed_dict)

      accuracy += cur_corr_preds[:cur_batch_size].sum()

    assert end >= len(X_test)

    # Divide by number of examples to get final value
    accuracy /= len(X_test)

  return accuracy
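
A minimal usage sketch (assuming `sess`, the placeholders `x` and `y`, a model output tensor `preds`, and numpy test arrays `X_test` / `Y_test` are already defined, e.g. as in the training example above; the names and batch size are illustrative):

eval_params = {'batch_size': 128}
accuracy = model_eval(sess, x, y, preds, X_test=X_test, Y_test=Y_test,
                      args=eval_params)
print('Test accuracy: %0.4f' % accuracy)
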
Example #3
def model_train(sess, x, y, predictions, X_train, Y_train, save=False,
                predictions_adv=None, init_all=True, evaluate=None,
                feed=None, args=None, rng=None, var_list=None):
  """
  Train a TF graph
  :param sess: TF session to use when training the graph
  :param x: input placeholder
  :param y: output placeholder (for labels)
  :param predictions: model output predictions
  :param X_train: numpy array with training inputs
  :param Y_train: numpy array with training outputs
  :param save: boolean controlling the save operation
  :param predictions_adv: if set with the adversarial example tensor,
                          will run adversarial training
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training epoch
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
               If save is True, should also contain 'train_dir'
               and 'filename'
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :return: True if model trained
  """
  warnings.warn('This function is deprecated.')
  args = _ArgsWrapper(args or {})

  # Check that necessary arguments were given (see doc above)
  assert args.nb_epochs, "Number of epochs was not given in args dict"
  assert args.learning_rate, "Learning rate was not given in args dict"
  assert args.batch_size, "Batch size was not given in args dict"

  if save:
    assert args.train_dir, "Directory for save was not given in args dict"
    assert args.filename, "Filename for save was not given in args dict"

  if rng is None:
    rng = np.random.RandomState()

  # Define loss
  loss = model_loss(y, predictions)
  if predictions_adv is not None:
    loss = (loss + model_loss(y, predictions_adv)) / 2

  train_step = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  train_step = train_step.minimize(loss, var_list=var_list)

  with sess.as_default():
    if hasattr(tf, "global_variables_initializer"):
      if init_all:
        tf.global_variables_initializer().run()
      else:
        initialize_uninitialized_global_variables(sess)
    else:
      warnings.warn("Update your copy of tensorflow; future versions of "
                    "CleverHans may drop support for this version.")
      sess.run(tf.initialize_all_variables())

    for epoch in xrange(args.nb_epochs):
      # Compute number of batches
      nb_batches = int(math.ceil(float(len(X_train)) / args.batch_size))
      assert nb_batches * args.batch_size >= len(X_train)

      # Indices to shuffle training set
      index_shuf = list(range(len(X_train)))
      rng.shuffle(index_shuf)

      prev = time.time()
      for batch in range(nb_batches):

        # Compute batch start and end indices
        start, end = batch_indices(
            batch, len(X_train), args.batch_size)

        # Perform one training step
        feed_dict = {x: X_train[index_shuf[start:end]],
                     y: Y_train[index_shuf[start:end]]}
        if feed is not None:
          feed_dict.update(feed)
        train_step.run(feed_dict=feed_dict)
      assert end >= len(X_train)  # Check that all examples were used
      cur = time.time()
      _logger.info("Epoch " + str(epoch) + " took " +
                   str(cur - prev) + " seconds")
      if evaluate is not None:
        evaluate()

    if save:
      save_path = os.path.join(args.train_dir, args.filename)
      saver = tf.train.Saver()
      saver.save(sess, save_path)
      _logger.info("Completed model training and saved at: " +
                   str(save_path))
    else:
      _logger.info("Completed model training.")

  return True
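
A minimal usage sketch of this deprecated interface with adversarial training enabled (assuming `sess`, the placeholders `x` / `y`, a model output tensor `preds`, an adversarial prediction tensor `preds_adv` built from an attack such as FGSM, and the numpy arrays from the earlier examples are already defined; the names and hyperparameters are illustrative):

def do_eval():
  acc = model_eval(sess, x, y, preds, X_test=X_test, Y_test=Y_test,
                   args={'batch_size': 128})
  print('Test accuracy: %0.4f' % acc)

train_params = {'nb_epochs': 6, 'learning_rate': 0.001, 'batch_size': 128}
model_train(sess, x, y, preds, X_train, Y_train,
            predictions_adv=preds_adv,  # averages clean and adversarial loss
            evaluate=do_eval, args=train_params,
            rng=np.random.RandomState([2017, 8, 30]))
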
Example #4
def train(sess,
          loss,
          x_train,
          y_train,
          init_all=True,
          evaluate=None,
          feed=None,
          args=None,
          rng=None,
          var_list=None,
          fprop_args=None,
          optimizer=None,
          devices=None,
          x_batch_preprocessor=None,
          use_ema=False,
          ema_decay=.998,
          run_canary=True,
          loss_threshold=1e5):
    """
  Run (optionally multi-replica, synchronous) training to minimize `loss`
  :param sess: TF session to use when training the graph
  :param loss: loss object exposing fprop(x, y); the loss to minimize
  :param x_train: numpy array with training inputs
  :param y_train: numpy array with training outputs
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training epoch
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :param devices: list of device names to use for training
      If None, defaults to: all GPUs, if GPUs are available
                            all devices, if no GPUs are available
  :param x_batch_preprocessor: callable
      Takes a single tensor containing an x_train batch as input
      Returns a single tensor containing an x_train batch as output
      Called to preprocess the data before passing the data to the Loss
  :param use_ema: bool
      If true, uses an exponential moving average of the model parameters
  :param ema_decay: float or callable
      The decay parameter for EMA, if EMA is used
      If a callable rather than a float, this is a callable that takes
      the epoch and batch as arguments and returns the ema_decay for
      the current batch.
  :param run_canary: bool
      If True and using 3 or more GPUs, runs some canary code that should
      fail if there is a multi-GPU driver problem.
      Turn this off if your gradients are inherently stochastic (e.g.
      if you use dropout). The canary code checks that all GPUs give
      approximately the same gradient.
  :param loss_threshold: float
      Raise an exception if the loss exceeds this value.
      This is intended to rapidly detect numerical problems.
      Sometimes the loss may legitimately be higher than this value; in
      such cases, raise the threshold. If needed it can be np.inf.
  :return: True if model trained
  """
    args = _ArgsWrapper(args or {})
    fprop_args = fprop_args or {}

    # Check that necessary arguments were given (see doc above)
    assert args.nb_epochs, "Number of epochs was not given in args dict"
    if optimizer is None:
        if args.learning_rate is None:
            raise ValueError("Learning rate was not given in args dict")
    assert args.batch_size, "Batch size was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    if optimizer is None:
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    else:
        if not isinstance(optimizer, tf.train.Optimizer):
            raise ValueError("optimizer object must be from a child class of "
                             "tf.train.Optimizer")

    grads = []
    xs = []
    preprocessed_xs = []
    ys = []

    devices = infer_devices(devices)
    for idx, device in enumerate(devices):
        with tf.device(device):
            x = tf.placeholder(x_train.dtype, (None, ) + x_train.shape[1:])
            y = tf.placeholder(y_train.dtype, (None, ) + y_train.shape[1:])
            xs.append(x)
            ys.append(y)

            if x_batch_preprocessor is not None:
                x = x_batch_preprocessor(x)

            # We need to keep track of these so that the canary can feed
            # preprocessed values. If the canary had to feed raw values,
            # stochastic preprocessing could make the canary fail.
            preprocessed_xs.append(x)

            loss_value = loss.fprop(x, y, **fprop_args)

            grads.append(
                optimizer.compute_gradients(loss_value, var_list=var_list))
    num_devices = len(devices)
    print("num_devices: ", num_devices)

    grad = avg_grads(grads)
    # Trigger update operations within the default graph (such as batch_norm).
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_step = optimizer.apply_gradients(grad)

    epoch_tf = tf.placeholder(tf.int32, [])
    batch_tf = tf.placeholder(tf.int32, [])

    if use_ema:
        if callable(ema_decay):
            ema_decay = ema_decay(epoch_tf, batch_tf)
        ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
        with tf.control_dependencies([train_step]):
            train_step = ema.apply(var_list)
        # Get pointers to the EMA's running average variables
        avg_params = [ema.average(param) for param in var_list]
        # Make temporary buffers used for swapping the live and running average
        # parameters
        tmp_params = [
            tf.Variable(param, trainable=False) for param in var_list
        ]
        # Define the swapping operation
        param_to_tmp = [
            tf.assign(tmp, param)
            for tmp, param in safe_zip(tmp_params, var_list)
        ]
        with tf.control_dependencies(param_to_tmp):
            avg_to_param = [
                tf.assign(param, avg)
                for param, avg in safe_zip(var_list, avg_params)
            ]
        with tf.control_dependencies(avg_to_param):
            tmp_to_avg = [
                tf.assign(avg, tmp)
                for avg, tmp in safe_zip(avg_params, tmp_params)
            ]
        swap = tmp_to_avg

    batch_size = args.batch_size

    assert batch_size % num_devices == 0
    device_batch_size = batch_size // num_devices

    if init_all:
        sess.run(tf.global_variables_initializer())
    else:
        initialize_uninitialized_global_variables(sess)

    # Check whether the hardware is working correctly

    # So far the failure has only been observed with 3 or more GPUs
    run_canary = run_canary and num_devices > 2
    if run_canary:
        canary_feed_dict = {}
        for x, y in safe_zip(preprocessed_xs, ys):
            canary_feed_dict[x] = x_train[:device_batch_size].copy()
            canary_feed_dict[y] = y_train[:device_batch_size].copy()
        # To reduce the runtime and memory cost of this canary,
        # we test the gradient of only one parameter.
        # For now this is just set to the first parameter in the list,
        # because it is an index that is always guaranteed to work.
        # If we think that this is causing false negatives and we should
        # test other parameters, we could test a random parameter from
        # the list or we could rewrite the canary to examine more than
        # one parameter.
        param_to_test = 0
        grad_vars = []
        for i in xrange(num_devices):
            dev_grads = grads[i]
            grad_vars.append(dev_grads[param_to_test][0])
        grad_values = sess.run(grad_vars, feed_dict=canary_feed_dict)
        failed = False
        for i in xrange(1, num_devices):
            if grad_values[0].shape != grad_values[i].shape:
                print("shape 0 does not match shape %d:" % i,
                      grad_values[0].shape, grad_values[i].shape)
                failed = True
                continue
            if not np.allclose(grad_values[0], grad_values[i], atol=1e-6):
                print("grad_values[0]: ", grad_values[0].mean(),
                      grad_values[0].max())
                print("grad_values[%d]: " % i, grad_values[i].mean(),
                      grad_values[i].max())
                print("max diff: ",
                      np.abs(grad_values[0] - grad_values[i]).max())
                failed = True
        if failed:
            print("Canary failed.")
            quit()

    for epoch in xrange(args.nb_epochs):
        # Indices to shuffle training set
        index_shuf = list(range(len(x_train)))
        # Randomly repeat a few training examples each epoch to avoid
        # having a too-small batch
        while len(index_shuf) % batch_size != 0:
            index_shuf.append(rng.randint(len(x_train)))
        nb_batches = len(index_shuf) // batch_size
        rng.shuffle(index_shuf)
        # Shuffling here versus inside the loop doesn't seem to affect
        # timing very much, but shuffling here makes the code slightly
        # easier to read
        x_train_shuffled = x_train[index_shuf]
        y_train_shuffled = y_train[index_shuf]

        prev = time.time()
        for batch in range(nb_batches):

            # Compute batch start and end indices
            start = batch * batch_size
            end = (batch + 1) * batch_size

            # Perform one training step
            feed_dict = {epoch_tf: epoch, batch_tf: batch}
            diff = end - start
            assert diff == batch_size
            for dev_idx in xrange(num_devices):
                cur_start = start + dev_idx * device_batch_size
                cur_end = start + (dev_idx + 1) * device_batch_size
                feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
                feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]
            if cur_end != end:
                msg = ("batch_size (%d) must be a multiple of num_devices "
                       "(%d).\nCUDA_VISIBLE_DEVICES: %s"
                       "\ndevices: %s")
                args = (batch_size, num_devices,
                        os.environ['CUDA_VISIBLE_DEVICES'], str(devices))
                raise ValueError(msg % args)
            if feed is not None:
                feed_dict.update(feed)

            _, loss_numpy = sess.run([train_step, loss_value],
                                     feed_dict=feed_dict)

            if np.abs(loss_numpy) > loss_threshold:
                raise ValueError("Extreme loss during training: ", loss_numpy)
            if np.isnan(loss_numpy) or np.isinf(loss_numpy):
                raise ValueError("NaN/Inf loss during training")
        assert end == len(index_shuf)  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) +
                     " seconds")
        if evaluate is not None:
            if use_ema:
                # Before running evaluation, load the running average
                # parameters into the live slot, so we can see how well
                # the EMA parameters are performing
                sess.run(swap)
            evaluate()
            if use_ema:
                # Swap the parameters back, so that we continue training
                # on the live parameters
                sess.run(swap)
    if use_ema:
        # When training is done, swap the running average parameters into
        # the live slot, so that we use them when we deploy the model
        sess.run(swap)

    return True
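
A minimal usage sketch for this multi-replica trainer (assuming a CleverHans `Model` subclass instance `model`, numpy arrays `x_train` / `y_train`, an open session `sess`, and an evaluation callback `do_eval` are already defined; the device list, EMA settings, and hyperparameters are illustrative):

from cleverhans.loss import CrossEntropy

loss = CrossEntropy(model, smoothing=0.1)
train_params = {'nb_epochs': 6, 'learning_rate': 0.001, 'batch_size': 128}
train(sess, loss, x_train, y_train,
      devices=['/gpu:0', '/gpu:1'],   # batch_size must be divisible by len(devices)
      use_ema=True, ema_decay=0.998,  # ema_decay may also be a callable of (epoch, batch)
      evaluate=do_eval,               # called once per epoch, with EMA weights swapped in
      args=train_params)
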