Example #1
    def request_examples(self, attack_config, criteria, run_counts,
                         batch_size):
        wrong_confidence = criteria['wrong_confidence']
        below_t = wrong_confidence <= self.t
        assert below_t.dtype == np.bool
        total = below_t.size
        total_below = below_t.sum()
        all_idxs = np.arange(total)
        run_counts = run_counts[attack_config]
        if total_below > 0:
            correct_idxs = all_idxs[below_t]
            assert correct_idxs.size == total_below
            run_counts = run_counts[below_t]
            pairs = safe_zip(correct_idxs, run_counts)
        else:
            pairs = safe_zip(all_idxs, run_counts)

        def key(pair):
            return pair[1]

        pairs.sort(key=key)
        idxs = [pair[0] for pair in pairs]
        while len(idxs) < batch_size:
            needed = batch_size - len(idxs)
            idxs = idxs + idxs[:needed]
        if len(idxs) > batch_size:
            idxs = idxs[:batch_size]
        idxs = np.array(idxs)
        return idxs
Example #2
    def request_examples(self, attack_config, criteria, run_counts,
                         batch_size):
        correctness = criteria['correctness']
        assert correctness.dtype == np.bool
        total = correctness.size
        total_correct = correctness.sum()
        all_idxs = np.arange(total)
        run_counts = run_counts[attack_config]
        if total_correct > 0:
            correct_idxs = all_idxs[correctness]
            assert correct_idxs.size == total_correct
            run_counts = run_counts[correctness]
            pairs = safe_zip(correct_idxs, run_counts)
        else:
            pairs = safe_zip(all_idxs, run_counts)
        # In PY3, pairs is now an iterator.
        # To support sorting, we need to make it a list.
        pairs = list(pairs)

        def key(pair):
            return pair[1]

        pairs.sort(key=key)
        idxs = [pair[0] for pair in pairs]
        while len(idxs) < batch_size:
            needed = batch_size - len(idxs)
            idxs = idxs + idxs[:needed]
        if len(idxs) > batch_size:
            idxs = idxs[:batch_size]
        idxs = np.array(idxs)
        return idxs
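The two variants above differ only in which criterion keeps an example in the candidate pool (`wrong_confidence <= t` versus `correctness`); the rest of the logic is shared. The sketch below restates that shared logic with plain NumPy so it can be run on toy data; `pick_indices` and the inline use of `zip` in place of `safe_zip` are simplifications for illustration, not cleverhans code.

# Minimal standalone sketch of the selection logic shared by the two
# request_examples variants above. Toy data only.
import numpy as np

def pick_indices(still_interesting, run_counts, batch_size):
  """Prefer examples that still need work and have the fewest runs,
  then repeat indices as needed to fill the batch."""
  all_idxs = np.arange(still_interesting.size)
  if still_interesting.any():
    idx_pool = all_idxs[still_interesting]
    count_pool = run_counts[still_interesting]
  else:
    idx_pool, count_pool = all_idxs, run_counts
  pairs = sorted(zip(idx_pool, count_pool), key=lambda pair: pair[1])
  idxs = [pair[0] for pair in pairs]
  while len(idxs) < batch_size:
    idxs = idxs + idxs[:batch_size - len(idxs)]
  return np.array(idxs[:batch_size])

# Example: 5 examples, 2 still below the threshold, uneven run counts.
interesting = np.array([True, False, True, False, False])
counts = np.array([3, 0, 1, 2, 5])
print(pick_indices(interesting, counts, batch_size=4))  # [2 0 2 0]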
Example #3
    def __setstate__(self, d):
        tf_variables = d[VARS]
        del d[VARS]
        tf_variable_names = None
        # older joblib files may not have "_tf_variable_names"
        if VAR_NAMES in d:
            tf_variable_names = d[VAR_NAMES]
            del d[VAR_NAMES]
        else:
            warnings.warn(
                "This joblib file has no " + VAR_NAMES + " field. "
                "The field may become required on or after 2019-04-11."
                "You can make your file compatible with the new format by"
                " loading the file and re-saving it.")
        # Deserialize everything except the Variables
        self.__dict__ = d
        # Deserialize the Variables
        sess = tf.get_default_session()
        if sess is None:
            raise RuntimeError("NoRefModel requires a default "
                               "TensorFlow session")
        cur_vars = self.get_vars()
        if len(cur_vars) != len(tf_variables):
            print("Model format mismatch")
            print("Current model has " + str(len(cur_vars)) + " variables")
            print("Saved model has " + str(len(tf_variables)) + " variables")
            print("Names of current vars:")
            for var in cur_vars:
                print("\t" + var.name)
            if tf_variable_names is not None:
                print("Names of saved vars:")
                for name in tf_variable_names:
                    print("\t" + name)
            else:
                print("Saved vars use old format, no names available for them")
            assert False

        found = [False] * len(cur_vars)
        if tf_variable_names is not None:
            # New version using the names to handle changes in ordering
            for value, name in safe_zip(tf_variables, tf_variable_names):
                value_found = False
                for idx, cur_var in enumerate(cur_vars):
                    if cur_var.name == name:
                        assert not found[idx]
                        value_found = True
                        found[idx] = True
                        cur_var.load(value, sess)
                        break
                assert value_found
            assert all(found)
        else:
            # Old version that works if and only if the order doesn't change
            for var, value in safe_zip(cur_vars, tf_variables):
                var.load(value, sess)
Example #4
  def fprop(self, x, y, **kwargs):
    kwargs.update(self.kwargs)
    if self.attack is not None:
      attack_params = copy.copy(self.attack_params)
      if attack_params is None:
        attack_params = {}
      if self.pass_y:
        attack_params['y'] = y
      x = x, self.attack.generate(x, **attack_params)
      coeffs = [1. - self.adv_coeff, self.adv_coeff]
      if self.adv_coeff == 1.:
        x = (x[1],)
        coeffs = (coeffs[1],)
    else:
      x = tuple([x])
      coeffs = [1.]
    assert np.allclose(sum(coeffs), 1.)

    # Catching RuntimeError: Variable -= value not supported by tf.eager.
    try:
      y -= self.smoothing * (y - 1. / tf.cast(y.shape[-1], y.dtype))
    except RuntimeError:
      y.assign_sub(self.smoothing * (y - 1. / tf.cast(y.shape[-1],
                                                      y.dtype)))

    logits = [self.model.get_logits(x, **kwargs) for x in x]
    loss = sum(
        coeff * tf.reduce_mean(softmax_cross_entropy_with_logits(labels=y,
                                                                 logits=logit))
        for coeff, logit in safe_zip(coeffs, logits))
    return loss
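The in-place update of `y` above is ordinary label smoothing: each one-hot label is pulled toward the uniform distribution before the cross-entropy is computed. A NumPy restatement of just that step, with illustrative values:

# NumPy sketch of the label-smoothing step used in the fprop above:
# y <- y - smoothing * (y - 1/nb_classes)
import numpy as np

smoothing = 0.1
y = np.array([[0., 0., 1., 0.]])          # one-hot label, 4 classes
nb_classes = y.shape[-1]
y_smooth = y - smoothing * (y - 1. / nb_classes)
print(y_smooth)        # [[0.025 0.025 0.925 0.025]]
print(y_smooth.sum())  # still sums to 1.0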
Example #5
  def fprop(self, x, y, **kwargs):
    weights, loss_objects = safe_zip(*self.terms)
    for weight in weights:
      if isinstance(weight, float):
        continue
      if hasattr(weight, 'ndim'):
        assert weight.ndim == 0
        continue
      raise TypeError("weight of %s is not a type that this function "
                      "knows it can accept yet" % str(weight))
    losses = [loss.fprop(x, y, **kwargs) for loss in loss_objects]
    for loss, loss_object in safe_zip(losses, loss_objects):
      if len(loss.get_shape()) > 0:
        raise ValueError("%s.fprop returned a non-scalar value" %
                         str(loss_object))
    terms = [weight * loss for weight, loss in safe_zip(weights, losses)]

    return tf.add_n(terms)
Example #6
 def __setstate__(self, d):
     tf_variables = d["_tf_variables"]
     del d["_tf_variables"]
     # Deserialize everything except the Variables
     self.__dict__ = d
     # Deserialize the Variables
     sess = tf.get_default_session()
     if sess is None:
         raise RuntimeError("NoRefModel requires a default "
                            "TensorFlow session")
     for var, value in safe_zip(self.get_params(), tf_variables):
         var.load(value, sess)
Example #7
    def fprop(self, x, x_t, **kwargs):
        #kwargs.update(self.kwargs)
        if self.attack is not None:
            attack_params = copy.copy(self.attack_params)
            if attack_params is None:
                attack_params = {}

            x = x, self.attack.generate(x, x_t, **attack_params)
            # Reconstruction target is the clean input for both the clean
            # and the adversarial branch.
            x_orig = x[0], x[0]
            print("shape of x in loss_fprop: ", np.shape(x))

            coeffs = [1. - self.adv_coeff, self.adv_coeff]
            if self.adv_coeff == 1.:
                x = (x[1], )
                x_orig = (x_orig[1], )
                coeffs = (coeffs[1], )
        else:
            x = tuple([x])
            x_orig = tuple([x])
            coeffs = [1.]
        assert np.allclose(sum(coeffs), 1.)

        #recon = [self.model.get_layer(x, 'RECON') for x in x]
        recon_layer_name = self.model.get_layer_names()[-1]
        recon = [self.model.get_layer(x, recon_layer_name) for x in x]

        #print("layer names: ",self.model.get_layer_names())

        #uncomment for CIFAR10
        #img_rows = img_cols = 32
        #nchannels = 3

        #x_ph=tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
        #                                   nchannels))
        #recon = [self.model.get_layer(x,"activation_7") for x in x]

        loss = sum(coeff * tf.reduce_sum(tf.squared_difference(x_orig, recon))
                   for coeff, x_orig, recon in safe_zip(coeffs, x_orig, recon))

        return (loss / (tf.to_float(tf.shape(x_orig[0])[0])))
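The loss above is a coefficient-weighted sum of squared reconstruction errors, normalized by the batch size. The NumPy sketch below restates that arithmetic on toy arrays; `recon_loss` and the fabricated inputs are illustrative only.

# NumPy sketch of the reconstruction loss computed above: a coefficient-
# weighted sum of squared differences, divided by the batch size.
import numpy as np

def recon_loss(x_orig_list, recon_list, coeffs):
  batch_size = x_orig_list[0].shape[0]
  total = sum(c * np.sum((xo - r) ** 2)
              for c, xo, r in zip(coeffs, x_orig_list, recon_list))
  return total / float(batch_size)

x_clean = np.ones((2, 3))                 # toy "images", batch of 2
recon_clean = 0.9 * np.ones((2, 3))
recon_adv = 0.8 * np.ones((2, 3))
print(recon_loss([x_clean, x_clean], [recon_clean, recon_adv],
                 coeffs=[0.5, 0.5]))      # about 0.075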
Example #8
  def get_criteria(self, sess, model, advx, y):
    """
    Returns a dictionary mapping the name of each criterion to a NumPy
    array containing the value of that criterion for each adversarial
    example.
    Subclasses can add extra criteria by implementing the `extra_criteria`
    method.

    :param sess: tf.Session
    :param model: cleverhans.model.Model
    :param advx: numpy array containing the adversarial examples made so far
      by earlier work in the bundling process
    :param y: numpy array containing true labels
    """

    names, factory = self.extra_criteria()
    factory = _CriteriaFactory(model, factory)
    results = batch_eval_multi_worker(sess, factory, [advx, y],
                                      batch_size=BATCH_SIZE, devices=devices)
    names = ['correctness', 'confidence'] + names
    out = dict(safe_zip(names, results))
    return out
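The dictionary returned above is what the `request_examples` implementations in Examples #1 and #2 consume. The sketch below fakes such a dictionary with NumPy arrays so the shape conventions are visible without running any attacks; the values are fabricated.

# Fabricated stand-in for the criteria dictionary produced by get_criteria.
import numpy as np

n = 4  # number of adversarial examples evaluated so far
criteria = {
    # still classified correctly after the attacks run so far?
    'correctness': np.array([True, False, True, True]),
    # confidence in the predicted class
    'confidence': np.array([0.91, 0.55, 0.73, 0.99]),
    # extra_criteria() can add goal-specific entries, e.g. the
    # 'wrong_confidence' array used by Example #1
    'wrong_confidence': np.array([0.05, 0.55, 0.21, 0.01]),
}
assert all(v.shape == (n,) for v in criteria.values())
print(sorted(criteria.keys()))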
Example #9
def train(sess, loss, x_train, y_train,
          init_all=False, evaluate=None, feed=None, args=None,
          rng=None, var_list=None, fprop_args=None, optimizer=None,
          devices=None, x_batch_preprocessor=None, use_ema=False,
          ema_decay=.998, run_canary=None,
          loss_threshold=1e5, dataset_train=None, dataset_size=None):
  """
  Run (optionally multi-replica, synchronous) training to minimize `loss`
  :param sess: TF session to use when training the graph
  :param loss: tensor, the loss to minimize
  :param x_train: numpy array with training inputs or tf Dataset
  :param y_train: numpy array with training outputs or tf Dataset
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training iteration
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :param devices: list of device names to use for training
      If None, defaults to: all GPUs, if GPUs are available
                            all devices, if no GPUs are available
  :param x_batch_preprocessor: callable
      Takes a single tensor containing an x_train batch as input
      Returns a single tensor containing an x_train batch as output
      Called to preprocess the data before passing the data to the Loss
  :param use_ema: bool
      If true, uses an exponential moving average of the model parameters
  :param ema_decay: float or callable
      The decay parameter for EMA, if EMA is used
      If a callable rather than a float, this is a callable that takes
      the epoch and batch as arguments and returns the ema_decay for
      the current batch.
  :param loss_threshold: float
      Raise an exception if the loss exceeds this value.
      This is intended to rapidly detect numerical problems.
      Sometimes the loss may legitimately be higher than this value. In
      such cases, raise the value. If needed it can be np.inf.
  :param dataset_train: tf Dataset instance.
      Used as a replacement for x_train, y_train for faster performance.
  :param dataset_size: integer, the size of the dataset_train.
  :return: True if model trained
  """

  # Check whether the hardware is working correctly
  canary.run_canary()
  if run_canary is not None:
    warnings.warn("The `run_canary` argument is deprecated. The canary "
                  "is now much cheaper and thus runs all the time. The "
                  "canary now uses its own loss function so it is not "
                  "necessary to turn off the canary when training with "
                  " a stochastic loss. Simply quit passing `run_canary`."
                  "Passing `run_canary` may become an error on or after "
                  "2019-10-16.")

  args = _ArgsWrapper(args or {})
  fprop_args = fprop_args or {}

  # Check that necessary arguments were given (see doc above)
  # Be sure to support 0 epochs for debugging purposes
  if args.nb_epochs is None:
    raise ValueError("`args` must specify number of epochs")
  if optimizer is None:
    if args.learning_rate is None:
      raise ValueError("Learning rate was not given in args dict")
  assert args.batch_size, "Batch size was not given in args dict"

  if rng is None:
    rng = np.random.RandomState()

  if optimizer is None:
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  else:
    if not isinstance(optimizer, tf.train.Optimizer):
      raise ValueError("optimizer object must be from a child class of "
                       "tf.train.Optimizer")

  grads = []
  xs = []
  preprocessed_xs = []
  ys = []
  if dataset_train is not None:
    assert x_train is None and y_train is None and x_batch_preprocessor is None
    if dataset_size is None:
      raise ValueError("You must provide a dataset size")
    data_iterator = dataset_train.make_one_shot_iterator().get_next()
    x_train, y_train = sess.run(data_iterator)

  devices = infer_devices(devices)
  for device in devices:
    with tf.device(device):
      # x = tf.placeholder(x_train.dtype, (None,) + x_train.shape[1:])
      # y = tf.placeholder(y_train.dtype, (None,) + y_train.shape[1:])
      x = tf.placeholder(tf.float32, (None,) + x_train.shape[1:])
      y = tf.placeholder(tf.float32, (None,) + y_train.shape[1:])
      xs.append(x)
      ys.append(y)

      if x_batch_preprocessor is not None:
        x = x_batch_preprocessor(x)

      # We need to keep track of these so that the canary can feed
      # preprocessed values. If the canary had to feed raw values,
      # stochastic preprocessing could make the canary fail.
      preprocessed_xs.append(x)

      loss_value = loss.fprop(x, y, **fprop_args)
      print("loss_value", loss_value)
      grads.append(optimizer.compute_gradients(
          loss_value, var_list=var_list))
      print("grads:", grads)
  num_devices = len(devices)
  print("num_devices: ", num_devices)

  grad = avg_grads(grads)
  # Trigger update operations within the default graph (such as batch_norm).
  with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    train_step = optimizer.apply_gradients(grad)

  epoch_tf = tf.placeholder(tf.int32, [])
  batch_tf = tf.placeholder(tf.int32, [])

  if use_ema:
    if callable(ema_decay):
      ema_decay = ema_decay(epoch_tf, batch_tf)
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    with tf.control_dependencies([train_step]):
      train_step = ema.apply(var_list)
    # Get pointers to the EMA's running average variables
    avg_params = [ema.average(param) for param in var_list]
    # Make temporary buffers used for swapping the live and running average
    # parameters
    tmp_params = [tf.Variable(param, trainable=False)
                  for param in var_list]
    # Define the swapping operation
    param_to_tmp = [tf.assign(tmp, param)
                    for tmp, param in safe_zip(tmp_params, var_list)]
    with tf.control_dependencies(param_to_tmp):
      avg_to_param = [tf.assign(param, avg)
                      for param, avg in safe_zip(var_list, avg_params)]
    with tf.control_dependencies(avg_to_param):
      tmp_to_avg = [tf.assign(avg, tmp)
                    for avg, tmp in safe_zip(avg_params, tmp_params)]
    swap = tmp_to_avg

  batch_size = args.batch_size

  assert batch_size % num_devices == 0
  device_batch_size = batch_size // num_devices

  if init_all:
    sess.run(tf.global_variables_initializer())
  else:
    initialize_uninitialized_global_variables(sess)

  for epoch in xrange(args.nb_epochs):
    if dataset_train is not None:
      nb_batches = int(math.ceil(float(dataset_size) / batch_size))
    else:
      # Indices to shuffle training set
      index_shuf = list(range(len(x_train)))
      # Randomly repeat a few training examples each epoch to avoid
      # having a too-small batch
      while len(index_shuf) % batch_size != 0:
        index_shuf.append(rng.randint(len(x_train)))
      nb_batches = len(index_shuf) // batch_size
      rng.shuffle(index_shuf)
      # Shuffling here versus inside the loop doesn't seem to affect
      # timing very much, but shuffling here makes the code slightly
      # easier to read
      x_train_shuffled = x_train[index_shuf]
      y_train_shuffled = y_train[index_shuf]

    prev = time.time()
    for batch in range(nb_batches):
      if dataset_train is not None:
        x_train_shuffled, y_train_shuffled = sess.run(data_iterator)
        start, end = 0, batch_size
      else:
        # Compute batch start and end indices
        start = batch * batch_size
        end = (batch + 1) * batch_size
        # Perform one training step
        diff = end - start
        assert diff == batch_size

      feed_dict = {epoch_tf: epoch, batch_tf: batch}
      for dev_idx in xrange(num_devices):
        cur_start = start + dev_idx * device_batch_size
        cur_end = start + (dev_idx + 1) * device_batch_size
        feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
        feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]
      if cur_end != end and dataset_train is None:
        msg = ("batch_size (%d) must be a multiple of num_devices "
               "(%d).\nCUDA_VISIBLE_DEVICES: %s"
               "\ndevices: %s")
        args = (batch_size, num_devices,
                os.environ['CUDA_VISIBLE_DEVICES'],
                str(devices))
        raise ValueError(msg % args)
      if feed is not None:
        feed_dict.update(feed)

      _, loss_numpy = sess.run([train_step, loss_value], feed_dict=feed_dict)

      if np.abs(loss_numpy) > loss_threshold:
        raise ValueError("Extreme loss during training: ", loss_numpy)
      if np.isnan(loss_numpy) or np.isinf(loss_numpy):
        raise ValueError("NaN/Inf loss during training")
    assert (dataset_train is not None or end == len(index_shuf))  # Check that all examples were used
    cur = time.time()
    _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) + " seconds")
    print("loss:", loss_numpy)
    if evaluate is not None:
      if use_ema:
        # Before running evaluation, load the running average
        # parameters into the live slot, so we can see how well
        # the EMA parameters are performing
        sess.run(swap)
      evaluate()
      if use_ema:
        # Swap the parameters back, so that we continue training
        # on the live parameters
        sess.run(swap)
  if use_ema:
    # When training is done, swap the running average parameters into
    # the live slot, so that we use them when we deploy the model
    sess.run(swap)

  return True
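The per-epoch shuffling above pads the index list with a few random repeats so the dataset length divides evenly into batches, then shuffles once per epoch. Below is a standalone NumPy sketch of just that step; the helper name `shuffled_indices` is illustrative.

# Standalone NumPy sketch of the epoch index-shuffling logic above:
# pad the index list with random repeats until it divides evenly into
# batches, then shuffle once per epoch.
import numpy as np

def shuffled_indices(n_examples, batch_size, rng):
  index_shuf = list(range(n_examples))
  while len(index_shuf) % batch_size != 0:       # repeat a few examples
    index_shuf.append(rng.randint(n_examples))
  rng.shuffle(index_shuf)
  return np.array(index_shuf)

rng = np.random.RandomState(0)
idx = shuffled_indices(n_examples=10, batch_size=4, rng=rng)
print(len(idx) % 4 == 0, sorted(set(idx)) == list(range(10)))  # True True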
Example #10
def train(sess,
          loss,
          x_train,
          y_train,
          init_all=True,
          evaluate=None,
          feed=None,
          args=None,
          rng=None,
          var_list=None,
          fprop_args=None,
          optimizer=None,
          devices=None,
          x_batch_preprocessor=None,
          use_ema=False,
          ema_decay=.998,
          run_canary=True,
          loss_threshold=1e5):
    """
  Run (optionally multi-replica, synchronous) training to minimize `loss`
  :param sess: TF session to use when training the graph
  :param loss: tensor, the loss to minimize
  :param x_train: numpy array with training inputs
  :param y_train: numpy array with training outputs
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training iteration
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :param devices: list of device names to use for training
      If None, defaults to: all GPUs, if GPUs are available
                            all devices, if no GPUs are available
  :param x_batch_preprocessor: callable
      Takes a single tensor containing an x_train batch as input
      Returns a single tensor containing an x_train batch as output
      Called to preprocess the data before passing the data to the Loss
  :param use_ema: bool
      If true, uses an exponential moving average of the model parameters
  :param ema_decay: float or callable
      The decay parameter for EMA, if EMA is used
      If a callable rather than a float, this is a callable that takes
      the epoch and batch as arguments and returns the ema_decay for
      the current batch.
  :param run_canary: bool
      If True and using 3 or more GPUs, runs some canary code that should
      fail if there is a multi-GPU driver problem.
      Turn this off if your gradients are inherently stochastic (e.g.
      if you use dropout). The canary code checks that all GPUs give
      approximately the same gradient.
  :param loss_threshold: float
      Raise an exception if the loss exceeds this value.
      This is intended to rapidly detect numerical problems.
      Sometimes the loss may legitimately be higher than this value. In
      such cases, raise the value. If needed it can be np.inf.
  :return: True if model trained
  """
    args = _ArgsWrapper(args or {})
    fprop_args = fprop_args or {}

    # Check that necessary arguments were given (see doc above)
    assert args.nb_epochs, "Number of epochs was not given in args dict"
    if optimizer is None:
        if args.learning_rate is None:
            raise ValueError("Learning rate was not given in args dict")
    assert args.batch_size, "Batch size was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    if optimizer is None:
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    else:
        if not isinstance(optimizer, tf.train.Optimizer):
            raise ValueError("optimizer object must be from a child class of "
                             "tf.train.Optimizer")

    grads = []
    xs = []
    preprocessed_xs = []
    ys = []

    devices = infer_devices(devices)
    for idx, device in enumerate(devices):
        with tf.device(device):
            x = tf.placeholder(x_train.dtype, (None, ) + x_train.shape[1:])
            y = tf.placeholder(y_train.dtype, (None, ) + y_train.shape[1:])
            xs.append(x)
            ys.append(y)

            if x_batch_preprocessor is not None:
                x = x_batch_preprocessor(x)

            # We need to keep track of these so that the canary can feed
            # preprocessed values. If the canary had to feed raw values,
            # stochastic preprocessing could make the canary fail.
            preprocessed_xs.append(x)

            loss_value = loss.fprop(x, y, **fprop_args)

            grads.append(
                optimizer.compute_gradients(loss_value, var_list=var_list))
    num_devices = len(devices)
    print("num_devices: ", num_devices)

    grad = avg_grads(grads)
    # Trigger update operations within the default graph (such as batch_norm).
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_step = optimizer.apply_gradients(grad)

    epoch_tf = tf.placeholder(tf.int32, [])
    batch_tf = tf.placeholder(tf.int32, [])

    if use_ema:
        if callable(ema_decay):
            ema_decay = ema_decay(epoch_tf, batch_tf)
        ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
        with tf.control_dependencies([train_step]):
            train_step = ema.apply(var_list)
        # Get pointers to the EMA's running average variables
        avg_params = [ema.average(param) for param in var_list]
        # Make temporary buffers used for swapping the live and running average
        # parameters
        tmp_params = [
            tf.Variable(param, trainable=False) for param in var_list
        ]
        # Define the swapping operation
        param_to_tmp = [
            tf.assign(tmp, param)
            for tmp, param in safe_zip(tmp_params, var_list)
        ]
        with tf.control_dependencies(param_to_tmp):
            avg_to_param = [
                tf.assign(param, avg)
                for param, avg in safe_zip(var_list, avg_params)
            ]
        with tf.control_dependencies(avg_to_param):
            tmp_to_avg = [
                tf.assign(avg, tmp)
                for avg, tmp in safe_zip(avg_params, tmp_params)
            ]
        swap = tmp_to_avg

    batch_size = args.batch_size

    assert batch_size % num_devices == 0
    device_batch_size = batch_size // num_devices

    if init_all:
        sess.run(tf.global_variables_initializer())
    else:
        initialize_uninitialized_global_variables(sess)

    # Check whether the hardware is working correctly

    # So far the failure has only been observed with 3 or more GPUs
    run_canary = run_canary and num_devices > 2
    if run_canary:
        canary_feed_dict = {}
        for x, y in safe_zip(preprocessed_xs, ys):
            canary_feed_dict[x] = x_train[:device_batch_size].copy()
            canary_feed_dict[y] = y_train[:device_batch_size].copy()
        # To reduce the runtime and memory cost of this canary,
        # we test the gradient of only one parameter.
        # For now this is just set to the first parameter in the list,
        # because it is an index that is always guaranteed to work.
        # If we think that this is causing false negatives and we should
        # test other parameters, we could test a random parameter from
        # the list or we could rewrite the canary to examine more than
        # one parameter.
        param_to_test = 0
        grad_vars = []
        for i in xrange(num_devices):
            dev_grads = grads[i]
            grad_vars.append(dev_grads[param_to_test][0])
        grad_values = sess.run(grad_vars, feed_dict=canary_feed_dict)
        failed = False
        for i in xrange(1, num_devices):
            if grad_values[0].shape != grad_values[i].shape:
                print("shape 0 does not match shape %d:" % i,
                      grad_values[0].shape, grad_values[i].shape)
                failed = True
                continue
            if not np.allclose(grad_values[0], grad_values[i], atol=1e-6):
                print("grad_values[0]: ", grad_values[0].mean(),
                      grad_values[0].max())
                print("grad_values[%d]: " % i, grad_values[i].mean(),
                      grad_values[i].max())
                print("max diff: ",
                      np.abs(grad_values[0] - grad_values[1]).max())
                failed = True
        if failed:
            print("Canary failed.")
            quit()

    for epoch in xrange(args.nb_epochs):
        # Indices to shuffle training set
        index_shuf = list(range(len(x_train)))
        # Randomly repeat a few training examples each epoch to avoid
        # having a too-small batch
        while len(index_shuf) % batch_size != 0:
            index_shuf.append(rng.randint(len(x_train)))
        nb_batches = len(index_shuf) // batch_size
        rng.shuffle(index_shuf)
        # Shuffling here versus inside the loop doesn't seem to affect
        # timing very much, but shuffling here makes the code slightly
        # easier to read
        x_train_shuffled = x_train[index_shuf]
        y_train_shuffled = y_train[index_shuf]

        prev = time.time()
        for batch in range(nb_batches):

            # Compute batch start and end indices
            start = batch * batch_size
            end = (batch + 1) * batch_size

            # Perform one training step
            feed_dict = {epoch_tf: epoch, batch_tf: batch}
            diff = end - start
            assert diff == batch_size
            for dev_idx in xrange(num_devices):
                cur_start = start + dev_idx * device_batch_size
                cur_end = start + (dev_idx + 1) * device_batch_size
                feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
                feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]
            if cur_end != end:
                msg = ("batch_size (%d) must be a multiple of num_devices "
                       "(%d).\nCUDA_VISIBLE_DEVICES: %s"
                       "\ndevices: %s")
                args = (batch_size, num_devices,
                        os.environ['CUDA_VISIBLE_DEVICES'], str(devices))
                raise ValueError(msg % args)
            if feed is not None:
                feed_dict.update(feed)

            _, loss_numpy = sess.run([train_step, loss_value],
                                     feed_dict=feed_dict)

            if np.abs(loss_numpy) > loss_threshold:
                raise ValueError("Extreme loss during training: ", loss_numpy)
            if np.isnan(loss_numpy) or np.isinf(loss_numpy):
                raise ValueError("NaN/Inf loss during training")
        assert end == len(index_shuf)  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) +
                     " seconds")
        if evaluate is not None:
            if use_ema:
                # Before running evaluation, load the running average
                # parameters into the live slot, so we can see how well
                # the EMA parameters are performing
                sess.run(swap)
            evaluate()
            if use_ema:
                # Swap the parameters back, so that we continue training
                # on the live parameters
                sess.run(swap)
    if use_ema:
        # When training is done, swap the running average parameters into
        # the live slot, so that we use them when we deploy the model
        sess.run(swap)

    return True
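The EMA bookkeeping in these training loops builds a single `swap` op out of three chained assignment groups (live parameters into temporaries, averages into the live slots, temporaries into the average slots), so running the same op twice evaluates with the averaged weights and then restores the live ones. Here is a pure-Python sketch of that swap idea, with plain lists standing in for the TensorFlow variable collections.

# Pure-Python sketch of the parameter/EMA swap built above with tf.assign
# and control dependencies. Calling swap() twice restores the original state.
def swap(live_params, avg_params):
  for i in range(len(live_params)):
    tmp = live_params[i]            # param -> tmp
    live_params[i] = avg_params[i]  # avg   -> param
    avg_params[i] = tmp             # tmp   -> avg

live = [1.0, 2.0]
ema = [0.9, 1.8]
swap(live, ema)        # evaluate with the averaged parameters
print(live, ema)       # [0.9, 1.8] [1.0, 2.0]
swap(live, ema)        # swap back and continue training
print(live, ema)       # [1.0, 2.0] [0.9, 1.8]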
Example #11
def make_curve(report, success_name, fail_names):
  """
  Make a success-failure curve.
  :param report: A confidence report
    (the type of object saved by make_confidence_report.py)
  :param success_name: see plot_report_from_path
  :param fail_names: see plot_report_from_path
  :returns:
    fail_optimal: list of failure rates on adversarial data for the optimal
      (t >= .5) part of the curve. Each entry corresponds to a different
      threshold. Thresholds are chosen to make the smoothest possible curve
      from the available data, e.g. one threshold between each unique
      confidence value observed in the data. To make sure that linear
      interpolation between points in the curve never overestimates the
      failure rate for a specific success rate, the curve also includes
      extra points that increment the failure rate prior to any point
      that increments the success rate, so the curve moves up and to the
      right in a series of backwards "L" shapes rather than moving up
      and to the right along diagonal lines. For large datasets these
      maximally pessimistic points will usually not be visible and the
      curve will appear smooth.
    success_optimal: list of success rates on clean data on the optimal
      part of the curve. Matches up with `fail_optimal`.
    fail_lower_bound: list of observed failure rates on the t < .5 portion
      of the curve where MaxConfidence is not optimal.
    fail_upper_bound: list of upper bounds (assuming good enough optimization,
      so not a true upper bound) on the failure rates on the t < .5 portion
      of the curve where MaxConfidence is not optimal. Matches up with
      `fail_lower_bound`.
    success_bounded: success rates on the non-optimal part of the curve.
      Matches up with `fail_lower_bound` and `fail_upper_bound`.
  """
  success_results = report[success_name]
  fail_name = None # pacify pylint
  found = False
  for fail_name in fail_names:
    if fail_name in report:
      found = True
      break
  if not found:
    raise ValueError(fail_name + " not in report. "
                     "Available keys: " + str(report.keys()))
  fail_results = report[fail_name]

  # "good" means drawn from the distribution where we measure success rate.
  # "bad" means drawn from the distribution where we measure failure rate.
  # From here on out we use those terms, to avoid confusion between examples
  # that actually failed and examples that were drawn from the distribution
  # where we measured failure rate.

  if 'all_probs' in success_results:
    warnings.warn("The 'all_probs' key is included only to support "
                  " old files from a private development codebase. "
                  "Support for this key can be dropped at any time "
                  " without warning.")
    good_probs = success_results['all_probs']
    bad_probs = fail_results['all_probs']
    bad_corrects = fail_results['correctness_mask']
    good_corrects = success_results['correctness_mask']
  else:
    good_probs = success_results['confidence']
    bad_probs = fail_results['confidence']
    good_corrects = success_results['correctness']
    bad_corrects = fail_results['correctness']
  good_triplets = [(prob, correct, True) for prob, correct
                   in safe_zip(good_probs, good_corrects)]
  bad_triplets = [(prob, correct, False) for prob, correct
                  in safe_zip(bad_probs, bad_corrects)]
  total_good = len(good_triplets)
  total_bad = len(bad_triplets)
  if total_good != 10000:
    warnings.warn("Not using full test set? Found " + str(total_good) +
                  " examples for measuring success rate")
  if total_bad != 10000:
    warnings.warn("Not using full test set for adversarial examples?")
  all_triplets = good_triplets + bad_triplets
  all_triplets = sorted(all_triplets, key=lambda x: -x[0])

  # Start with the case for threshold t = 1.
  # Examples are covered only if prob > t (strict inequality)
  # So initially nothing is covered
  good_covered_and_correct = 0
  bad_covered_and_incorrect = 0

  # Number of examples that are bad, incorrect, and covered by
  # a t >= 0.5, or that were merely covered by a t < 0.5
  failure_opportunities = 0

  next_idx = 0

  fail_optimal = []
  success_optimal = []
  fail_upper_bound = []
  fail_lower_bound = []
  success_bounded = []

  bounded = False

  # NOTE: the loop always exits via an internal break statement.
  # Copied the termination condition to the while statement for ease
  # of reading.
  while next_idx < len(all_triplets):
    gs = float(good_covered_and_correct) / total_good
    bf = float(bad_covered_and_incorrect) / total_bad
    # Add results for current threshold to the list
    if not bounded:

      # Sometimes big jumps in the failure rate make artifacts in the
      # plot, where there's a long linear track. This implies the real
      # success-fail curve is linear when actually it just isn't sampled
      # by the data.
      # To avoid implying that the model reaches a higher success
      # rate than it actually does, we avoid these plotting artifacts
      # by introducing extra points that make the graph move horizontally
      # to the right first, then vertically.
      if len(fail_optimal) > 0:
        prev_bf = fail_optimal[-1]
        prev_gs = success_optimal[-1]

        if gs > prev_gs and bf > prev_bf:
          fail_optimal.append(bf)
          success_optimal.append(prev_gs)

      success_optimal.append(gs)
      fail_optimal.append(bf)
    else:
      success_bounded.append(gs)
      fail_lower_bound.append(bf)
      fail_upper_bound.append(float(failure_opportunities) / total_bad)

    if next_idx == len(all_triplets):
      break

    # next_prob_to_include is not quite the same thing as the threshold.
    # The threshold is infinitesimally smaller than this value.
    next_prob_to_include = all_triplets[next_idx][0]

    # Process all ties
    while next_prob_to_include == all_triplets[next_idx][0]:
      _prob, correct, is_good = all_triplets[next_idx]
      if is_good:
        good_covered_and_correct += correct
      else:
        if next_prob_to_include <= .5:
          failure_opportunities += 1
        else:
          failure_opportunities += 1 - correct
        bad_covered_and_incorrect += 1 - correct
      next_idx += 1
      if next_idx == len(all_triplets):
        break

    if next_prob_to_include <= .5:
      bounded = True

  out = (fail_optimal, success_optimal, fail_lower_bound, fail_upper_bound,
         success_bounded)
  return out
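A hedged sketch of plotting the five lists returned by make_curve: the matplotlib calls reflect common usage rather than this project's own plotting code, and the data below is fabricated rather than taken from a real confidence report.

# Illustrative plot of the pieces returned by make_curve. The lists below
# are fabricated; in practice they come from make_curve(report, ...).
import matplotlib.pyplot as plt

fail_optimal = [0.0, 0.0, 0.1, 0.1, 0.2]
success_optimal = [0.0, 0.6, 0.6, 0.8, 0.8]
fail_lower_bound = [0.25, 0.3]
fail_upper_bound = [0.4, 0.5]
success_bounded = [0.85, 0.9]

plt.plot(fail_optimal, success_optimal, label="MaxConfidence optimal (t >= .5)")
plt.plot(fail_lower_bound, success_bounded, '--', label="t < .5 lower bound")
plt.plot(fail_upper_bound, success_bounded, ':', label="t < .5 upper bound")
plt.xlabel("Failure rate on adversarial examples")
plt.ylabel("Success rate on clean examples")
plt.legend()
plt.show()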
Example #12
def train_ae(sess,
             loss,
             x_train,
             x_train_target,
             init_all=False,
             evaluate=None,
             feed=None,
             args=None,
             rng=None,
             var_list=None,
             fprop_args=None,
             optimizer=None,
             devices=None,
             x_batch_preprocessor=None,
             use_ema=False,
             ema_decay=.998,
             run_canary=None,
             loss_threshold=1e5,
             dataset_train=None,
             dataset_size=None):
    """
    Run (optionally multi-replica, synchronous) training of an autoencoder
    to minimize `loss`. This mirrors `train` above, but the loss is
    evaluated on (x, x_target) pairs instead of (x, y) pairs, so
    `x_train_target` replaces `y_train`. See `train` for the meaning of
    the remaining parameters.
    """
    # Check whether the hardware is working correctly
    start_time = time.time()
    canary.run_canary()
    if run_canary is not None:
        warnings.warn("The `run_canary` argument is deprecated. The canary "
                      "is now much cheaper and thus runs all the time. The "
                      "canary now uses its own loss function so it is not "
                      "necessary to turn off the canary when training with "
                      " a stochastic loss. Simply quit passing `run_canary`."
                      "Passing `run_canary` may become an error on or after "
                      "2019-10-16.")

    args = _ArgsWrapper(args or {})
    fprop_args = fprop_args or {}

    # Check that necessary arguments were given (see doc above)
    # Be sure to support 0 epochs for debugging purposes
    if args.nb_epochs is None:
        raise ValueError("`args` must specify number of epochs")
    if optimizer is None:
        if args.learning_rate is None:
            raise ValueError("Learning rate was not given in args dict")
    assert args.batch_size, "Batch size was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    if optimizer is None:
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    else:
        if not isinstance(optimizer, tf.train.Optimizer):
            raise ValueError("optimizer object must be from a child class of "
                             "tf.train.Optimizer")

    grads = []
    xs = []
    xs_t = []
    preprocessed_xs = []
    preprocessed_xs_t = []
    #ys = []
    if dataset_train is not None:
        assert x_train is None and x_batch_preprocessor is None
        if dataset_size is None:
            raise ValueError("You must provide a dataset size")
        data_iterator = dataset_train.make_one_shot_iterator().get_next()
        x_train, x_train_target = sess.run(data_iterator)

    devices = infer_devices(devices)
    for device in devices:
        with tf.device(device):
            x = tf.placeholder(x_train.dtype, (None, ) + x_train.shape[1:])
            x_t = tf.placeholder(x_train_target.dtype,
                                 (None, ) + x_train_target.shape[1:])
            #y = tf.placeholder(y_train.dtype, (None,) + y_train.shape[1:])
            xs.append(x)
            xs_t.append(x_t)
            #ys.append(y)

            if x_batch_preprocessor is not None:
                x = x_batch_preprocessor(x)
                x_t = x_batch_preprocessor(x_t)

            # We need to keep track of these so that the canary can feed
            # preprocessed values. If the canary had to feed raw values,
            # stochastic preprocessing could make the canary fail.
            preprocessed_xs.append(x)
            preprocessed_xs_t.append(x_t)

            loss_value = loss.fprop(x, x_t, **fprop_args)

            grads.append(
                optimizer.compute_gradients(loss_value, var_list=var_list))
    num_devices = len(devices)
    print("num_devices: ", num_devices)

    grad = avg_grads(grads)
    # Trigger update operations within the default graph (such as batch_norm).
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_step = optimizer.apply_gradients(grad)

    epoch_tf = tf.placeholder(tf.int32, [])
    batch_tf = tf.placeholder(tf.int32, [])

    if use_ema:
        if callable(ema_decay):
            ema_decay = ema_decay(epoch_tf, batch_tf)
        ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
        with tf.control_dependencies([train_step]):
            train_step = ema.apply(var_list)
        # Get pointers to the EMA's running average variables
        avg_params = [ema.average(param) for param in var_list]
        # Make temporary buffers used for swapping the live and running average
        # parameters
        tmp_params = [
            tf.Variable(param, trainable=False) for param in var_list
        ]
        # Define the swapping operation
        param_to_tmp = [
            tf.assign(tmp, param)
            for tmp, param in safe_zip(tmp_params, var_list)
        ]
        with tf.control_dependencies(param_to_tmp):
            avg_to_param = [
                tf.assign(param, avg)
                for param, avg in safe_zip(var_list, avg_params)
            ]
        with tf.control_dependencies(avg_to_param):
            tmp_to_avg = [
                tf.assign(avg, tmp)
                for avg, tmp in safe_zip(avg_params, tmp_params)
            ]
        swap = tmp_to_avg

    batch_size = args.batch_size

    assert batch_size % num_devices == 0
    device_batch_size = batch_size // num_devices

    if init_all:
        sess.run(tf.global_variables_initializer())
    else:
        initialize_uninitialized_global_variables(sess)

    for epoch in xrange(args.nb_epochs):
        if dataset_train is not None:
            nb_batches = int(math.ceil(float(dataset_size) / batch_size))
        else:
            # Indices to shuffle training set
            index_shuf = list(range(len(x_train)))
            # Randomly repeat a few training examples each epoch to avoid
            # having a too-small batch
            while len(index_shuf) % batch_size != 0:
                index_shuf.append(rng.randint(len(x_train)))
            nb_batches = len(index_shuf) // batch_size
            rng.shuffle(index_shuf)
            # Shuffling here versus inside the loop doesn't seem to affect
            # timing very much, but shuffling here makes the code slightly
            # easier to read
            x_train_shuffled = x_train[index_shuf]
            x_train_target_shuffled = x_train_target[index_shuf]
            #y_train_shuffled = y_train[index_shuf]

        prev = time.time()
        for batch in range(nb_batches):
            if dataset_train is not None:
                x_train_shuffled, x_train_target_shuffled = sess.run(
                    data_iterator)
                start, end = 0, batch_size
            else:
                # Compute batch start and end indices
                start = batch * batch_size
                end = (batch + 1) * batch_size
                # Perform one training step
                diff = end - start
                assert diff == batch_size

            feed_dict = {epoch_tf: epoch, batch_tf: batch}
            for dev_idx in xrange(num_devices):
                cur_start = start + dev_idx * device_batch_size
                cur_end = start + (dev_idx + 1) * device_batch_size
                feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
                feed_dict[
                    xs_t[dev_idx]] = x_train_target_shuffled[cur_start:cur_end]
                #feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]
            if cur_end != end and dataset_train is None:
                msg = ("batch_size (%d) must be a multiple of num_devices "
                       "(%d).\nCUDA_VISIBLE_DEVICES: %s"
                       "\ndevices: %s")
                args = (batch_size, num_devices,
                        os.environ['CUDA_VISIBLE_DEVICES'], str(devices))
                raise ValueError(msg % args)
            if feed is not None:
                feed_dict.update(feed)

            _, loss_numpy = sess.run([train_step, loss_value],
                                     feed_dict=feed_dict)

            if np.abs(loss_numpy) > loss_threshold:
                raise ValueError("Extreme loss during training: ", loss_numpy)
            if np.isnan(loss_numpy) or np.isinf(loss_numpy):
                raise ValueError("NaN/Inf loss during training")
        assert (dataset_train is not None
                or end == len(index_shuf))  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) +
                     " seconds")
        if evaluate is not None:
            if use_ema:
                # Before running evaluation, load the running average
                # parameters into the live slot, so we can see how well
                # the EMA parameters are performing
                sess.run(swap)
            evaluate()
            if use_ema:
                # Swap the parameters back, so that we continue training
                # on the live parameters
                sess.run(swap)
    if use_ema:
        # When training is done, swap the running average parameters into
        # the live slot, so that we use them when we deploy the model
        sess.run(swap)
    end_time = time.time()
    print("Time taken for training: ", end_time - start_time)
    return True
Example #13
File: train.py Project: ATPGN/ATPGN
def train_with_PGN(sess, model, loss, train_type='naive', evaluate=None, args=None,
          rng=None, classifier_var_list=None, generator_var_list=None, save_dir=None,
          fprop_args=None, optimizer=None, use_ema=False, ema_decay=.998,
          loss_threshold=1e10, dataset_train=None, dataset_size=None):
  """
  Run (optionally multi-replica, synchronous) training to minimize `loss`
  :param sess: TF session to use when training the graph
  :param loss: tensor, the loss to minimize
  :param model: cleverhans.model.Model, the model that is saved to
      `save_dir` after training
  :param train_type: str; if 'PGN', also trains the perturbation
      generator network using `generator_var_list`
  :param evaluate: function that is run after each training iteration
                   (typically to display the test/validation accuracy).
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
  :param rng: Instance of numpy.random.RandomState
  :param classifier_var_list: Optional list of classifier parameters to train.
  :param generator_var_list: Optional list of generator parameters to train
      (used when `train_type` is 'PGN').
  :param save_dir: directory where 'model.joblib' is written after training.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :param use_ema: bool
      If true, uses an exponential moving average of the model parameters
  :param ema_decay: float or callable
      The decay parameter for EMA, if EMA is used
      If a callable rather than a float, this is a callable that takes
      the epoch and batch as arguments and returns the ema_decay for
      the current batch.
  :param loss_threshold: float
      Raise an exception if the loss exceeds this value.
      This is intended to rapidly detect numerical problems.
      Sometimes the loss may legitimately be higher than this value. In
      such cases, raise the value. If needed it can be np.inf.
  :param dataset_train: tf Dataset instance.
      Used as a replacement for x_train, y_train for faster performance.
  :param dataset_size: integer, the size of the dataset_train.
  :return: True if model trained
  """

  # Check whether the hardware is working correctly
  canary.run_canary()
  args = _ArgsWrapper(args or {})
  fprop_args = fprop_args or {}

  # Check that necessary arguments were given (see doc above)
  # Be sure to support 0 epochs for debugging purposes
  if args.nb_epochs is None:
    raise ValueError("`args` must specify number of epochs")
  if optimizer is None:
    if args.learning_rate is None:
      raise ValueError("Learning rate was not given in args dict")
  assert args.batch_size, "Batch size was not given in args dict"
  assert dataset_train and dataset_size, "dataset_train or dataset_size was not given"

  if rng is None:
    rng = np.random.RandomState()

  if optimizer is None:
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  else:
    if not isinstance(optimizer, tf.train.Optimizer):
      raise ValueError("optimizer object must be from a child class of "
                       "tf.train.Optimizer")

  grads_classifier = []
  if train_type == 'PGN':
    grads_generator = []
  xs = []
  ys = []
  data_iterator = dataset_train.make_one_shot_iterator().get_next()
  x_train, y_train = sess.run(data_iterator)

  devices = infer_devices()
  for device in devices:
    with tf.device(device):
      x = tf.placeholder(x_train.dtype, (None,) + x_train.shape[1:])
      y = tf.placeholder(y_train.dtype, (None,) + y_train.shape[1:])
      xs.append(x)
      ys.append(y)
      if train_type == 'PGN':
        loss_classifier, loss_generator = loss.fprop(x, y, **fprop_args)
      else:
        loss_classifier = loss.fprop(x, y, **fprop_args)
      grads_classifier.append(optimizer.compute_gradients(loss_classifier, var_list=classifier_var_list))
      if train_type == 'PGN':
        grads_generator.append(optimizer.compute_gradients(loss_generator, var_list=generator_var_list))

  num_devices = len(devices)
  print("num_devices: ", num_devices)

  grad_classifier = avg_grads(grads_classifier)
  if train_type == 'PGN':
    grad_generator = avg_grads(grads_generator)
  # Trigger update operations within the default graph (such as batch_norm).
  with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    train_step = optimizer.apply_gradients(grad_classifier)
    if train_type == 'PGN':
      with tf.control_dependencies([train_step]):
        train_step = optimizer.apply_gradients(grad_generator)

  var_list = classifier_var_list
  if train_type == 'PGN':
    # Avoid mutating the caller's classifier_var_list in place.
    var_list = var_list + generator_var_list
  if use_ema:
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    with tf.control_dependencies([train_step]):
      train_step = ema.apply(var_list)
    # Get pointers to the EMA's running average variables
    avg_params = [ema.average(param) for param in var_list]
    # Make temporary buffers used for swapping the live and running average
    # parameters
    tmp_params = [tf.Variable(param, trainable=False)
                  for param in var_list]
    # Define the swapping operation
    param_to_tmp = [tf.assign(tmp, param)
                    for tmp, param in safe_zip(tmp_params, var_list)]
    with tf.control_dependencies(param_to_tmp):
      avg_to_param = [tf.assign(param, avg)
                      for param, avg in safe_zip(var_list, avg_params)]
    with tf.control_dependencies(avg_to_param):
      tmp_to_avg = [tf.assign(avg, tmp)
                    for avg, tmp in safe_zip(avg_params, tmp_params)]
    swap = tmp_to_avg

  batch_size = args.batch_size

  assert batch_size % num_devices == 0
  device_batch_size = batch_size // num_devices

  sess.run(tf.global_variables_initializer())
  best_acc = 0.0

  for epoch in xrange(args.nb_epochs):
    nb_batches = int(math.ceil(float(dataset_size) / batch_size))
    prev = time.time()
    for batch in range(nb_batches):
      x_train_shuffled, y_train_shuffled = sess.run(data_iterator)
      start, end = 0, batch_size
      feed_dict = dict()
      for dev_idx in xrange(num_devices):
        cur_start = start + dev_idx * device_batch_size
        cur_end = start + (dev_idx + 1) * device_batch_size
        feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
        feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]

      
      _, loss_classifier_numpy = sess.run([train_step, loss_classifier], feed_dict=feed_dict)

      if np.abs(loss_classifier_numpy) > loss_threshold:
        raise ValueError("Extreme loss_classifier during training: ", loss_classifier_numpy)
      if np.isnan(loss_classifier_numpy) or np.isinf(loss_classifier_numpy):
        raise ValueError("NaN/Inf loss_classifier during training")
    cur = time.time()
    _logger.info("Epoch " + str(epoch) + " took " +
                 str(cur - prev) + " seconds")
    if evaluate is not None:
      if use_ema:
        sess.run(swap)
      r_value = evaluate(epoch)

      if use_ema:
        sess.run(swap)
  if use_ema:
    sess.run(swap)

  with sess.as_default():
    save_path = os.path.join(save_dir,'model.joblib')
    save(save_path, model)

  return True
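This loop, like train_ae above, requires a `dataset_train`/`dataset_size` pair rather than NumPy arrays. Below is a minimal TF1-style sketch of constructing such a pair; the shapes, sizes, and the direct `sess.run(data_iterator)` check are toy illustrations, not the project's data pipeline.

# Toy construction of the dataset_train / dataset_size arguments expected
# by train_with_PGN (TF1 API, matching the rest of these examples).
import numpy as np
import tensorflow as tf

x = np.random.rand(100, 28, 28, 1).astype(np.float32)
y = np.eye(10)[np.random.randint(0, 10, 100)].astype(np.float32)
dataset_train = (tf.data.Dataset.from_tensor_slices((x, y))
                 .shuffle(100).batch(32).repeat())
dataset_size = 100

# The training loop pulls batches via a one-shot iterator, as above:
data_iterator = dataset_train.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  xb, yb = sess.run(data_iterator)
  print(xb.shape, yb.shape)  # (32, 28, 28, 1) (32, 10)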
Example #14
def train(sess, loss, x_train, y_train,
          init_all=True, evaluate=None, feed=None, args=None,
          rng=None, var_list=None, fprop_args=None, optimizer=None,
          devices=None, x_batch_preprocessor=None):
    """
    Run (optionally multi-replica, synchronous) training to minimize `loss`
    :param sess: TF session to use when training the graph
    :param loss: tensor, the loss to minimize
    :param x_train: numpy array with training inputs
    :param y_train: numpy array with training outputs
    :param init_all: (boolean) If set to true, all TF variables in the session
                     are (re)initialized, otherwise only previously
                     uninitialized variables are initialized before training.
    :param evaluate: function that is run after each training iteration
                     (typically to display the test/validation accuracy).
    :param feed: An optional dictionary that is appended to the feeding
                 dictionary before the session runs. Can be used to feed
                 the learning phase of a Keras model for instance.
    :param args: dict or argparse `Namespace` object.
                 Should contain `nb_epochs`, `learning_rate`,
                 `batch_size`
    :param rng: Instance of numpy.random.RandomState
    :param var_list: Optional list of parameters to train.
    :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
    :param optimizer: Optimizer to be used for training
    :param devices: list of device names to use for training
        If None, defaults to: all GPUs, if GPUs are available
                              all devices, if no GPUs are available
    :param x_batch_preprocessor: callable
        Takes a single tensor containing an x_train batch as input
        Returns a single tensor containing an x_train batch as output
        Called to preprocess the data before passing the data to the Loss
    :return: True if model trained
    """
    args = _ArgsWrapper(args or {})
    fprop_args = fprop_args or {}

    # Check that necessary arguments were given (see doc above)
    assert args.nb_epochs, "Number of epochs was not given in args dict"
    if optimizer is None:
        if args.learning_rate is None:
            raise ValueError("Learning rate was not given in args dict")
    assert args.batch_size, "Batch size was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    if optimizer is None:
        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    else:
        if not isinstance(optimizer, tf.train.Optimizer):
            raise ValueError("optimizer object must be from a child class of "
                             "tf.train.Optimizer")

    grads = []
    xs = []
    preprocessed_xs = []
    ys = []

    devices = infer_devices(devices)
    for idx, device in enumerate(devices):
        with tf.device(device):
            x = tf.placeholder(x_train.dtype, (None,) + x_train.shape[1:])
            y = tf.placeholder(y_train.dtype, (None,) + y_train.shape[1:])
            xs.append(x)
            ys.append(y)

            if x_batch_preprocessor is not None:
                x = x_batch_preprocessor(x)

            # We need to keep track of these so that the canary can feed
            # preprocessed values. If the canary had to feed raw values,
            # stochastic preprocessing could make the canary fail.
            preprocessed_xs.append(x)

            loss_value = loss.fprop(x, y, **fprop_args)

            grads.append(optimizer.compute_gradients(
                loss_value, var_list=var_list))
    num_devices = len(devices)
    print("num_devices: ", num_devices)

    grad = avg_grads(grads)
    # Trigger update operations within the default graph (such as batch_norm).
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_step = optimizer.apply_gradients(grad)

    batch_size = args.batch_size

    assert batch_size % num_devices == 0
    device_batch_size = batch_size // num_devices

    if init_all:
        sess.run(tf.global_variables_initializer())
    else:
        initialize_uninitialized_global_variables(sess)

    # Check whether the hardware is working correctly

    # So far the failure has only been observed with 3 or more GPUs
    run_canary = num_devices > 2
    if run_canary:
        canary_feed_dict = {}
        for x, y in safe_zip(preprocessed_xs, ys):
            canary_feed_dict[x] = x_train[:device_batch_size].copy()
            canary_feed_dict[y] = y_train[:device_batch_size].copy()
        # To reduce the runtime and memory cost of this canary,
        # we test the gradient of only one parameter.
        # For now this is just set to the first parameter in the list,
        # because it is an index that is always guaranteed to work.
        # If we think that this is causing false negatives and we should
        # test other parameters, we could test a random parameter from
        # the list or we could rewrite the canary to examine more than
        # one parameter.
        param_to_test = 0
        grad_vars = []
        for i in xrange(num_devices):
            dev_grads = grads[i]
            grad_vars.append(dev_grads[param_to_test][0])
        grad_values = sess.run(grad_vars, feed_dict=canary_feed_dict)
        failed = False
        for i in xrange(1, num_devices):
            if grad_values[0].shape != grad_values[i].shape:
                print("shape 0 does not match shape %d:" % i,
                      grad_values[0].shape, grad_values[i].shape)
                failed = True
                continue
            if not np.allclose(grad_values[0], grad_values[i], atol=1e-6):
                print("grad_values[0]: ",
                      grad_values[0].mean(), grad_values[0].max())
                print("grad_values[%d]: " %
                      i, grad_values[i].mean(), grad_values[i].max())
                print("max diff: ", np.abs(
                    grad_values[0] - grad_values[i]).max())
                failed = True
        if failed:
            print("Canary failed.")
            quit()

    for epoch in xrange(args.nb_epochs):
        # Indices to shuffle training set
        index_shuf = list(range(len(x_train)))
        # Randomly repeat a few training examples each epoch to avoid
        # having a too-small batch
        while len(index_shuf) % batch_size != 0:
            index_shuf.append(rng.randint(len(x_train)))
        nb_batches = len(index_shuf) // batch_size
        rng.shuffle(index_shuf)
        # Shuffling here versus inside the loop doesn't seem to affect
        # timing very much, but shuffling here makes the code slightly
        # easier to read
        x_train_shuffled = x_train[index_shuf]
        y_train_shuffled = y_train[index_shuf]

        prev = time.time()
        for batch in range(nb_batches):

            # Compute batch start and end indices
            start = batch * batch_size
            end = (batch + 1) * batch_size
            # start, end = batch_indices(
            #    batch, len(x_train), args.batch_size)

            # Perform one training step
            feed_dict = {}
            diff = end - start
            assert diff == batch_size
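            # Each device receives a contiguous slice of device_batch_size
            # examples from the current shuffled batch.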
            for dev_idx in xrange(num_devices):
                cur_start = start + dev_idx * device_batch_size
                cur_end = start + (dev_idx + 1) * device_batch_size
                feed_dict[xs[dev_idx]
                          ] = x_train_shuffled[cur_start:cur_end]
                feed_dict[ys[dev_idx]
                          ] = y_train_shuffled[cur_start:cur_end]
            if cur_end != end:
                msg = ("batch_size (%d) must be a multiple of num_devices "
                       "(%d).\nCUDA_VISIBLE_DEVICES: %s"
                       "\ndevices: %s")
                args = (batch_size, num_devices,
                        os.environ['CUDA_VISIBLE_DEVICES'],
                        str(devices))
                raise ValueError(msg % args)
            if feed is not None:
                feed_dict.update(feed)
            sess.run(train_step, feed_dict=feed_dict)
        assert end == len(index_shuf)  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " +
                     str(cur - prev) + " seconds")
        if evaluate is not None:
            evaluate()

    return True
예제 #15
0
def train_with_noise(sess, loss, x_train, y_train,
          init_all=False, evaluate=None, feed=None, args=None,
          rng=None, var_list=None, fprop_args=None, optimizer=None,
          devices=None, x_batch_preprocessor=None, use_ema=False,
          ema_decay=.998, run_canary=None,
          loss_threshold=1e5, dataset_train=None, dataset_size=None,
          save=False, type="normal", datasetName="MNIST", retrain=False, discretizeColor=1):
  """
  Run (optionally multi-replica, synchronous) training to minimize `loss`
  :param sess: TF session to use when training the graph
  :param loss: tensor, the loss to minimize
  :param x_train: numpy array with training inputs or tf Dataset
  :param y_train: numpy array with training outputs or tf Dataset
  :param init_all: (boolean) If set to true, all TF variables in the session
                   are (re)initialized, otherwise only previously
                   uninitialized variables are initialized before training.
  :param evaluate: function that is run after each training iteration
                   (typically to display the test/validation accuracy).
  :param feed: An optional dictionary that is appended to the feeding
               dictionary before the session runs. Can be used to feed
               the learning phase of a Keras model for instance.
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
  :param rng: Instance of numpy.random.RandomState
  :param var_list: Optional list of parameters to train.
  :param fprop_args: dict, extra arguments to pass to fprop (loss and model).
  :param optimizer: Optimizer to be used for training
  :param devices: list of device names to use for training
      If None, defaults to: all GPUs, if GPUs are available
                            all devices, if no GPUs are available
  :param x_batch_preprocessor: callable
      Takes a single tensor containing an x_train batch as input
      Returns a single tensor containing an x_train batch as output
      Called to preprocess the data before passing the data to the Loss
  :param use_ema: bool
      If true, uses an exponential moving average of the model parameters
  :param ema_decay: float or callable
      The decay parameter for EMA, if EMA is used
      If a callable rather than a float, this is a callable that takes
      the epoch and batch as arguments and returns the ema_decay for
      the current batch.
  :param loss_threshold: float
      Raise an exception if the loss exceeds this value.
      This is intended to rapidly detect numerical problems.
      Sometimes the loss may legitimately be higher than this value. In
      such cases, raise the value. If needed it can be np.inf.
  :param dataset_train: tf Dataset instance.
      Used as a replacement for x_train, y_train for faster performance.
  :param dataset_size: integer, the size of the dataset_train.
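  :param save: bool
      If True, checkpoints are written to args.train_dir every 50 epochs and
      at the end of training.
  :param type: str
      "normal" trains on discretized clean inputs; "noise" adds uniform noise
      to the inputs each epoch before discretization.
  :param datasetName: str
      "MNIST" or "CIFAR10"; selects the noise magnitude used when type is
      "noise".
  :param retrain: bool
      If True, training resumes from the latest checkpoint in args.train_dir.
  :param discretizeColor: passed through to convert_uniimage to control how
      the input colors are discretized.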
  :return: True if model trained
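
  Example (a minimal sketch: the loss object and data arrays are assumed to
  be defined by the caller, and `train_dir`/`filename` in `args` are only
  needed because `save=True` is passed)::

      train_with_noise(sess, loss, x_train, y_train,
                       args={'nb_epochs': 100, 'batch_size': 128,
                             'learning_rate': 1e-3, 'train_dir': 'ckpts',
                             'filename': 'model.ckpt'},
                       save=True, type="noise", datasetName="MNIST")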
  """

  _, width, height, channel = list(np.shape(x_train))

  # Check whether the hardware is working correctly
  canary.run_canary()
  if run_canary is not None:
    warnings.warn("The `run_canary` argument is deprecated. The canary "
                  "is now much cheaper and thus runs all the time. The "
                  "canary now uses its own loss function so it is not "
                  "necessary to turn off the canary when training with "
                  "a stochastic loss. Simply stop passing `run_canary`. "
                  "Passing `run_canary` may become an error on or after "
                  "2019-10-16.")

  args = _ArgsWrapper(args or {})
  fprop_args = fprop_args or {}

  # Check that necessary arguments were given (see doc above)
  # Be sure to support 0 epochs for debugging purposes
  if args.nb_epochs is None:
    raise ValueError("`args` must specify number of epochs")
  if optimizer is None:
    if args.learning_rate is None:
      raise ValueError("Learning rate was not given in args dict")
  assert args.batch_size, "Batch size was not given in args dict"

  if rng is None:
    rng = np.random.RandomState()

  if optimizer is None:
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
  else:
    if not isinstance(optimizer, tf.train.Optimizer):
      raise ValueError("optimizer object must be from a child class of "
                       "tf.train.Optimizer")

  grads = []
  xs = []
  preprocessed_xs = []
  ys = []
  if dataset_train is not None:
    assert x_train is None and y_train is None and x_batch_preprocessor is None
    if dataset_size is None:
      raise ValueError("You must provide a dataset size")
    data_iterator = dataset_train.make_one_shot_iterator().get_next()
    x_train, y_train = sess.run(data_iterator)
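    # A single batch is pulled eagerly here so that the placeholder dtypes
    # and shapes created below can be inferred from concrete numpy arrays.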

  devices = infer_devices(devices)
  for device in devices:
    with tf.device(device):
      x = tf.placeholder(x_train.dtype, (None,) + x_train.shape[1:])
      y = tf.placeholder(y_train.dtype, (None,) + y_train.shape[1:])
      xs.append(x)
      ys.append(y)

      if x_batch_preprocessor is not None:
        x = x_batch_preprocessor(x)

      # We need to keep track of these so that the canary can feed
      # preprocessed values. If the canary had to feed raw values,
      # stochastic preprocessing could make the canary fail.
      preprocessed_xs.append(x)

      loss_value = loss.fprop(x, y, **fprop_args)

      grads.append(optimizer.compute_gradients(
          loss_value, var_list=var_list))
  num_devices = len(devices)
  print("num_devices: ", num_devices)

  grad = avg_grads(grads)
  # Trigger update operations within the default graph (such as batch_norm).
  with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    train_step = optimizer.apply_gradients(grad)

  epoch_tf = tf.placeholder(tf.int32, [])
  batch_tf = tf.placeholder(tf.int32, [])

  if use_ema:
    if callable(ema_decay):
      ema_decay = ema_decay(epoch_tf, batch_tf)
    ema = tf.train.ExponentialMovingAverage(decay=ema_decay)
    with tf.control_dependencies([train_step]):
      train_step = ema.apply(var_list)
    # Get pointers to the EMA's running average variables
    avg_params = [ema.average(param) for param in var_list]
    # Make temporary buffers used for swapping the live and running average
    # parameters
    tmp_params = [tf.Variable(param, trainable=False)
                  for param in var_list]
    # Define the swapping operation
    param_to_tmp = [tf.assign(tmp, param)
                    for tmp, param in safe_zip(tmp_params, var_list)]
    with tf.control_dependencies(param_to_tmp):
      avg_to_param = [tf.assign(param, avg)
                      for param, avg in safe_zip(var_list, avg_params)]
    with tf.control_dependencies(avg_to_param):
      tmp_to_avg = [tf.assign(avg, tmp)
                    for avg, tmp in safe_zip(avg_params, tmp_params)]
    swap = tmp_to_avg
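    # Running `swap` exchanges the live parameters with the EMA copies via
    # the temporary buffers; running it a second time restores the original
    # assignment, which is how the evaluation code below toggles between them.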

  batch_size = args.batch_size

  assert batch_size % num_devices == 0
  device_batch_size = batch_size // num_devices

  saver = tf.train.Saver(max_to_keep=100)
  startingEpoch = 0
  # if retrainEpoch is not None:
  #   startingEpoch = retrainEpoch
  if init_all:
    sess.run(tf.global_variables_initializer())
  else:
    initialize_uninitialized_global_variables(sess)

  # Used for retraining the model
  if retrain:
    print("Retrain is in progress...")
    # Set retrain to False once it has started, in case this script runs
    # through multiple training loops.
    retrain = False
    latestFileName = tf.train.latest_checkpoint(args.train_dir, latest_filename=None)
    splitFileName = latestFileName.split("-")

    startingEpoch = int(splitFileName[-1])
    model_path = os.path.join(args.train_dir, args.filename+"-"+str(startingEpoch))
    print("Trying to load trained model from: "+model_path)
    if os.path.exists(model_path + ".meta"):
      tf_model_load(sess, model_path)
      print("Loaded trained model")

  # x_train = x_train[0:10]
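  # convert_uniimage (a project-specific helper, assumed to discretize the
  # image colors according to discretizeColor) is applied to clean inputs in
  # the "normal" mode; in the "noise" mode it is applied after noise is added
  # inside the epoch loop below.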
  feed_x_train = x_train
  if type == "normal":
    feed_x_train = convert_uniimage(x_train, discretizeColor)

  for epoch in xrange(startingEpoch, args.nb_epochs):
    tmpX = np.copy(x_train)
    if type == "noise":
      # Use it for MNIST and Fashion MNIST
      if datasetName == "MNIST":
        tmpX = np.clip(x_train+(np.random.uniform(0, 0.8, (len(x_train), width, height, channel)) - 0.4), 0, 1)

      # Use it for CIFAR10
      if datasetName == "CIFAR10":
        tmpX = np.clip(x_train+(np.random.uniform(0, 0.3, (len(x_train), width, height, channel)) - 0.15), 0, 1)
      feed_x_train = convert_uniimage(tmpX, discretizeColor)
    ##################
    # Showing images #
    ##################
    showImg = False  # Set to True to visualize original, noisy, and discretized inputs
    if showImg:
      shapeImg = (width, height, channel)
      if channel == 1:
        shapeImg = (width, height)
      for iii in range(len(feed_x_train)):
        fig = plt.figure()
        pixels = x_train[iii].reshape(shapeImg)
        sub = fig.add_subplot(1, 4, 1)
        plt.imshow(pixels, cmap='gray')
        pixels = tmpX[iii].reshape(shapeImg)
        sub = fig.add_subplot(1, 4, 2)
        plt.imshow(pixels, cmap='gray')
        pixels = feed_x_train[iii].reshape(shapeImg)
        sub = fig.add_subplot(1, 4, 3)
        plt.imshow(pixels, cmap='gray')
        # pixels = X_cur[iii].reshape((width, height, channel))
        # sub = fig.add_subplot(1, 4, 4)
        # plt.imshow(pixels, cmap='gray')
        # pixels = adv_x[iii].reshape((28, 28)) - xtrain[iii].reshape((28, 28))
        # print(np.mean(np.sum((adv_x[iii:iii+1] - xtrain[iii:iii+1]) ** 2,
        #        axis=(1, 2, 3)) ** .5))
        # sub = fig.add_subplot(1, 3, iii+3)
        # plt.imshow(pixels / abs(pixels).max() * 0.2 + 0.5, cmap='gray')

        plt.show()

    if dataset_train is not None:
      nb_batches = int(math.ceil(float(dataset_size) / batch_size))
    else:
      # Indices to shuffle training set
      index_shuf = list(range(len(x_train)))
      # Randomly repeat a few training examples each epoch to avoid
      # having a too-small batch
      while len(index_shuf) % batch_size != 0:
        index_shuf.append(rng.randint(len(x_train)))
      nb_batches = len(index_shuf) // batch_size
      rng.shuffle(index_shuf)
      # Shuffling here versus inside the loop doesn't seem to affect
      # timing very much, but shuffling here makes the code slightly
      # easier to read
      x_train_shuffled = feed_x_train[index_shuf]
      y_train_shuffled = y_train[index_shuf]

    prev = time.time()
    for batch in range(nb_batches):
      if dataset_train is not None:
        x_train_shuffled, y_train_shuffled = sess.run(data_iterator)
        start, end = 0, batch_size
      else:
        # Compute batch start and end indices
        start = batch * batch_size
        end = (batch + 1) * batch_size
        # Perform one training step
        diff = end - start
        assert diff == batch_size

      feed_dict = {epoch_tf: epoch, batch_tf: batch}
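      # epoch_tf and batch_tf are fed so that a callable `ema_decay` (see the
      # EMA setup above) can derive its schedule from the current epoch and
      # batch.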
      for dev_idx in xrange(num_devices):
        cur_start = start + dev_idx * device_batch_size
        cur_end = start + (dev_idx + 1) * device_batch_size
        feed_dict[xs[dev_idx]] = x_train_shuffled[cur_start:cur_end]
        feed_dict[ys[dev_idx]] = y_train_shuffled[cur_start:cur_end]
      if cur_end != end and dataset_train is None:
        msg = ("batch_size (%d) must be a multiple of num_devices "
               "(%d).\nCUDA_VISIBLE_DEVICES: %s"
               "\ndevices: %s")
        args = (batch_size, num_devices,
                os.environ['CUDA_VISIBLE_DEVICES'],
                str(devices))
        raise ValueError(msg % args)
      if feed is not None:
        feed_dict.update(feed)

      _, loss_numpy = sess.run(
          [train_step, loss_value], feed_dict=feed_dict)

      if np.abs(loss_numpy) > loss_threshold:
        raise ValueError("Extreme loss during training: ", loss_numpy)
      if np.isnan(loss_numpy) or np.isinf(loss_numpy):
        raise ValueError("NaN/Inf loss during training")
    assert (dataset_train is not None or
            end == len(index_shuf))  # Check that all examples were used
    cur = time.time()
    _logger.info("Epoch " + str(epoch) + " took " +
                 str(cur - prev) + " seconds")
    if evaluate is not None:
      if use_ema:
        # Before running evaluation, load the running average
        # parameters into the live slot, so we can see how well
        # the EMA parameters are performing
        sess.run(swap)
      if (epoch + 1) % 10 == 0 or (epoch + 1) == args.nb_epochs:
        evaluate()
      if use_ema:
        # Swap the parameters back, so that we continue training
        # on the live parameters
        sess.run(swap)

    if save and ((epoch + 1) % 50 == 0 or (epoch + 1) == args.nb_epochs):
      with tf.device('/CPU:0'):
        save_path = os.path.join(args.train_dir, args.filename)
        if not tf.gfile.Exists(args.train_dir):
          tf.gfile.MakeDirs(args.train_dir)
        saver.save(sess, save_path, global_step=(epoch + 1))
      _logger.info("Saved checkpoint at epoch " + str(epoch + 1) + ": " +
                   str(save_path))

  if use_ema:
    # When training is done, swap the running average parameters into
    # the live slot, so that we use them when we deploy the model
    sess.run(swap)

  return True