Example #1
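import os
import glob

import numpy as np
import tensorflow as tf
from tqdm import tqdm

# NOTE: project-level helpers used below (opts, OPTION, get_time, set_seed,
# print_line, set_log, print_opts, get_dataset, get_batch, get_net_fn,
# get_session, get_variable, export, delay4gpus, aggregate_gradients,
# aggregate_statistics) are assumed to come from the repo's own modules,
# which are not shown in this snippet.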
def main():

  ####################################################################################################

  title = opts.title
  seed = opts.seed
  mode = opts.mode

  gpu_list = opts.gpu_list
  batch_size = opts.batch_size

  dataset = opts.dataset
  preprocess = opts.preprocess
  network = opts.network
  optimizer = opts.optimizer
  lr_decay = opts.lr_decay
  epoch_step = opts.epoch_step
  learning_step = opts.learning_step

  path_load = opts.path_load
  path_save = opts.path_save

  print_line()

  ####################################################################################################

  time_tag = get_time('%y-%m-%d %X')
  time_tag_short = time_tag[:8]
  seed = set_seed(seed)

  num_check_log = 0
  title_temp = title
  while True:
    path_log = '../log/' + time_tag_short + '(' + title_temp + ').txt'
    if os.path.isfile(path_log) and title != 'temp':  # if title is 'temp', we will overwrite it
      num_check_log += 1
      title_temp = title + '_%d' % num_check_log
    else:
      title = title_temp
      del num_check_log, title_temp
      break

  print('title: ' + title)
  set_log(path_log)
  print_line()

  ####################################################################################################

  print(time_tag)
  print('SEED = %d' % seed)

  print_opts('options/' + OPTION + '.py')
  print_line()

  ####################################################################################################

  model_dir = '../model/'

  if isinstance(path_save, bool):
    # if title is 'temp', we will not save model
    path_save = model_dir + time_tag_short + '(' + title + ').tf' if path_save and title != 'temp' else None

  if path_load is not None:
    # keyword search
    matches = glob.glob(model_dir + '*' + path_load + '*.tf.data*')
    if len(matches) == 0:
      raise FileNotFoundError('Could not find any model file matching the keyword ' + path_load)
    elif len(matches) > 1:
      for match in matches:
        print(match)
      raise FileNotFoundError('Found more than one model file matching the keyword ' + path_load)

    path_load = matches[0][:matches[0].find('.tf.') + 3]
    print('Found model at', path_load)

  ####################################################################################################

  os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(gpu) for gpu in gpu_list)
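  # Once CUDA_VISIBLE_DEVICES is set, TensorFlow sees only the selected GPUs,
  # remapped as /gpu:0 .. /gpu:N-1, which is why the tower loop below indexes
  # workers by i rather than by the original IDs in gpu_list.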
  num_worker = max(len(gpu_list), 1)
  dataset_train = get_dataset(dataset, split='train')
  dataset_test = get_dataset(dataset, split='test')

  num_batch_train = dataset_train.num_sample // batch_size
  num_batch_test = dataset_test.num_sample // 100

  assert batch_size % num_worker == 0, 'batch_size %d is not divisible by the number of workers %d' % (batch_size, num_worker)

  iterator_train = get_batch(dataset_train, preprocess, True, batch_size // num_worker, seed=seed)
  iterator_test = get_batch(dataset_test, preprocess, False, 100, seed=seed)

  ####################################################################################################

  if mode in ['input_train', 'input_test']:
    if mode == 'input_train':
      num_batch = num_batch_train
      batch_input = iterator_train.get_next()
    else:
      num_batch = num_batch_test
      batch_input = iterator_test.get_next()

    print('Testing the speed of data input pipeline.')
    sess = get_session()
    while True:
      for _ in tqdm(range(num_batch), desc='Input pipeline', leave=False, smoothing=0.1):
        sess.run(batch_input)

  ####################################################################################################

  nets = []
  net = get_net_fn(network)

  if num_worker == 1:
    if len(gpu_list) == 0:
      print('Multi-CPU training; it might be slow')
      print('All parameters are pinned to CPU, all Ops are pinned to CPU')
      is_cpu_ps = True
    else:
      print('Single-GPU training with gpu', gpu_list[0])
      print('All parameters are pinned to GPU, all Ops are pinned to GPU')
      is_cpu_ps = False

  elif num_worker > 1:
    print('Multi-GPU training tower with gpu list', gpu_list)
    print('All parameters are pinned to CPU, all Ops are pinned to GPU')
    print('Get batchnorm moving average updates from data in the first GPU for speed')
    print('Get L2 decay grads in the second GPU for speed')
    is_cpu_ps = True
  else:
    raise NotImplementedError('Unrecognized device settings')

  tower_grads = []
  tower_losses = []
  tower_errors = []

  # Loops over the number of workers and creates a copy ("tower") of the model on each worker.
  for i in range(num_worker):

    worker = '/gpu:%d' % i if gpu_list else '/cpu:0'

    # Creates a device setter used to determine where Ops are to be placed.
    if is_cpu_ps:
      # tf.train.replica_device_setter supports placing variables on the CPU, all
      # on one GPU, or on ps_servers defined in a cluster_spec.
      device_setter = tf.train.replica_device_setter(worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
    else:
      device_setter = worker

    '''
    1. pin ops to GPU
    2. pin parameters to CPU (multi-GPU training) or GPU (single-GPU training)
    3. reuse parameters for multi-GPU training

    # Creates variables on the first loop.  On subsequent loops reuse is set
    # to True, which results in the "towers" sharing variables.
    # tf.device calls the device_setter for each Op that is created.
    # device_setter returns the device the Op is to be placed on.
    '''
    with tf.variable_scope(tf.get_variable_scope(), reuse=bool(i != 0)), \
         tf.device(device_setter):

      print('Training model on GPU %d' % gpu_list[i] if gpu_list else 'Training model on CPUs')

      batch_train = iterator_train.get_next()

      if mode == 'speed_net':
        with tf.device('/cpu:0'):
          print('Testing the speed of the model with synthesized data, '
                'which is in theory the maximum speed at which this model can train')
          # reuse the shapes of the real batch fetched above; a second get_next() is not needed
          shape_x = [batch_size // num_worker] + batch_train[0].get_shape().as_list()[1:]
          shape_y = [batch_size // num_worker] + batch_train[1].get_shape().as_list()[1:]

          batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
          batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
        batch_train = [batch_train_x, batch_train_y]

      nets.append(net(batch_train[0], batch_train[1], opts=opts, is_training=True))

      tower_losses.append(nets[i].loss)
      tower_errors.append(nets[i].error)

      if i == 0:
        # We only get batchnorm moving average updates from data in the first worker for speed
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        nets[-1].count_parameters()
        nets[-1].count_MACs()
        nets[-1].count_MEMs()

      loss_worker = nets[i].loss
      if num_worker == 1:
        # Single-GPU or multi-CPU training
        loss_worker += nets[i].get_l2_loss()
      elif i == 1:
        # We only compute the L2 grads in the second worker for speed.
        # The L2 loss is multiplied by num_worker to keep the result equivalent.
        loss_worker += num_worker * nets[i].get_l2_loss()
      tower_grads.append(
        optimizer.compute_gradients(loss_worker, colocate_gradients_with_ops=True))

      if i == num_worker - 1:
        print('Testing model on GPU %d' % gpu_list[i] if gpu_list else 'Testing model on CPUs')
        tf.get_variable_scope().reuse_variables()
        batch_test = iterator_test.get_next()
        nets.append(net(batch_test[0], batch_test[1], opts=opts, is_training=False))
        error_batch_test = nets[-1].error

        if mode == 'attack':
          print('Attacking model on GPU %d' % gpu_list[i] if gpu_list else 'Attacking model on CPUs')
          tf.get_variable_scope().reuse_variables()
          batch_attack_x = tf.placeholder(shape=batch_test[0].get_shape(), dtype=batch_test[0].dtype)
          batch_attack_y = tf.placeholder(shape=batch_test[1].get_shape(), dtype=batch_test[1].dtype)
          nets.append(net(batch_attack_x, batch_attack_y, opts=opts, is_training=False))
          error_batch_attack = nets[-1].error

  with tf.device('/cpu:0' if is_cpu_ps else worker):
    grad_batch_train = aggregate_gradients(tower_grads)
    loss_batch_train = aggregate_statistics(tower_losses)
    error_batch_train = aggregate_statistics(tower_errors)

    with tf.control_dependencies(update_ops):
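      # running apply_gradients under control_dependencies(update_ops) forces the
      # batchnorm moving-average updates collected from the first worker to run
      # on every training step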
      train_op = optimizer.apply_gradients(grad_batch_train, global_step=learning_step)

  ####################################################################################################

  if hasattr(opts, 'delay'):
    delay4gpus(opts.delay, gpu_list=gpu_list)

  sess = get_session()
  saver = tf.train.Saver(max_to_keep=None)

  def evaluate():
    error_test = 0.
    for _ in tqdm(range(num_batch_test), desc='Test', leave=False, smoothing=0.1):
      error_test += sess.run(error_batch_test)
    return error_test / num_batch_test

  def attack(black=False, num_batch=None):
    error_fgsm = 0.
    delta = 1./64
    if num_batch is None: num_batch = num_batch_test
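    # FGSM: perturb each input one small step in the direction of the sign of
    # the loss gradient, x_adv = x + delta * sign(dL/dx). With black=True, the
    # attack replays previously saved adversarial samples (a black-box style
    # evaluation) instead of generating new ones.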

    if not black:
      adversarial_x = []
      adversarial_y = []
      for _ in tqdm(range(num_batch), desc='Attack', leave=False, smoothing=0.1):
        test_x, test_y, grads = sess.run([nets[1].H[0], nets[1].Y[0], nets[1].grads_H[0]])
        fgsm_x = test_x + delta * np.sign(grads)
        error_fgsm += sess.run(error_batch_attack, feed_dict={batch_attack_x: fgsm_x, batch_attack_y: test_y})
        adversarial_x.append(fgsm_x)
        adversarial_y.append(test_y)

      adversarial_x = np.array(adversarial_x)
      adversarial_y = np.array(adversarial_y)
      np.savez('adversarial_sample.npz', x=adversarial_x, y=adversarial_y)

    else:
      adversarial_sample = np.load('adversarial_sample.npz')
      adversarial_x = adversarial_sample['x']
      adversarial_y = adversarial_sample['y']
      num_batch = adversarial_x.shape[0]  # normalize by the number of replayed batches
      for i in tqdm(range(num_batch), desc='Attack', leave=False, smoothing=0.1):
        error_fgsm += sess.run(error_batch_attack,
                               feed_dict={batch_attack_x: adversarial_x[i, ...], batch_attack_y: adversarial_y[i, ...]})

    return error_fgsm / num_batch

  def save_model(path):
    saver.save(sess, path)
    print('S', end='')

  def load_model(path):
    print('Loading model from %s ...' % path)
    saver.restore(sess, path)

  if path_load is not None:
    load_model(path_load)
    error_test_best = evaluate()
    print('Test: %.4f' % error_test_best)

  if mode == 'attack':
    print(attack(black=False, num_batch=None))

  if mode == 'export':
    vars_list = get_variable('shift')[:48]
    vars_numpy = sess.run(vars_list)
    export(vars_numpy, 'shift')

  if mode in ['test', 'export', 'attack']:
    exit(0)

  print_line()

  ####################################################################################################

  while True:
    # update learning rate
    lr_epoch = sess.run(lr_decay)
    if lr_epoch <= 0:
      break
    epoch = sess.run(epoch_step)
    print('Epoch: %03d' % epoch, end=' ')

    loss_epoch = 0.
    error_epoch = 0.
    t0 = get_time()
    for batch in tqdm(range(num_batch_train), desc='Epoch: %03d' % epoch, leave=False, smoothing=0.1):

      if mode == 'debug':
        print('DEBUG: ', end='')
        _, loss_delta, error_delta, H, W, gradsH, gradsW, label_ = sess.run(
          [train_op, loss_batch_train, error_batch_train, nets[0].H, nets[0].W, nets[0].grads_H, nets[0].grads_W,
           nets[0].Y])
      else:
        _, loss_delta, error_delta = sess.run([train_op, loss_batch_train, error_batch_train])

      loss_epoch += loss_delta
      error_epoch += error_delta

    print('Loss: %.6f Train: %.4f' % (loss_epoch / num_batch_train, error_epoch / num_batch_train), end=' ')
    FPS = num_batch_train * batch_size / (get_time() - t0)

    error_test = evaluate()
    assert error_test > 1e-4, ('Invalid test error %f, something went wrong' % error_test)
    print('Test: %.4f lr: %.4f FPS: %d' % (error_test, lr_epoch, FPS), end=' ')

    sess.run(epoch_step.assign(epoch + 1))

    if epoch == 1:
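      # cap the initial best error at 0.9 so a still-poor first epoch is not
      # immediately checkpointed as the best model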
      error_test_best = min(error_test, 0.9)
    if error_test < error_test_best:
      print('B', end=' ')
      if path_save is not None:
        save_model(path_save)
      error_test_best = error_test

    print('')

  print_line()

  ####################################################################################################

  sess.close()
  print('Optimization ended at ' + get_time('%y-%m-%d %X'))
  return 0
Example #2
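import os
import glob

import tensorflow as tf
from tqdm import tqdm

# NOTE: as in Example #1, the project-level helpers used below are assumed to
# come from the repo's own modules, which are not shown in this snippet.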
def main():

    ####################################################################################################

    title = opts.title
    seed = opts.seed
    mode = opts.mode

    gpu_list = opts.gpu_list
    batch_size = opts.batch_size

    dataset = opts.dataset
    preprocess = opts.preprocess
    network = opts.network
    optimizer = opts.optimizer
    lr_decay = opts.lr_decay
    epoch_step = opts.epoch_step
    learning_step = opts.learning_step

    path_load = opts.path_load
    path_save = opts.path_save

    print_line()

    ####################################################################################################

    time_tag = get_time('%y-%m-%d %X')
    time_tag_short = time_tag[:8]
    seed = set_seed(seed)

    num_check_log = 0
    title_temp = title
    while True:
        path_log = '../log/' + time_tag_short + '(' + title_temp + ').txt'
        if os.path.isfile(path_log) and title != 'temp':  # if title is 'temp', we will overwrite it
            num_check_log += 1
            title_temp = title + '_%d' % num_check_log
        else:
            title = title_temp
            del num_check_log, title_temp
            break

    print('title: ' + title)
    set_log(path_log)
    print_line()

    ####################################################################################################

    print(time_tag)
    print('SEED = %d' % seed)

    print_opts('opts.py')
    print_line()

    ####################################################################################################

    if isinstance(path_save, bool):
        # if title is 'temp', we will not save model
        path_save = '../model/' + time_tag_short + '(' + title + ').tf' if path_save and title != 'temp' else None

    if path_load is not None:
        # keyword search
        matches = glob.glob('../model/*' + path_load + '*.tf.data*')
        assert len(matches) == 1, 'Found none or more than one model file matching the keyword ' + path_load
        path_load = matches[0][:matches[0].find('.tf.') + 3]
        print('Found model at', path_load)

    ####################################################################################################

    num_gpu = len(gpu_list)
    num_split = num_gpu if num_gpu > 0 else 1

    dataset_train = get_dataset(dataset, split='train')
    dataset_test = get_dataset(dataset, split='test')

    num_batch_train = dataset_train.num_sample // batch_size
    num_batch_test = dataset_test.num_sample // 100

    assert batch_size % num_split == 0, 'batch_size %d is not divisible by the number of splits %d' % (
        batch_size, num_split)

    iterator_train = get_batch(dataset_train,
                               preprocess,
                               True,
                               batch_size // num_split,
                               num_split,
                               seed=seed)
    iterator_test = get_batch(dataset_test,
                              preprocess,
                              False,
                              100,
                              num_split,
                              seed=seed)

    ####################################################################################################

    if mode in ['input_train', 'input_test']:
        if mode == 'input_train':
            num_batch = num_batch_train
            batch_input = iterator_train.get_next()
        else:
            num_batch = num_batch_test
            batch_input = iterator_test.get_next()

        sess = get_session(gpu_list)

        while True:
            for batch in tqdm(range(num_batch),
                              desc='Input pipeline',
                              leave=False,
                              smoothing=0.1):
                sess.run(batch_input)

    ####################################################################################################

    nets = []
    net = get_net_fn(network)

    if num_gpu == 1:
        print('Single-GPU training with gpu', gpu_list[0])
        print('All parameters are pinned to GPU, all Ops are pinned to GPU')
        is_cpu_ps = False
    elif num_gpu > 1:
        print('Multi-GPU training tower with gpu list', gpu_list)
        print('All parameters are pinned to CPU, all Ops are pinned to GPU')
        print(
            'Get batchnorm moving average updates from data in the first GPU for speed'
        )
        print('Get L2 decay grads in the second GPU for speed')
        is_cpu_ps = True
    else:
        print('CPU-only training; it may be very slow')
        print('All parameters are pinned to CPU, all Ops are pinned to CPU')
        is_cpu_ps = True

    tower_grads = []
    tower_losses = []
    tower_errors = []

    if num_gpu > 0:
        # Loops over the number of GPUs and creates a copy ("tower") of the model on each GPU.
        for i in range(num_gpu):

            worker = '/gpu:%d' % gpu_list[i]

            # Creates a device setter used to determine where Ops are to be placed.
            if is_cpu_ps:
                # tf.train.replica_device_setter supports placing variables on the CPU, all
                # on one GPU, or on ps_servers defined in a cluster_spec.
                device_setter = tf.train.replica_device_setter(
                    worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
            else:
                device_setter = worker
            '''
            1. pin ops to GPU
            2. pin parameters to CPU (multi-GPU training) or GPU (single-GPU training)
            3. reuse parameters for multi-GPU training

            # Creates variables on the first loop.  On subsequent loops reuse is set
            # to True, which results in the "towers" sharing variables.
            # tf.device calls the device_setter for each Op that is created.
            # device_setter returns the device the Op is to be placed on.
            '''
            with tf.variable_scope(tf.get_variable_scope(), reuse=bool(i != 0)), \
                 tf.device(device_setter):

                print('Training model on GPU %d' % gpu_list[i])

                if mode == 'speed_net':
                    with tf.device('/cpu:0'):
                        # use fake data to test the computation speed on the GPU
                        batch_train = iterator_train.get_next()
                        shape_x = [batch_size // num_gpu] + batch_train[0].get_shape().as_list()[1:]
                        shape_y = [batch_size // num_gpu] + batch_train[1].get_shape().as_list()[1:]

                        batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
                        batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
                    batch_train = [batch_train_x, batch_train_y]
                else:
                    batch_train = iterator_train.get_next()

                nets.append(
                    net(batch_train[0], batch_train[1], is_training=True))

                tower_losses.append(nets[i].loss)
                tower_errors.append(nets[i].error)

                if i == 0:
                    # We only get batchnorm moving average updates from data in the first GPU for speed.
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    nets[-1].total_parameters()
                    nets[-1].total_MACs()

                loss_worker = nets[i].loss
                if num_gpu == 1:
                    # Single-GPU training
                    loss_worker += nets[i].get_l2_loss()
                elif i == 1:
                    # We only compute the L2 grads in the second GPU for speed.
                    # The L2 loss is multiplied by num_gpu to keep the result equivalent.
                    loss_worker += num_gpu * nets[i].get_l2_loss()
                tower_grads.append(
                    optimizer.compute_gradients(
                        loss_worker, colocate_gradients_with_ops=True))

                if i == num_gpu - 1:
                    print('Testing model on GPU %d' % gpu_list[i])
                    if num_gpu == 1:
                        tf.get_variable_scope().reuse_variables()

                    batch_test = iterator_test.get_next()
                    nets.append(
                        net(batch_test[0], batch_test[1], is_training=False))
                    error_batch_test = nets[-1].error
    else:
        # training with only CPU
        with tf.variable_scope(tf.get_variable_scope()), \
             tf.device('/cpu:0'):
            print('Training model on CPU')
            if mode == 'speed_net':
                # use fake data to test the computation speed on the CPU;
                # num_split is 1 here, which avoids dividing by num_gpu == 0
                batch_train = iterator_train.get_next()
                shape_x = [batch_size // num_split] + batch_train[0].get_shape().as_list()[1:]
                shape_y = [batch_size // num_split] + batch_train[1].get_shape().as_list()[1:]

                batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
                batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
                batch_train = [batch_train_x, batch_train_y]
            else:
                batch_train = iterator_train.get_next()
            else:
                batch_train = iterator_train.get_next()

            nets.append(net(batch_train[0], batch_train[1], is_training=True))

            tower_losses.append(nets[-1].loss)
            tower_errors.append(nets[-1].error)

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            nets[-1].total_parameters()
            nets[-1].total_MACs()

            loss_worker = nets[-1].loss + nets[-1].get_l2_loss()
            tower_grads.append(
                optimizer.compute_gradients(loss_worker,
                                            colocate_gradients_with_ops=True))

            print('Testing model on CPU')
            tf.get_variable_scope().reuse_variables()

            batch_test = iterator_test.get_next()
            nets.append(net(batch_test[0], batch_test[1], is_training=False))
            error_batch_test = nets[-1].error

    with tf.device('/cpu:0' if is_cpu_ps else worker):
        grad_batch_train = aggregate_gradients(tower_grads)
        loss_batch_train = aggregate_statistics(tower_losses)
        error_batch_train = aggregate_statistics(tower_errors)

        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grad_batch_train,
                                                 global_step=learning_step)

    ####################################################################################################

    sess = get_session(gpu_list)
    saver = tf.train.Saver(max_to_keep=None)

    def evaluate():
        error_test = 0.
        for _ in tqdm(range(num_batch_test),
                      desc='Test',
                      leave=False,
                      smoothing=0.1):
            error_test += sess.run(error_batch_test)
        return error_test / num_batch_test

    def load_model(path):
        print('Loading model from', path)
        saver.restore(sess, path)

    def save_model(path):
        saver.save(sess, path)
        print('S', end='')

    if path_load is not None:
        load_model(path_load)
        error_test_best = evaluate()
        print('Test: %.4f' % error_test_best)

    if mode == 'test':
        exit(0)

    if mode == 'export':
        vars_list = get_variable('batchnorm/gamma:')
        vars_numpy = sess.run(vars_list)
        export(vars_numpy, 'gamma')
        exit(0)

    if mode == 'restart':
        sess.run(epoch_step.assign(90))

    print_line()

    ####################################################################################################

    while True:
        # update learning rate
        lr_epoch = sess.run(lr_decay)
        if lr_epoch <= 0:
            break
        epoch = sess.run(epoch_step)
        print('Epoch: %03d' % epoch, end=' ')

        loss_epoch = 0.
        error_epoch = 0.
        t0 = get_time()
        for batch in tqdm(range(num_batch_train),
                          desc='Epoch: %03d' % epoch,
                          leave=False,
                          smoothing=0.1):

            if mode == 'debug':
                print('DEBUG: ', end='')
                _, loss_delta, error_delta, H, W, gradsH, gradsW, label_ = sess.run(
                    [
                        train_op, loss_batch_train, error_batch_train,
                        nets[0].H, nets[0].W, nets[0].grads_H, nets[0].grads_W,
                        nets[0].y
                    ])
            else:
                _, loss_delta, error_delta = sess.run(
                    [train_op, loss_batch_train, error_batch_train])

            loss_epoch += loss_delta
            error_epoch += error_delta

        print('Loss: %.6f Train: %.4f' %
              (loss_epoch / num_batch_train, error_epoch / num_batch_train),
              end=' ')
        FPS = num_batch_train * batch_size / (get_time() - t0)

        error_test = evaluate()
        assert error_test > 1e-4, (
            'Invalid test error %f, something went wrong' % error_test)
        print('Test: %.4f lr: %.4f FPS: %d' % (error_test, lr_epoch, FPS),
              end=' ')

        sess.run(epoch_step.assign(epoch + 1))

        if epoch == 1:
            error_test_best = min(error_test, 0.9)
        if error_test < error_test_best:
            print('B', end=' ')
            if path_save is not None:
                save_model(path_save)
            error_test_best = error_test

        print('')

    print_line()

    ####################################################################################################

    sess.close()
    print('Optimization ended at ' + get_time('%y-%m-%d %X'))
    return 0