Example #1
def Ada_Mom(loss, parameter_list):

    # AdaGrad-style accumulation of squared gradients combined with momentum.
    # last_n1 and last_momentum1 are module-level lists that carry state across calls.
    mu = 0.9  # momentum coefficient
    opt = GradientDescentOptimizer(1e-3)
    grads_and_vars = opt.compute_gradients(loss, parameter_list)
    capped_grads_and_vars = []
    middle = []  # per-variable accumulator values n, stored for the next call

    for i in range(len(grads_and_vars)):
        gradient = grads_and_vars[i][0]
        variable = grads_and_vars[i][1]
        if len(last_n1) != 0:
            n = tf.multiply(gradient, gradient) + last_n1[i]
            momentum = mu * last_momentum1[i] + gradient / (tf.sqrt(n) + 0.001)
        else:
            n = tf.multiply(gradient, gradient)
            momentum = gradient / (tf.sqrt(n) + 0.001)
        middle.append(n)
        capped_grads_and_vars.append((momentum, variable))

    if len(last_momentum1) != 0:
        for i in range(len(grads_and_vars)):
            last_momentum1[i] = capped_grads_and_vars[i][0]
    else:
        for i in range(len(grads_and_vars)):
            last_momentum1.append(capped_grads_and_vars[i][0])

    if len(last_n1) != 0:
        for i in range(len(grads_and_vars)):
            last_n1[i] = middle[i]
    else:
        for i in range(len(grads_and_vars)):
            last_n1.append(middle[i])

    return opt.apply_gradients(capped_grads_and_vars)
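The function above assumes that tf (TensorFlow 1.x) and GradientDescentOptimizer are available at module level and that the two state lists start out empty. A minimal, hypothetical wiring sketch; the names and values below are illustrative, not from the original script:

import tensorflow as tf

GradientDescentOptimizer = tf.train.GradientDescentOptimizer

last_n1, last_momentum1 = [], []  # module-level state reused across calls to Ada_Mom

# loss = ...                                    # any scalar loss tensor
# train_step = Ada_Mom(loss, tf.trainable_variables())
# sess.run(train_step, feed_dict={...})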
Example #2
def RMSProp(loss, parameter_list):
    # RMSProp-style update; last_n2 is a module-level list holding the running
    # second-moment estimate across calls.

    opt = GradientDescentOptimizer(1e-3)
    grads_and_vars = opt.compute_gradients(loss, parameter_list)
    capped_grads_and_vars = []
    middle = []  # per-variable n values, stored for the next call

    for i in range(len(grads_and_vars)):
        gradient = grads_and_vars[i][0]
        variable = grads_and_vars[i][1]
        if len(last_n2) != 0:
            n = 0.8 * tf.multiply(gradient, gradient) + 0.2 * last_n2[i]
        else:
            n = tf.multiply(gradient, gradient)
        middle.append(n)
        momentum = gradient / (tf.sqrt(n) + 0.001)
        capped_grads_and_vars.append((momentum, variable))

    if len(last_n2) != 0:
        for i in range(len(grads_and_vars)):
            last_n2[i] = middle[i]
    else:
        for i in range(len(grads_and_vars)):
            last_n2.append(middle[i])

    return opt.apply_gradients(capped_grads_and_vars)
Example #3
def RMSProp_BB2(loss, parameter_list, learning_rate2):
    # Same RMSProp-style update as above, but with an externally supplied step
    # size (learning_rate2); last_n4 is the module-level accumulator list.

    opt = GradientDescentOptimizer(learning_rate2)
    grads_and_vars = opt.compute_gradients(loss, parameter_list)
    capped_grads_and_vars = []
    middle = []

    for i in range(len(grads_and_vars)):
        gradient = grads_and_vars[i][0]
        variable = grads_and_vars[i][1]
        if len(last_n4) != 0:
            n = 0.8 * tf.multiply(gradient, gradient) + 0.2 * last_n4[i]
            middle.append(n)
            momentum = gradient / (tf.sqrt(n) + 0.001)
            capped_grads_and_vars.append((momentum, variable))
        else:
            n = tf.multiply(gradient, gradient)
            middle.append(n)
            momentum = gradient / (tf.sqrt(n) + 0.001)
            capped_grads_and_vars.append((momentum, variable))

    if len(last_n4) != 0:
        for i in range(len(capped_grads_and_vars)):
            last_n4[i] = middle[i]
    else:
        for i in range(len(capped_grads_and_vars)):
            last_n4.append(middle[i])

    return opt.apply_gradients(capped_grads_and_vars)
Example #4
def RMSProp_Mom(loss, parameter_list):

    mu = 0.9  # momentum coefficient
    opt = GradientDescentOptimizer(1e-3)
    grads_and_vars = opt.compute_gradients(loss, parameter_list)
    capped_grads_and_vars = []
    middle = []

    for i in range(len(grads_and_vars)):
        gradient = grads_and_vars[i][0]
        variable = grads_and_vars[i][1]
        if len(last_n2) != 0:
            n = 0.8 * tf.multiply(gradient, gradient) + 0.2 * last_n2[i]
            middle.append(n)
            momentum = mu * last_momentum2[i] + gradient / (tf.sqrt(n) + 0.001)
            capped_grads_and_vars.append((momentum, variable))
        else:
            n = tf.multiply(gradient, gradient)
            middle.append(n)
            momentum = gradient / (tf.sqrt(n) + 0.001)
            capped_grads_and_vars.append((momentum, variable))

    if len(last_momentum2) != 0:
        for i in range(len(capped_grads_and_vars)):
            last_momentum2[i] = capped_grads_and_vars[i][0]
    else:
        for i in range(len(capped_grads_and_vars)):
            last_momentum2.append(capped_grads_and_vars[i][0])

    if len(last_n2) != 0:
        for i in range(len(capped_grads_and_vars)):
            last_n2[i] = middle[i]
    else:
        for i in range(len(capped_grads_and_vars)):
            last_n2.append(middle[i])

    return opt.apply_gradients(capped_grads_and_vars)
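For reference, a minimal NumPy sketch (not from the original) of the per-parameter step that Example #4 builds; Examples #2 and #3 use the same rule without the momentum term, and Example #1 sums the squared gradients instead of decaying them. The 0.8/0.2 decay, 0.9 momentum, and 1e-3 stabilizer are taken from the code above:

import numpy as np

def rmsprop_mom_step(g, n_prev, m_prev, decay=0.8, mu=0.9, eps=1e-3):
    """One step of the RMSProp-with-momentum rule used above (NumPy sketch)."""
    n = decay * g * g + (1.0 - decay) * n_prev  # running estimate of the squared gradient
    m = mu * m_prev + g / (np.sqrt(n) + eps)    # momentum applied to the scaled gradient
    return m, n                                 # m is the value handed to apply_gradients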
Example #5
def model_train(para):
    sess = tf.Session()
    tf.set_random_seed(random_seed)
    n = len(layers)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    std = para['std']

    w, b = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        w[i] = weight_variable([layers[i - 1], layers[i]])
        b[i] = bias_variable([layers[i]])

    # model with noise
    z, h = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(n):
        if i == 0:
            z[i] = x
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            z[i] = tf.clip_by_value(z[i], 0, 1)
            h[i] = z[i]
        if i > 0 and i < n - 1:
            z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
            #z[i] = tf.clip_by_norm(z[i], 1, axes = 1)
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            h[i] = tf.nn.relu(z[i])
        if i == n - 1:
            z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
            #z[i] = tf.clip_by_norm(z[i], 1000, axes = 1)
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            h[i] = z[i]
    y = h[n - 1]

    w_sum = tf.constant(0, dtype='float32')
    for i in range(1, n):
        w_sum += tf.reduce_sum(tf.square(w[i]))

    # gradient descent
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=y))
    gw, gb = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        gw[i] = tf.gradients(loss, w[i])[0]
        gb[i] = tf.gradients(loss, b[i])[0]
    opt = GradientDescentOptimizer(learning_rate=learning_rate)
    gradients = []
    for i in range(1, n):
        gradients.append((gw[i], w[i]))
        gradients.append((gb[i], b[i]))
    train_step = opt.apply_gradients(gradients)

    # model without noise
    z2, h2 = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(n):
        if i == 0:
            z2[i] = x
            h2[i] = z2[i]
        if i > 0 and i < n - 1:
            z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
            h2[i] = tf.nn.relu(z2[i])
        if i == n - 1:
            z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
            h2[i] = z2[i]
    y2 = h2[n - 1]

    # attack
    x_adv = attack.fgsm(x, y2, eps=0.3, clip_min=0, clip_max=1)

    # evaluation
    acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y2, 1), tf.argmax(label, 1)), tf.float32))

    # data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_adv_mnist_fsgm.npy'))

    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch[0], label: batch[1]})
            if t % int(1 / sample_rate) == 0:
                epoch = int(t / int(1 / sample_rate))

                x_adv_sample = sess.run(x_adv,
                                        feed_dict={
                                            x: mnist.test.images,
                                            label: mnist.test.labels
                                        })
                acc_benign = sess.run(acc,
                                      feed_dict={
                                          x: mnist.test.images,
                                          label: mnist.test.labels
                                      })
                acc_adv = sess.run(acc,
                                   feed_dict={
                                       x: x_adv_sample,
                                       label: mnist.test.labels
                                   })
                acc_pre_adv = sess.run(acc,
                                       feed_dict={
                                           x: x_adv_mnist_fsgm,
                                           label: mnist.test.labels
                                       })
                print(epoch, acc_benign, acc_adv, acc_pre_adv)
                check = tf.reduce_mean(tf.norm(y2, axis=1))
                print(
                    sess.run([check],
                             feed_dict={
                                 x: mnist.test.images,
                                 label: mnist.test.labels
                             }))
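Example #5 (and Example #7 below) calls attack.fgsm, which is imported from elsewhere in the project. A minimal TF1-style sketch of the standard fast gradient sign method such a helper typically implements; this is an assumption for illustration, not the project's actual code:

def fgsm(x, logits, eps=0.3, clip_min=0.0, clip_max=1.0):
    # Use the model's own predictions as labels so that true labels are not needed.
    y = tf.stop_gradient(tf.nn.softmax(logits))
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
    grad, = tf.gradients(loss, x)
    x_adv = x + eps * tf.sign(grad)  # one signed-gradient step
    return tf.clip_by_value(x_adv, clip_min, clip_max)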
Example #6
def main(_):
    clip_bound = 0.01  # 'the clip bound of the gradients'
    clip_bound_2 = 1 / 1.5  # 'the clip bound for r_kM'

    small_num = 1e-5  # 'a small number'
    large_num = 1e5  # 'a large number'
    num_images = 60000  # 'number of images N'

    batch_size = 600  # 'batch_size L'
    sample_rate = 600 / 60000  # 'sample rate q = L / N'
    num_steps = 160000  # 'number of steps T = E * N / L = E / q'
    num_epoch = 24  # 'number of epoches E'

    sigma = 5  # 'sigma'
    delta = 1e-5  # 'delta'

    lambd = 1e3  # 'exponential distribution parameter'

    iterative_clip_step = 2  # 'iterative_clip_step'

    clip = 1  # 'whether to clip the gradient'
    noise = 0  # 'whether to add noise'
    redistribute = 0  # 'whether to redistribute the noise'

    D = 60000
    # from tensorflow.examples.tutorials.mnist import input_data
    # mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    sess = tf.InteractiveSession()

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    x_image = tf.reshape(x, [-1, 28, 28, 1])
    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    h_pool1 = max_pool_2x2(h_conv1)

    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    W_fc1 = weight_variable([7 * 7 * 64, 25])
    b_fc1 = bias_variable([25])
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    keep_prob = tf.placeholder(tf.float32)
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

    W_fc2 = weight_variable([25, 10])
    b_fc2 = bias_variable([10])
    y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    d = 25 * 10 + 25 * 7 * 7 * 64 + 5 * 5 * 32 * 64 + 5 * 5 * 32
    # number of parameters
    M = d

    priv_accountant = accountant.GaussianMomentsAccountant(D)
    privacy_accum_op = priv_accountant.accumulate_privacy_spending(
        [None, None], sigma, batch_size)

    #sess.run(tf.initialize_all_variables())
    sess.run(tf.global_variables_initializer())

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
    # train_step = tf.train.AdamOptimizer(1e-5).minimize(cross_entropy)

    opt = GradientDescentOptimizer(learning_rate=1e-2)

    #compute gradient
    gw_W1 = tf.gradients(cross_entropy, W_conv1)[0]  # gradient of W1
    gb1 = tf.gradients(cross_entropy, b_conv1)[0]  # gradient of b1

    gw_W2 = tf.gradients(cross_entropy, W_conv2)[0]  # gradient of W2
    gb2 = tf.gradients(cross_entropy, b_conv2)[0]  # gradient of b2

    gw_Wf1 = tf.gradients(cross_entropy, W_fc1)[0]  # gradient of W_fc1
    gbf1 = tf.gradients(cross_entropy, b_fc1)[0]  # gradient of b_fc1

    gw_Wf2 = tf.gradients(cross_entropy, W_fc2)[0]  # gradient of W_fc2
    gbf2 = tf.gradients(cross_entropy, b_fc2)[0]  # gradient of b_fc2

    #clip gradient
    gw_W1 = tf.clip_by_norm(gw_W1, clip_bound)
    gw_W2 = tf.clip_by_norm(gw_W2, clip_bound)
    gw_Wf1 = tf.clip_by_norm(gw_Wf1, clip_bound)
    gw_Wf2 = tf.clip_by_norm(gw_Wf2, clip_bound)

    #sigma = FLAGS.sigma # when comp_eps(lmbda,q,sigma,T,delta)==epsilon

    # sensitivity = 2 * FLAGS.clip_bound  # adjacent databases differ in one tuple
    sensitivity = clip_bound  # adjacent databases differ by one added tuple

    gw_W1 += tf.random_normal(shape=tf.shape(gw_W1),
                              mean=0.0,
                              stddev=(sigma * sensitivity)**2,
                              dtype=tf.float32)
    gb1 += tf.random_normal(shape=tf.shape(gb1),
                            mean=0.0,
                            stddev=(sigma * sensitivity)**2,
                            dtype=tf.float32)
    gw_W2 += tf.random_normal(shape=tf.shape(gw_W2),
                              mean=0.0,
                              stddev=(sigma * sensitivity)**2,
                              dtype=tf.float32)
    gb2 += tf.random_normal(shape=tf.shape(gb2),
                            mean=0.0,
                            stddev=(sigma * sensitivity)**2,
                            dtype=tf.float32)
    gw_Wf1 += tf.random_normal(shape=tf.shape(gw_Wf1),
                               mean=0.0,
                               stddev=(sigma * sensitivity)**2,
                               dtype=tf.float32)
    gbf1 += tf.random_normal(shape=tf.shape(gbf1),
                             mean=0.0,
                             stddev=(sigma * sensitivity)**2,
                             dtype=tf.float32)
    gw_Wf2 += tf.random_normal(shape=tf.shape(gw_Wf2),
                               mean=0.0,
                               stddev=(sigma * sensitivity)**2,
                               dtype=tf.float32)
    gbf2 += tf.random_normal(shape=tf.shape(gbf2),
                             mean=0.0,
                             stddev=(sigma * sensitivity)**2,
                             dtype=tf.float32)

    train_step = opt.apply_gradients([(gw_W1, W_conv1), (gb1, b_conv1),
                                      (gw_W2, W_conv2), (gb2, b_conv2),
                                      (gw_Wf1, W_fc1), (gbf1, b_fc1),
                                      (gw_Wf2, W_fc2), (gbf2, b_fc2)])

    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    start_time = time.time()
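    # NOTE: `mnist` is assumed to be loaded elsewhere (see the commented-out
    # input_data.read_data_sets call near the top of this example).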
    for i in range(num_steps):
        batch = mnist.train.next_batch(batch_size)
        train_step.run(feed_dict={
            x: batch[0],
            y_: batch[1],
            keep_prob: 0.5
        })

        if i % 100 == 0:
            #train_accuracy = accuracy.eval(feed_dict={x:batch[0], y_: batch[1], keep_prob: 1.0});
            #print("step \t %d \t training accuracy \t %g"%(i, train_accuracy));
            print("step \t %d \t test accuracy \t %g" %
                  (i,
                   accuracy.eval(feed_dict={
                       x: mnist.test.images,
                       y_: mnist.test.labels,
                       keep_prob: 1.0
                   })))
            #epsilon = comp_eps(32, sample_rate, sigma, i, delta)
            #print("epsilon: {}".format(epsilon))
        sess.run([privacy_accum_op])
        spent_eps_deltas = priv_accountant.get_privacy_spent(
            sess, target_eps=target_eps)
        #print(i, spent_eps_deltas)
        _break = False
        for _eps, _delta in spent_eps_deltas:
            if _delta >= delta:
                _break = True
                break
        if _break:
            break
    duration = time.time() - start_time
    print("test accuracy %g" % accuracy.eval(feed_dict={
        x: mnist.test.images,
        y_: mnist.test.labels,
        keep_prob: 1.0
    }))
    print(float(duration))
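Examples #5 through #8 call weight_variable, bias_variable, conv2d, and max_pool_2x2 without defining them. A plausible TF1-style sketch of these helpers, in the spirit of the standard MNIST tutorial (an assumption, not the project's actual code); the 'SAME' padding and 2x2 pooling are consistent with the 7 * 7 * 64 flatten used above:

def weight_variable(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')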
Example #7
def model_train(para):
    sigma = compute_sigma(para['eps'], para['delta'])
    std = sigma * para['sens']

    sess = tf.Session()
    tf.set_random_seed(random_seed)
    n = len(layers)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label

    w, b = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        w[i] = weight_variable([layers[i - 1], layers[i]])
        b[i] = bias_variable([layers[i]])

    # noisy model
    z, h = [0 for i in range(n)], [0 for i in range(n)]
    h[0] = x
    h[0] = h[0] + tf.random_normal(
        shape=tf.shape(h[0]), mean=0.0, stddev=std, dtype=tf.float32)
    for i in range(1, n):
        z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
        if i < n - 1:
            h[i] = tf.nn.relu(z[i])
        else:
            h[i] = z[i]
    y = h[n - 1]
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=y))

    # noiseless model
    z2, h2 = [0 for i in range(n)], [0 for i in range(n)]
    h2[0] = x
    for i in range(1, n):
        z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
        if i < n - 1:
            h2[i] = tf.nn.relu(z2[i])
        else:
            h2[i] = z2[i]
    y2 = h2[n - 1]

    x_adv = attack.fgsm(x, y, eps=0.3, clip_min=0, clip_max=1)

    # gradient descent
    gw, gb = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        gw[i] = tf.gradients(loss, w[i])[0]
        gb[i] = tf.gradients(loss, b[i])[0]
    opt = GradientDescentOptimizer(learning_rate=learning_rate)
    gradients = []
    for i in range(1, n):
        gradients.append((gw[i], w[i]))
        gradients.append((gb[i], b[i]))
    train_step = opt.apply_gradients(gradients)

    # evaluation
    acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(label, 1)), tf.float32))

    # data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_adv_mnist_fsgm.npy'))

    print('sigma: {:.3f}, std: {:.3f}'.format(sigma, std))
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        for t in range(steps):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch[0], label: batch[1]})
            if t % int(1 / sample_rate) == 0 or t == steps - 1:
                if t < steps - 1:
                    epoch = int(t / int(1 / sample_rate))
                else:
                    epoch = epochs

                x_adv_sample = sess.run(x_adv,
                                        feed_dict={
                                            x: mnist.test.images,
                                            label: mnist.test.labels
                                        })
                acc_benign = sess.run(acc,
                                      feed_dict={
                                          x: mnist.test.images,
                                          label: mnist.test.labels
                                      })
                acc_adv = sess.run(acc,
                                   feed_dict={
                                       x: x_adv_sample,
                                       label: mnist.test.labels
                                   })
                acc_pre_adv = sess.run(acc,
                                       feed_dict={
                                           x: x_adv_mnist_fsgm,
                                           label: mnist.test.labels
                                       })

                print(epoch, acc_benign, acc_adv, acc_pre_adv)
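Example #7 calls compute_sigma(eps, delta), which is not shown in this snippet. A minimal sketch assuming the classical Gaussian-mechanism calibration; the same sqrt(2 * ln(1.25 / delta)) constant appears in Example #8's dp_mult, but the project's actual implementation may differ:

import math

def compute_sigma(eps, delta):
    # Noise multiplier such that adding N(0, (sigma * sensitivity)^2) noise to a
    # query with the given L2 sensitivity satisfies (eps, delta)-DP.
    return math.sqrt(2.0 * math.log(1.25 / delta)) / eps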
Example #8
def train(fgsm_eps, _dp_epsilon, _attack_norm_bound, log_filename, ratio):
    FLAGS = None

    #ratio = 16
    #target_eps = [0.125,0.25,0.5,1,2,4,8]
    #target_eps = [0.25 + 0.25*ratio]
    target_eps = [0.2 + 0.2 * ratio]
    #print(target_eps[0])
    #fgsm_eps = 0.1
    dp_epsilon = _dp_epsilon
    image_size = 28
    _log_filename = log_filename + str(target_eps[0]) + '_fgsm_' + str(
        fgsm_eps) + '_dpeps_' + str(dp_epsilon) + '_attack_norm_bound_' + str(
            _attack_norm_bound) + '.txt'

    clip_bound = 0.001  # 'the clip bound of the gradients'
    clip_bound_2 = 1 / 1.5  # 'the clip bound for r_kM'

    small_num = 1e-5  # 'a small number'
    large_num = 1e5  # 'a large number'
    num_images = 50000  # 'number of images N'

    batch_size = 125  # 'batch_size L'
    sample_rate = batch_size / 50000  # 'sample rate q = L / N'
    # 900 epochs
    num_steps = 1800000  # 'number of steps T = E * N / L = E / q'
    num_epoch = 24  # 'number of epoches E'

    sigma = 5  # 'sigma'
    delta = 1e-5  # 'delta'

    lambd = 1e3  # 'exponential distribution parameter'

    iterative_clip_step = 2  # 'iterative_clip_step'

    clip = 1  # 'whether to clip the gradient'
    noise = 0  # 'whether to add noise'
    redistribute = 0  # 'whether to redistribute the noise'

    D = 50000

    sess = tf.InteractiveSession()

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)

    W_conv1 = weight_variable([5, 5, 1, 32])
    b_conv1 = bias_variable([32])
    W_conv2 = weight_variable([5, 5, 32, 64])
    b_conv2 = bias_variable([64])
    W_fc1 = weight_variable([7 * 7 * 64, 25])
    b_fc1 = bias_variable([25])
    W_fc2 = weight_variable([25, 10])
    b_fc2 = bias_variable([10])

    def inference(x, dp_mult):
        x_image = tf.reshape(x, [-1, 28, 28, 1])
        h_conv1 = tf.nn.relu((conv2d(x_image, W_conv1) + b_conv1) + dp_mult)
        h_pool1 = max_pool_2x2(h_conv1)
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

        y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
        return y_conv, h_conv1

    def inference_prob(x):
        logits, _ = inference(x, 0)
        y_prob = tf.nn.softmax(logits)
        return y_prob

    shape = W_conv1.get_shape().as_list()
    w_t = tf.reshape(W_conv1, [-1, shape[-1]])
    w = tf.transpose(w_t)
    sing_vals = tf.svd(w, compute_uv=False)
    sensitivityW = tf.reduce_max(sing_vals)
    dp_delta = 0.05
    attack_norm_bound = _attack_norm_bound
    dp_mult = attack_norm_bound * math.sqrt(
        2 * math.log(1.25 / dp_delta)) / dp_epsilon
    noise = tf.placeholder(tf.float32, [None, 28, 28, 32])

    #y_conv, h_conv1 = inference(x, dp_mult * noise)
    y_conv, h_conv1 = inference(x, attack_norm_bound * noise)
    softmax_y = tf.nn.softmax(y_conv)
    # Define loss and optimizer

    priv_accountant = accountant.GaussianMomentsAccountant(D)
    privacy_accum_op = priv_accountant.accumulate_privacy_spending(
        [None, None], sigma, batch_size)

    # sess.run(tf.initialize_all_variables())
    sess.run(tf.global_variables_initializer())

    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
    # train_step = tf.train.AdamOptimizer(1e-5).minimize(cross_entropy)

    # noise redistribution #
    grad, = tf.gradients(cross_entropy, h_conv1)
    normalized_grad = tf.sign(grad)
    normalized_grad = tf.stop_gradient(normalized_grad)
    normalized_grad_r = tf.abs(tf.reduce_mean(normalized_grad, axis=(0)))
    #print(normalized_grad_r)
    sum_r = tf.reduce_sum(normalized_grad_r, axis=(0, 1, 2), keepdims=False)
    #print(sum_r)
    normalized_grad_r = 256 * 32 * normalized_grad_r / sum_r
    print(normalized_grad_r)

    shape_grad = normalized_grad_r.get_shape().as_list()
    grad_t = tf.reshape(normalized_grad_r, [-1, shape_grad[-1]])
    g = tf.transpose(grad_t)
    sing_g_vals = tf.svd(g, compute_uv=False)
    sensitivity_2 = tf.reduce_max(sing_g_vals)
    ########################

    opt = GradientDescentOptimizer(learning_rate=1e-1)

    # compute gradient
    gw_W1 = tf.gradients(cross_entropy, W_conv1)[0]  # gradient of W1
    gb1 = tf.gradients(cross_entropy, b_conv1)[0]  # gradient of b1

    gw_W2 = tf.gradients(cross_entropy, W_conv2)[0]  # gradient of W2
    gb2 = tf.gradients(cross_entropy, b_conv2)[0]  # gradient of b2

    gw_Wf1 = tf.gradients(cross_entropy, W_fc1)[0]  # gradient of W_fc1
    gbf1 = tf.gradients(cross_entropy, b_fc1)[0]  # gradient of b_fc1

    gw_Wf2 = tf.gradients(cross_entropy, W_fc2)[0]  # gradient of W_fc2
    gbf2 = tf.gradients(cross_entropy, b_fc2)[0]  # gradient of b_fc2

    # clip gradient
    gw_W1 = tf.clip_by_norm(gw_W1, clip_bound)
    gw_W2 = tf.clip_by_norm(gw_W2, clip_bound)
    gw_Wf1 = tf.clip_by_norm(gw_Wf1, clip_bound)
    gw_Wf2 = tf.clip_by_norm(gw_Wf2, clip_bound)

    # sigma = FLAGS.sigma # when comp_eps(lmbda,q,sigma,T,delta)==epsilon

    # sensitivity = 2 * FLAGS.clip_bound  # adjacent databases differ in one tuple
    sensitivity = clip_bound  # adjacent databases differ by one added tuple

    gw_W1 += tf.random_normal(shape=tf.shape(gw_W1),
                              mean=0.0,
                              stddev=(sigma * sensitivity)**2,
                              dtype=tf.float32)
    gb1 += tf.random_normal(shape=tf.shape(gb1),
                            mean=0.0,
                            stddev=(sigma * sensitivity)**2,
                            dtype=tf.float32)
    gw_W2 += tf.random_normal(shape=tf.shape(gw_W2),
                              mean=0.0,
                              stddev=(sigma * sensitivity)**2,
                              dtype=tf.float32)
    gb2 += tf.random_normal(shape=tf.shape(gb2),
                            mean=0.0,
                            stddev=(sigma * sensitivity)**2,
                            dtype=tf.float32)
    gw_Wf1 += tf.random_normal(shape=tf.shape(gw_Wf1),
                               mean=0.0,
                               stddev=(sigma * sensitivity)**2,
                               dtype=tf.float32)
    gbf1 += tf.random_normal(shape=tf.shape(gbf1),
                             mean=0.0,
                             stddev=(sigma * sensitivity)**2,
                             dtype=tf.float32)
    gw_Wf2 += tf.random_normal(shape=tf.shape(gw_Wf2),
                               mean=0.0,
                               stddev=(sigma * sensitivity)**2,
                               dtype=tf.float32)
    gbf2 += tf.random_normal(shape=tf.shape(gbf2),
                             mean=0.0,
                             stddev=(sigma * sensitivity)**2,
                             dtype=tf.float32)

    train_step = opt.apply_gradients([(gw_W1, W_conv1), (gb1, b_conv1),
                                      (gw_W2, W_conv2), (gb2, b_conv2),
                                      (gw_Wf1, W_fc1), (gbf1, b_fc1),
                                      (gw_Wf2, W_fc2), (gbf2, b_fc2)])

    # craft adversarial samples from x for testing
    #softmax_y_test = tf.nn.softmax(y_conv)

    #====================== attack =========================

    attack_switch = {
        'fgsm': True,
        'ifgsm': True,
        'deepfool': False,
        'mim': True,
        'spsa': False,
        'cwl2': False,
        'madry': True,
        'stm': False
    }

    # define cleverhans abstract models for using cleverhans attacks
    ch_model_logits = CallableModelWrapper(callable_fn=inference,
                                           output_layer='logits')
    ch_model_probs = CallableModelWrapper(callable_fn=inference_prob,
                                          output_layer='probs')

    # define each attack method's tensor
    attack_tensor_dict = {}
    # FastGradientMethod
    if attack_switch['fgsm']:
        print('creating attack tensor of FastGradientMethod')
        fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
        x_adv_test_fgsm = fgsm_obj.generate(x=x,
                                            eps=fgsm_eps,
                                            clip_min=0.0,
                                            clip_max=1.0)  # testing now
        attack_tensor_dict['fgsm'] = x_adv_test_fgsm

    # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init)
    # default: eps_iter=0.05, nb_iter=10
    if attack_switch['ifgsm']:
        print('creating attack tensor of BasicIterativeMethod')
        ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess)
        x_adv_test_ifgsm = ifgsm_obj.generate(x=x,
                                              eps=fgsm_eps,
                                              eps_iter=fgsm_eps / 10,
                                              nb_iter=10,
                                              clip_min=0.0,
                                              clip_max=1.0)
        attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm

    # MomentumIterativeMethod
    # default: eps_iter=0.06, nb_iter=10
    if attack_switch['mim']:
        print('creating attack tensor of MomentumIterativeMethod')
        mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess)
        x_adv_test_mim = mim_obj.generate(x=x,
                                          eps=fgsm_eps,
                                          eps_iter=fgsm_eps / 10,
                                          nb_iter=10,
                                          decay_factor=1.0,
                                          clip_min=0.0,
                                          clip_max=1.0)
        attack_tensor_dict['mim'] = x_adv_test_mim

    # MadryEtAl (projected gradient with random init, same as rand+fgsm)
    # default: eps_iter=0.01, nb_iter=40
    if attack_switch['madry']:
        print('creating attack tensor of MadryEtAl')
        madry_obj = MadryEtAl(model=ch_model_probs, sess=sess)
        x_adv_test_madry = madry_obj.generate(x=x,
                                              eps=fgsm_eps,
                                              eps_iter=fgsm_eps / 10,
                                              nb_iter=10,
                                              clip_min=0.0,
                                              clip_max=1.0)
        attack_tensor_dict['madry'] = x_adv_test_madry

    #====================== attack =========================

    #Define the correct prediction and accuracy#
    correct_prediction_x = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32))

    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    s = math.log(sqrt(2.0 / math.pi) * 1e+5)
    sigmaEGM = sqrt(2.0) * 1.0 * (sqrt(s) +
                                  sqrt(s + dp_epsilon)) / (2.0 * dp_epsilon)
    print(sigmaEGM)
    __noiseE = np.random.normal(0.0, sigmaEGM**2,
                                28 * 28 * 32).astype(np.float32)
    __noiseE = np.reshape(__noiseE, [-1, 28, 28, 32])

    start_time = time.time()
    logfile = open(_log_filename, 'w')
    last_eval_time = -1
    accum_time = 0
    accum_epoch = 0
    max_benign_acc = -1
    max_adv_acc_dict = {}
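    # NOTE: `mnist` is assumed to be loaded elsewhere in the surrounding script,
    # e.g. via input_data.read_data_sets, as in the earlier examples.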
    test_size = len(mnist.test.images)
    print("Computing The Noise Redistribution Vector")
    for i in range(4000):
        batch = mnist.train.next_batch(batch_size)
        sess.run([train_step],
                 feed_dict={
                     x: batch[0],
                     y_: batch[1],
                     keep_prob: 0.5,
                     noise: __noiseE * 0
                 })
    batch = mnist.train.next_batch(batch_size * 10)
    grad_redis = sess.run([normalized_grad_r],
                          feed_dict={
                              x: batch[0],
                              y_: batch[1],
                              keep_prob: 1.0,
                              noise: __noiseE * 0
                          })
    #print(grad_redis)
    _sensitivity_2 = sess.run([sensitivity_2],
                              feed_dict={
                                  x: batch[0],
                                  y_: batch[1],
                                  keep_prob: 1.0,
                                  noise: __noiseE * 0
                              })
    #print(_sensitivity_2)

    _sensitivityW = sess.run(sensitivityW)
    #print(_sensitivityW)
    Delta_redis = _sensitivityW / sqrt(_sensitivity_2[0])
    #print(Delta_redis)
    sigmaHGM = sqrt(2.0) * Delta_redis * (sqrt(s) + sqrt(s + dp_epsilon)) / (
        2.0 * dp_epsilon)
    #print(sigmaHGM)
    __noiseH = np.random.normal(0.0, sigmaHGM**2,
                                28 * 28 * 32).astype(np.float32)
    __noiseH = np.reshape(__noiseH, [-1, 28, 28, 32]) * grad_redis

    sess.run(tf.global_variables_initializer())
    print("Training")
    for i in range(num_steps):
        batch = mnist.train.next_batch(batch_size)
        sess.run(
            [train_step],
            feed_dict={
                x: batch[0],
                y_: batch[1],
                keep_prob: 0.5,
                noise: (__noiseE + __noiseH) / 2
            })
        sess.run([privacy_accum_op])
        spent_eps_deltas = priv_accountant.get_privacy_spent(
            sess, target_eps=target_eps)
        if i % 1000 == 0:
            print(i, spent_eps_deltas)
        _break = False
        for _eps, _delta in spent_eps_deltas:
            if _delta >= delta:
                _break = True
                break
        if _break:
            break
    print("Testing")
    benign_acc = accuracy_x.eval(
        feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels,
            keep_prob: 1.0,
            noise: (__noiseE + __noiseH) / 2
        })
    ### PixelDP Robustness ###
    adv_acc_dict = {}
    robust_adv_acc_dict = {}
    robust_adv_utility_dict = {}
    for atk in attack_switch.keys():
        if atk not in adv_acc_dict:
            adv_acc_dict[atk] = -1
            robust_adv_acc_dict[atk] = -1
            robust_adv_utility_dict[atk] = -1

        if attack_switch[atk]:
            adv_images_dict = sess.run(attack_tensor_dict[atk],
                                       feed_dict={
                                           x: mnist.test.images,
                                           y_: mnist.test.labels,
                                           keep_prob: 1.0
                                       })
            #grad_redis = sess.run([normalized_grad_r], feed_dict={x: adv_images_dict, y_: mnist.test.labels, keep_prob: 1.0, noise:__noise})
            ### Robustness ###
            predictions_form_argmax = np.zeros([test_size, 10])
            softmax_predictions = softmax_y.eval(
                feed_dict={
                    x: adv_images_dict,
                    keep_prob: 1.0,
                    noise: (__noiseE + __noiseH) / 2
                })
            argmax_predictions = np.argmax(softmax_predictions, axis=1)
            for n_draws in range(0, 2000):
                if n_draws % 1000 == 0:
                    print(n_draws)
                _noiseE = np.random.normal(0.0, sigmaEGM**2,
                                           28 * 28 * 32).astype(np.float32)
                _noiseE = np.reshape(_noiseE, [-1, 28, 28, 32])
                _noise = np.random.normal(0.0, sigmaHGM**2,
                                          28 * 28 * 32).astype(np.float32)
                _noise = np.reshape(_noise, [-1, 28, 28, 32]) * grad_redis
                for j in range(test_size):
                    pred = argmax_predictions[j]
                    predictions_form_argmax[j, pred] += 1
                softmax_predictions = softmax_y.eval(
                    feed_dict={
                        x: adv_images_dict,
                        keep_prob: 1.0,
                        noise: (__noiseE + __noiseH) / 2 +
                        (_noiseE + _noise) / 4
                    })
                argmax_predictions = np.argmax(softmax_predictions, axis=1)
            final_predictions = predictions_form_argmax
            is_correct = []
            is_robust = []
            for j in range(test_size):
                is_correct.append(
                    np.argmax(mnist.test.labels[j]) == np.argmax(
                        final_predictions[j]))
                robustness_from_argmax = robustnessGGaussian.robustness_size_argmax(
                    counts=predictions_form_argmax[j],
                    eta=0.05,
                    dp_attack_size=fgsm_eps,
                    dp_epsilon=dp_epsilon,
                    dp_delta=1e-5,
                    dp_mechanism='gaussian') / dp_mult
                is_robust.append(robustness_from_argmax >= fgsm_eps)
            adv_acc_dict[atk] = np.sum(is_correct) * 1.0 / test_size
            robust_adv_acc_dict[atk] = np.sum([
                a and b for a, b in zip(is_robust, is_correct)
            ]) * 1.0 / np.sum(is_robust)
            robust_adv_utility_dict[atk] = np.sum(is_robust) * 1.0 / test_size
            print(" {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(
                atk, adv_acc_dict[atk], robust_adv_acc_dict[atk],
                robust_adv_utility_dict[atk],
                robust_adv_acc_dict[atk] * robust_adv_utility_dict[atk]))
            ##############################
    log_str = "step: {}\t target_epsilon: {}\t dp_epsilon: {:.1f}\t attack_norm_bound: {:.1f}\t benign_acc: {:.4f}\t".format(
        i, target_eps, dp_epsilon, attack_norm_bound, benign_acc)
    for atk in attack_switch.keys():
        if attack_switch[atk]:
            log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(
                atk, adv_acc_dict[atk], robust_adv_acc_dict[atk],
                robust_adv_utility_dict[atk],
                robust_adv_acc_dict[atk] * robust_adv_utility_dict[atk])
    print(log_str)
    logfile.write(log_str + '\n')
    ##############################
    duration = time.time() - start_time
    logfile.write(str(duration) + '\n')
    logfile.flush()
    logfile.close()
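A hypothetical invocation of the routine above; except for ratio = 16 and fgsm_eps = 0.1, which appear in the commented-out lines at the top of the function, the argument values below are placeholders:

# Hypothetical call (placeholder values, not from the original script):
train(fgsm_eps=0.1, _dp_epsilon=1.0, _attack_norm_bound=0.3,
      log_filename='logs/pixeldp_mnist_', ratio=16)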
Example #9
def training_embedding(reverse_dictionary, with_dp=False):
    """
    # training with DP
    :param with_dp:
    :return:
    """
    batch_size = 128
    embedding_size = 300  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    # We pick a random validation set to sample nearest neighbors. here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.array(random.sample(range(valid_window), valid_size))
    num_sampled = 64  # Number of negative examples to sample.

    learning_rate = 1

    # DP parameters
    clip_bound = 0.01  # 'the clip bound of the gradients'
    # num_steps = 160000  # 'number of steps T = E * N / L = E / q'
    sigma = 5  # 'sigma'
    delta = 1e-5  # 'delta'

    sess = tf.InteractiveSession()

    graph = tf.Graph()
    avg_loss_arr = []
    loss_arr = []
    # with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    with tf.device('/gpu:0'):
        train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Variables.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    if FLAGS.with_nce_loss:
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        cross_entropy = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))
    else:
        with tf.device('/gpu:0'):
            softmax_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # Compute the softmax loss, using a sample of the negative labels each time.
            # Read more: https://stackoverflow.com/questions/37671974/tensorflow-negative-sampling
            # When we want to compute the softmax probability for the true label,
            # we compute: logits[true_label] / sum(logits[negative_sampled_labels])
            # Other candidate sampling: https://www.tensorflow.org/extras/candidate_sampling.pdf
        cross_entropy = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                       biases=softmax_biases,
                                       inputs=embed,
                                       labels=train_labels,
                                       num_sampled=num_sampled,
                                       num_classes=vocabulary_size))

    priv_accountant = accountant.GaussianMomentsAccountant(vocabulary_size)
    privacy_accum_op = priv_accountant.accumulate_privacy_spending(
        [None, None], sigma, batch_size)

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    # optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

    optimizer = GradientDescentOptimizer(learning_rate)
    if FLAGS.optimizer == "adam":
        # cannot use Adam so far; tested and the model could not converge.
        optimizer = AdamOptimizer(learning_rate)
        print("##INFO: Using adam optimizer")
    if FLAGS.optimizer == "adagrad":
        # cannot use Adam so far; tested and the model could not converge.
        optimizer = AdagradOptimizer(learning_rate)
        print("##INFO: Using adagrad optimizer")

    log_dir = os.path.join(FLAGS.trained_models, "logs")

    # compute gradient
    if FLAGS.with_nce_loss:
        gw_Embeddings = tf.gradients(cross_entropy,
                                     embeddings)[0]  # gradient of embeddings
        gw_softmax_weights = tf.gradients(
            cross_entropy, nce_weights)[0]  # gradient of nce_weights
        gb_softmax_biases = tf.gradients(
            cross_entropy, nce_biases)[0]  # gradient of nce_biases
    else:
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.gradients(
                cross_entropy, embeddings)[0]  # gradient of embeddings
            gw_softmax_weights = tf.gradients(
                cross_entropy,
                softmax_weights)[0]  # gradient of softmax_weights
            gb_softmax_biases = tf.gradients(
                cross_entropy, softmax_biases)[0]  # gradient of softmax_biases

    # clip gradient
    if FLAGS.clip_by_norm:
        # faster but takes more epochs to train
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.clip_by_norm(gw_Embeddings, clip_bound)
            gw_softmax_weights = tf.clip_by_norm(gw_softmax_weights,
                                                 clip_bound)
            gb_softmax_biases = tf.clip_by_norm(gb_softmax_biases, clip_bound)
    else:
        # DP-SGD: slower and needs more memory, but converges in fewer epochs.
        gw_Embeddings = utils.BatchClipByL2norm(gw_Embeddings, clip_bound)
        gw_softmax_weights = utils.BatchClipByL2norm(gw_softmax_weights,
                                                     clip_bound)
        gb_softmax_biases = utils.BatchClipByL2norm(gb_softmax_biases,
                                                    clip_bound)

    sensitivity = clip_bound  # adjacency matrix with one more tuple

    # Add noise
    if FLAGS.with_dp:
        gw_Embeddings += tf.random_normal(shape=tf.shape(gw_Embeddings),
                                          mean=0.0,
                                          stddev=sigma * (sensitivity**2),
                                          dtype=tf.float32)
        gw_softmax_weights += tf.random_normal(
            shape=tf.shape(gw_softmax_weights),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)
        gb_softmax_biases += tf.random_normal(
            shape=tf.shape(gb_softmax_biases),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)

    if FLAGS.with_nce_loss:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings), (gw_softmax_weights, nce_weights),
            (gb_softmax_biases, nce_biases)
        ])
    else:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings), (gw_softmax_weights, softmax_weights),
            (gb_softmax_biases, softmax_biases)
        ])

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    with tf.device('/gpu:0'):
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           tf.transpose(normalized_embeddings))

    min_loss = 10**4
    per_dec_count = 0

    print('Initialized')
    average_loss = 0

    running = True
    step = 0
    average_loss_arr = []
    saving_pointer_idx = 0

    # put it here because Adam has its own variables.
    sess.run(tf.global_variables_initializer())

    # saver must be used after global_variables_initializer
    saver = tf.train.Saver()

    # Save the variables to disk.
    save_path = os.path.join(FLAGS.trained_models, "initialized_model.ckpt")
    # Sonvx: we need to make sure initialized variables are all the same for different tests.
    print("Checking on path: ", save_path)
    if not os.path.isfile(save_path + ".index"):
        saved_info = saver.save(sess, save_path)
        print("Global initialized model saved in file: %s" % saved_info)
    else:
        saver.restore(sess, save_path)
        print("Restored the global initialized model.")
    if FLAGS.DEBUG:
        input(
            "Double check whether or not the initialized model got restored then <Press enter>"
        )
    print('###INFO: Initialized in run(graph)')

    if FLAGS.RESTORE_LAST_CHECK_POINT:
        checkpoint_path = os.path.join(log_dir, "model.ckpt")
        if os.path.isfile(checkpoint_path + ".index"):
            saver.restore(sess, checkpoint_path)
            print("Restored the latest checkpoint at %s." % (checkpoint_path))

    while running:
        # for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_skips,
                                                  skip_window)
        print("Global data_index = ", data_index)
        # feed_dict = {train_dataset: batch_data, train_labels: batch_labels}

        # old: sess.run([optimizer, cross_entropy], feed_dict=feed_dict)
        # template: train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5});
        train_step.run(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })
        loss = cross_entropy.eval(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })

        # loss_arr.append(l)
        # average_loss += l
        # current_avg_loss = average_loss/step
        # avg_loss_arr.append(current_avg_loss)

        sess.run([privacy_accum_op])
        # print(step, spent_eps_deltas)

        average_loss += loss

        if step == 0:
            step_dev = 0.1 * 5  # avoid dividing by zero on the very first step
        else:
            step_dev = step

        current_avg_loss = np.mean(average_loss) / step_dev
        average_loss_arr.append(current_avg_loss)

        if step % 200 == 0:
            # if step > 0:
            # average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, current_avg_loss))
            # TODO: turn this back on if unsure how average_loss influences the training process
            print("Embedding: ")
            em_val = tf.reduce_mean(tf.abs(embeddings))
            print(sess.run(em_val))
            # average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        check_step = (FLAGS.NUM_STEPS * 0.2)
        if step % check_step == 0:
            # gw_emb = tf.reduce_mean(tf.abs(gw_Embeddings))
            # print("Embedding gradients: ")
            # print(sess.run(gw_emb))

            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)

        current_saving_dir = os.path.join(
            FLAGS.trained_models,
            "_%sepoch" % (saving_pointers[saving_pointer_idx]))
        # EARLY STOPPING
        if min_loss >= current_avg_loss:
            min_loss = current_avg_loss
            per_dec_count = 0

            if FLAGS.save_best_model_alltime:
                best_of_saving_point_dir = os.path.join(
                    current_saving_dir, "_best_one")
                if not os.path.exists(best_of_saving_point_dir):
                    os.makedirs(best_of_saving_point_dir)

                temp_embeddings = normalized_embeddings.eval()
                spent_eps_deltas = priv_accountant.get_privacy_spent(
                    sess, target_eps=target_eps)
                saving_state(best_of_saving_point_dir, spent_eps_deltas,
                             temp_embeddings, saver, sess)
            msg = ("Got best model so far at step %s , avg loss = %s" %
                   (step, current_avg_loss))
            logging.info(msg)
            print(msg)
        else:
            per_dec_count += 1

        step += 1

        if per_dec_count == max_early_stopping or step == num_steps:
            running = False

        if (step + 1) in saving_pointers:
            spent_eps_deltas = priv_accountant.get_privacy_spent(
                sess, target_eps=target_eps)
            folder_path = os.path.join(FLAGS.trained_models,
                                       "_%sepoch" % (step + 1))
            temp_embeddings = normalized_embeddings.eval()
            saving_state(folder_path, spent_eps_deltas, temp_embeddings, saver,
                         sess)
            # Make sure saving_pointer_idx does not exceed the number of pointers we set.
            if saving_pointer_idx < len(saving_pointers) - 1:
                saving_pointer_idx += 1
            msg = "##INFO: STEP %s: avg_loss history: avg_loss_arr = %s" % (
                step, average_loss_arr)
            logging.info(msg)

        if step == num_steps - 1:
            print("Final privacy spent: ", step, spent_eps_deltas)

    print("Stopped at %s, \nFinal avg_loss = %s" % (step, avg_loss_arr))
    print("loss = %s" % (loss_arr))

    # final_embeddings = normalized_embeddings.eval()
    sess.close()
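Example #9 also references several module-level names that are defined elsewhere in the surrounding script (generate_batch, saving_state, FLAGS, and the constants below). The values here are placeholders for illustration only, not the original configuration:

vocabulary_size = 50000      # size of the word2vec vocabulary
target_eps = [0.125, 0.25, 0.5, 1, 2, 4, 8]  # epsilons queried from the moments accountant
saving_pointers = [1, 2, 4, 8]               # checkpoints ("epochs") at which to save state
max_early_stopping = 10      # patience of the early-stopping counter
num_steps = 100001           # maximum number of training steps
data_index = 0               # cursor advanced by generate_batch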