Example #1
def move_to_click(x, y):
    t('绝对移动 点击')  # log: "absolute move, click"
    move_to(x, y)
    time.sleep(random.random() / 21)
    km._left_button_down()
    time.sleep(random.random() / 5)
    km._left_button_up()
Example #2
def loss_and_grad(Wf):
    """Returns cost, gradient for current parameter vector."""
    global fs, X, global_cov_A, global_whitened_A

    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    W.insert(0, X)

    A = [None] * (n + 2)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = nonlin(W[i] @ A[i])
    err = (A[n + 1] - A[1])

    B = [None] * (n + 1)

    B[n] = 2 * err * d_nonlin(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])

    dW = [None] * (n + 1)

    for i in range(1, n + 1):
        dW[i] = (B[i] @ t(A[i]))

    loss = u.L2(err)
    grad = u.flatten(dW[1:])
    return loss, grad
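
The loops above are reverse-mode differentiation written out by hand: B[i] = t(W[i+1]) @ B[i+1] scaled by the nonlinearity derivative, and dW[i] = B[i] @ t(A[i]). Below is a self-contained NumPy sketch of the same recurrence for a two-layer sigmoid autoencoder, checked against a central finite difference; the names sigmoid and manual_loss_and_grad are illustrative, not part of the u module used above.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def manual_loss_and_grad(W1, W2, X):
    """0.5*L2 reconstruction loss and its gradients via the hand-written recurrence."""
    A1 = X                                    # activations feeding W1 (the input)
    A2 = sigmoid(W1 @ A1)                     # hidden layer
    A3 = sigmoid(W2 @ A2)                     # reconstruction
    err = A3 - A1
    loss = 0.5 * np.sum(err ** 2)

    d_sigmoid = lambda y: y * (1 - y)         # derivative expressed through the output
    B2 = err * d_sigmoid(A3)                  # backprop reaching W2
    B1 = (W2.T @ B2) * d_sigmoid(A2)          # backprop reaching W1
    return loss, B1 @ A1.T, B2 @ A2.T         # loss, dW1, dW2

rng = np.random.RandomState(0)
X = rng.randn(4, 10)                          # one example per column, as in the code above
W1 = 0.1 * rng.randn(3, 4)
W2 = 0.1 * rng.randn(4, 3)
loss, dW1, dW2 = manual_loss_and_grad(W1, W2, X)

# central-difference check of one gradient entry
eps = 1e-5
Wp, Wm = W1.copy(), W1.copy()
Wp[0, 0] += eps
Wm[0, 0] -= eps
numeric = (manual_loss_and_grad(Wp, W2, X)[0] - manual_loss_and_grad(Wm, W2, X)[0]) / (2 * eps)
assert abs(numeric - dW1[0, 0]) < 1e-6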
Example #3
def move_rel_click(x, y):
    t('相对移动 点击')  # log: "relative move, click"
    move_rel(x, y)
    time.sleep(random.random() / 20)
    km._left_button_down()
    time.sleep(random.random() / 5)
    km._left_button_up()
Example #4
def model_fit():
    t('开始训练')  # log: "start training"

    global model, epochs, train_data_gen, total_train, batch_size, val_data_gen, total_val

    history = model.fit_generator(train_data_gen,
                                  steps_per_epoch=total_train // batch_size,
                                  epochs=epochs,
                                  validation_data=val_data_gen,
                                  validation_steps=total_val // batch_size)
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs_range = range(epochs)

    plt.figure(figsize=(8, 8))
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()
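
Note that Model.fit_generator is deprecated in newer TensorFlow/Keras releases; Model.fit accepts the same generators with the same keyword arguments. A drop-in sketch for the call inside model_fit above, assuming the same globals are in scope:

history = model.fit(train_data_gen,
                    steps_per_epoch=total_train // batch_size,
                    epochs=epochs,
                    validation_data=val_data_gen,
                    validation_steps=total_val // batch_size)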
Example #5
  def correct(self, grad):
    """Accepts IndexedGrad object, produces corrected version."""
    s = self

    vars_ = []
    grads_new = []

    assert list(grad) == self.model.trainable_vars

    dsize = get_batch_size(grad)
    
    for var in grad:
      vars_.append(var)
      A = s.extract_A(grad, var)    # extract activations
      B = s.extract_B(grad, var)*dsize    # extract backprops
      if s.needs_correction(var):
        # correct the gradient. Assume op is left matmul
        A_svd = s[var].A.svd
        B2_svd = s[var].B2.svd
        if inverse_method == 'pseudo_inverse':
          A_new = u.pseudo_inverse2(A_svd) @ A
          B_new = u.pseudo_inverse2(B2_svd) @ B
        elif inverse_method == 'inverse':
          A_new = A_svd.inv @ A
          B_new = B2_svd.inv @ B
        else:
          assert False
          
        dW_new = (B_new @ t(A_new)) / dsize
        grads_new.append(dW_new)
      else:  
        dW = B@t(A)/dsize   
        grads_new.append(dW)

    return IndexedGrad(grads=grads_new, vars_=vars_)
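
The corrected gradient above is the usual K-FAC form: whitening the activations with the inverse activation covariance and the backprops with the inverse backprop covariance is the same as sandwiching the raw per-layer gradient between the two inverse covariances. A small NumPy sketch of that identity, with plain matrix inverses standing in for the SVD/pseudo-inverse machinery used above:

import numpy as np

rng = np.random.RandomState(0)
dsize = 50
A = rng.randn(6, dsize)                      # activations for one layer (one column per example)
B = rng.randn(4, dsize)                      # backprops for the same layer

Acov = A @ A.T / dsize + 1e-3 * np.eye(6)    # damped factor covariances
Bcov = B @ B.T / dsize + 1e-3 * np.eye(4)

# whiten activations and backprops separately, as in correct() ...
dW_corrected = (np.linalg.inv(Bcov) @ B) @ (np.linalg.inv(Acov) @ A).T / dsize

# ... which equals the raw gradient sandwiched between the two inverse covariances
dW_raw = B @ A.T / dsize
assert np.allclose(dW_corrected, np.linalg.inv(Bcov) @ dW_raw @ np.linalg.inv(Acov))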
Example #6
def model_predict(paths):
    t('模型预测')  # log: "model prediction"
    global model
    imgs = [load_and_preprocess_image(path) for path in paths]
    imgs = tf.convert_to_tensor(imgs)
    predictions = model.predict(imgs)
    predictions = [row[0] for row in predictions]
    print(predictions)
    min_index = predictions.index(min(predictions))
    print(f' 预测结果为 第 > {min_index + 1} < 张图片')  # "the prediction is image #{min_index + 1}"
    return min_index
Example #7
def count():
    t('统计')  # log: "dataset statistics"
    global total_train, total_val, epochs
    total_train = sumNum(c.train_dir)
    total_val = sumNum(c.validation_dir)
    epochs = math.ceil(total_train / batch_size)
    print('训练集标签 :', os.listdir(c.train_dir))  # "training set labels:"
    print('训练集图片个数 :', total_train)  # "number of training images:"
    print("验证集个数 :", total_val)  # "number of validation images:"
    print(f'每批次训练个数: {batch_size}, 共要进行 {epochs} 轮训练')  # "images per batch: {batch_size}; {epochs} training rounds in total"
    if total_train == 0:
        print('样本为0 无法训练')  # "zero samples, cannot train"
        return False
    return True
Example #8
def natural_kfac(lr0, num_samples=1):
  init_dict[lr_holder] = lr0
  np.random.seed(0)
  tf.set_random_seed(0)

  A = [0]*(n+2)
  A2 = [0]*(n+2)  # augmented forward props for natural gradient
  A[0] = u.Identity(dsize)
  A2[0] =  u.Identity(dsize*num_samples)
  for i in range(n+1):
    # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))
    if i == 0:
      A2[i+1] = tf.concat([W[0]]*num_samples, axis=1)
    else:
      A2[i+1] = tf.matmul(W[i], A2[i], name="A2"+str(i+1))

  # create backprop matrices
  # B[i] has backprop for matrix i
  B = [0]*(n+1)
  B2 = [0]*(n+1)
  B[n] = -err/dsize
  B2[n] = tf.random_normal((f(n), dsize*num_samples), 0, 1, seed=0,
                           dtype=dtype)
  for i in range(n-1, -1, -1):
    B[i] = tf.matmul(tf.transpose(W[i+1]), B[i+1], name="B"+str(i))
    B2[i] = tf.matmul(tf.transpose(W[i+1]), B2[i+1], name="B2"+str(i))

  # Kronecker factored covariance blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      if i == j:
        acov = A2[i] @ t(A2[j]) / (dsize*num_samples)
        bcov = B2[i] @ t(B2[j]) / (dsize*num_samples);
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=(f(i)*f(i-1), f(j)*f(j-1)), dtype=dtype)
      iblocks[i][j]=term
      
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]

  ifisher = u.concat_blocks(iblocks)
  train_op = grad_update(Wf - lr * ifisher @ dWf)
  return do_run(train_op)
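
Each diagonal block above is built as kr(pseudo_inverse(acov), pseudo_inverse(bcov)), which relies on the Kronecker identity (A ⊗ B)⁺ = A⁺ ⊗ B⁺, so the inverse of the full block never has to be formed directly. A quick NumPy check of that identity:

import numpy as np

rng = np.random.RandomState(0)
acov = rng.randn(4, 2) @ rng.randn(2, 4)   # rank-deficient, so the pseudo-inverse is not a plain inverse
bcov = rng.randn(3, 3)

lhs = np.linalg.pinv(np.kron(acov, bcov))
rhs = np.kron(np.linalg.pinv(acov), np.linalg.pinv(bcov))
assert np.allclose(lhs, rhs, atol=1e-8)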
Example #9
  def __init__(self, data, var, prefix, Lambda):
    global numeric_inverse
    
    dsize = get_batch_size(data)
    # TODO: try adding regularizer later
    cov_op = data @ t(data) / dsize
    if regularize_covariances:
      #ii = u.Identity(cov_op.shape[0])
      #regularizer = Lambda*ii
      regularizer = u.cachedGpuIdentityRegularizer(cov_op.shape[0],
                                                   Lambda=u.args.Lambda)
      cov_op = cov_op + regularizer
      
    cov_name = "%s_cov_%s" %(prefix, var.op.name)
    svd_name = "%s_svd_%s" %(prefix, var.op.name)

    # hack: Polyak averaging of the covariance is disabled (pp = 1); note that the
    # first pp < 1 branch below relies on `ii` from the commented-out regularizer above
    # pp = u.args.kfac_polyak_factor
    pp = 1
    if pp<1:
      self.cov = u.get_variable(name=cov_name, initializer=ii)
    else:
      self.cov = u.get_variable(name=cov_name, initializer=cov_op)
      
    self.svd = u.SvdWrapper(target=self.cov, name=svd_name,
                            do_inverses=(inverse_method=='inverse'))
    #    self.cov_update_op = self.cov.initializer
    if pp<1:
      self.cov_update_op = self.cov.assign(cov_op*(1-pp)+self.cov*pp).op
    else:
      self.cov_update_op = self.cov.assign(cov_op).op
Example #10
 def hessian_quadratic(delta):
   #    update_covariances()
   W = u.unflatten(delta, fs[1:])
   W.insert(0, None)
   total = 0
   for l in range(1, n+1):
     decrement = tf.trace(t(W[l])@cov_B2[l]@W[l]@cov_A[l])
     total+=decrement
   return (total/2).eval()
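
hessian_quadratic evaluates the quadratic form ½ δᵀHδ layer by layer through the identity tr(Wᵀ B W A) = vec(W)ᵀ (Aᵀ ⊗ B) vec(W) for column-major vec, so the Kronecker-product Hessian block itself never has to be materialised. A small NumPy check of that identity with symmetric covariance-like factors:

import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(4, 3)                              # one per-layer parameter block
Bcov = rng.randn(4, 4)
Bcov = Bcov @ Bcov.T                             # backprop covariance (symmetric PSD)
Acov = rng.randn(3, 3)
Acov = Acov @ Acov.T                             # activation covariance (symmetric PSD)

vecW = W.reshape(-1, order='F')                  # column-major vec
quad_kron = vecW @ np.kron(Acov, Bcov) @ vecW    # vec(W)' (A ⊗ B) vec(W); A symmetric, so A' = A
quad_trace = np.trace(W.T @ Bcov @ W @ Acov)     # the trace form used in hessian_quadratic
assert np.allclose(quad_kron, quad_trace)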
Example #11
def loss_and_grad(Wf):
    """Returns cost, gradient for current parameter vector."""
    global fs, X, global_cov_A, global_whitened_A

    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    W.insert(0, X)

    A = [None] * (n + 2)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = tf.sigmoid(W[i] @ A[i])
    err = (A[3] - A[1])  # reconstruction error; assumes a 2-layer autoencoder (n == 2), so A[3] == A[n+1]

    def d_sigmoid(y):
        return y * (1 - y)

    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    sampled_labels = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    B2[n] = sampled_labels * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    whitened_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)

    if global_cov_A is None:
        global_cov_A = A[1] @ t(A[1]) / dsize
        global_whitened_A = regularized_inverse(global_cov_A, lambda_) @ A[1]

    cov_A[1] = global_cov_A
    whitened_A[1] = global_whitened_A

    for i in range(1, n + 1):
        if i > 1:
            cov_A[i] = A[i] @ t(A[i]) / dsize
            whitened_A[i] = regularized_inverse(cov_A[i], lambda_) @ A[i]
        cov_B2[i] = B2[i] @ t(B2[i]) / dsize
        whitened_B = regularized_inverse(cov_B2[i], lambda_) @ B[i]
        pre_dW[i] = (whitened_B @ t(whitened_A[i])) / dsize
        dW[i] = (B[i] @ t(A[i])) / dsize

    reconstruction = u.L2(err) / (2 * dsize)
    loss = reconstruction

    grad = u.flatten(dW[1:])
    kfac_grad = u.flatten(pre_dW[1:])
    return loss, grad, kfac_grad
Example #12
def data_generator():
    t('数据生成器')  # log: "data generators"
    global train_data_gen, val_data_gen

    train_image_generator = ImageDataGenerator(
        rescale=1. / 255)  # Generator for our training data
    validation_image_generator = ImageDataGenerator(
        rescale=1. / 255)  # Generator for our validation data

    train_data_gen = train_image_generator.flow_from_directory(
        batch_size=batch_size,
        directory=c.train_dir,
        shuffle=True,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        class_mode='binary')
    val_data_gen = validation_image_generator.flow_from_directory(
        batch_size=batch_size,
        directory=c.validation_dir,
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        class_mode='binary')
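
The objects returned by flow_from_directory are iterators that yield (images, labels) batches; a quick sanity check, assuming data_generator() above has already run:

images, labels = next(train_data_gen)
print(images.shape)   # (batch_size, IMG_HEIGHT, IMG_WIDTH, 3), rescaled to [0, 1]
print(labels.shape)   # (batch_size,) of 0/1 labels because class_mode='binary'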
Example #13
 def hessian_quadratic_inv(delta):
   #    update_covariances()
   W = u.unflatten(delta, fs[1:])
   W.insert(0, None)
   total = 0
   for l in range(1, n+1):
     invB2 = u.pseudo_inverse2(vars_svd_B2[l])
     invA = u.pseudo_inverse2(vars_svd_A[l])
     decrement = tf.trace(t(W[l])@invB2@W[l]@invA)
     total+=decrement
   return (total/2).eval()
Example #14
def newton_bd(lr0):
  init_dict[lr_holder] = lr0

  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))
        
      blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1))

        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]

  # todo -- figure out why this is not the same as block inversion
  # grads = tf.concat([u.khatri_rao(A[i], Bn[i]) for i in range(1, n+1)], axis=0)
  # hess = grads @ tf.transpose(grads) / dsize
  # blocks = u.partition_matrix_evenly(hess, 10)
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  train_op = grad_update(Wf - lr * ihess @ dWf)
  return do_run(train_op)
Example #15
def model_summary():
    t('模型编译统计')  # log: "build, compile and summarize the model"
    global model
    model = Sequential([
        Conv2D(16, (3, 3),
               padding='same',
               activation='relu',
               input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        MaxPooling2D((2, 2)),
        Conv2D(32, (3, 3), padding='same', activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    model.summary()
Example #16
def newton_kfac(lr0):
  init_dict[lr_holder] = lr0

  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # inverse Hessian blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      # reuse Hess tensor calculation in order to get off-diag block sizes
      dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = (Bn[i] @ t(Bn[j]))/dsize
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
      iblocks[i][j]=term
        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]
    
  ihess = u.concat_blocks(iblocks)
  
  train_op = grad_update(Wf - lr * ihess @ dWf)
  return do_run(train_op)
Example #17
def newton(lr0):
  init_dict[lr_holder] = lr0

  # todo, get rid of B's
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # Create U's
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))
        
      blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1))

        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]
    
  hess = u.concat_blocks(blocks)
  ihess = u.pseudo_inverse(hess)
  train_op = grad_update(Wf - lr * ihess @ dWf)
  return do_run(train_op)
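
Kmat(f(j), f(j-1)) in the off-diagonal Hessian blocks above plays the role of a commutation matrix, the permutation K with K @ vec(A) = vec(Aᵀ). A minimal NumPy construction of such a matrix for column-major vec (a sketch of the concept, not necessarily the exact Kmat helper used here):

import numpy as np

def commutation_matrix(m, n):
    """K of shape (m*n, m*n) with K @ vec(A) == vec(A.T) for A of shape (m, n)."""
    K = np.zeros((m * n, m * n))
    for i in range(m):
        for j in range(n):
            # vec(A)[j*m + i] is A[i, j]; it must land at vec(A.T)[i*n + j]
            K[i * n + j, j * m + i] = 1.0
    return K

A = np.arange(12.0).reshape(3, 4)
K = commutation_matrix(3, 4)
vec = lambda M: M.reshape(-1, order='F')   # column-major vec
assert np.allclose(K @ vec(A), vec(A.T))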
Example #18
def rotations2_natural_sampled_kfac(num_samples=1):
    tf.reset_default_graph()
    np.random.seed(0)
    tf.set_random_seed(0)

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    # initialize data + layers
    # W[0] is input matrix (X), W[n] is last matrix
    # A[1] has activations for W[1], equal to W[0]=X
    # A[n+1] has predictions
    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)

    A = [0] * (n + 2)
    A2 = [0] * (n + 2)  # augmented forward props for natural gradient
    A[0] = u.Identity(dsize)
    A2[0] = u.Identity(dsize * num_samples)
    for i in range(n + 1):
        # fs is off by 2 from common notation, ie W[0] has shape f[0],f[-1]
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))
        if i == 0:
            # replicate dataset multiple times corresponding to number of samples
            A2[i + 1] = tf.concat([W[0]] * num_samples, axis=1)
        else:
            A2[i + 1] = tf.matmul(W[i], A2[i], name="A2" + str(i + 1))

    # input dimensions match
    assert W[0].get_shape() == X0.shape
    # output dimensions match
    assert (W[-1].get_shape()[0], W[0].get_shape()[1]) == Y0.shape
    assert A[n + 1].get_shape() == Y0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)

    # lower learning rate by 10x
    lr = tf.Variable(0.01, dtype=dtype)

    # create backprop matrices
    # B[i] has backprop for matrix i
    B = [0] * (n + 1)
    B2 = [0] * (n + 1)
    B[n] = -err / dsize
    B2[n] = tf.random_normal((f(n), dsize * num_samples),
                             0,
                             1,
                             seed=0,
                             dtype=dtype)
    for i in range(n - 1, -1, -1):
        B[i] = tf.matmul(tf.transpose(W[i + 1]), B[i + 1], name="B" + str(i))
        B2[i] = tf.matmul(tf.transpose(W[i + 1]),
                          B2[i + 1],
                          name="B2" + str(i))

    # Create gradient update. Make copy of variables and split update into
    # two run calls. Using single set of variables will gives updates that
    # occasionally produce wrong results/NaN's because of data race

    dW = [0] * (n + 1)
    dW2 = [0] * (n + 1)
    updates1 = [0] * (n + 1)  # compute updated value into Wcopy
    updates2 = [0] * (n + 1)  # copy value back into W
    Wcopy = [0] * (n + 1)
    for i in range(n + 1):
        Wi_name = "Wcopy" + str(i)
        Wi_shape = (fs[i + 1], fs[i])
        Wi_init = tf.zeros(dtype=dtype, shape=Wi_shape, name=Wi_name + "_init")
        Wcopy[i] = tf.Variable(Wi_init, name=Wi_name, trainable=False)

        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
        dW2[i] = tf.matmul(B2[i], tf.transpose(A2[i]), name="dW2" + str(i))

    del dW[0]  # get rid of W[0] update
    del dW2[0]  # get rid of W[0] update

    # construct flattened gradient update vector
    dWf = tf.concat([vec(grad) for grad in dW], axis=0)

    # todo: divide both activations and backprops by size for cov calc

    # Kronecker factored covariance blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            if i == j:
                acov = A2[i] @ t(A2[j]) / (dsize * num_samples)
                bcov = B2[i] @ t(B2[j]) / (dsize * num_samples)
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=(f(i) * f(i - 1), f(j) * f(j - 1)),
                                dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ifisher = u.concat_blocks(iblocks)

    Wf_copy = tf.Variable(tf.zeros(dtype=dtype,
                                   shape=Wf.shape,
                                   name="Wf_copy_init"),
                          name="Wf_copy")
    new_val_matrix = Wf - lr * (ifisher @ dWf)
    train_op1 = Wf_copy.assign(new_val_matrix)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run(loss)
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Example #19
def loss_and_output_and_grad(Wf):
    """Returns cost, gradient for current parameter vector."""
    global fs, X, global_cov_A, global_whitened_A

    W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
    W.insert(0, X)

    A = [None] * (n + 2)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = nonlin(W[i] @ A[i])


#    print(A[i+1])
#    #    print(A[i+1])
    err = (A[n + 1] - A[1])

    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_nonlin(A[n + 1])
    #  sampled_labels = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)

    #  print('random')
    #  print(np.random.randn(*X.shape).astype(dtype))
    noise = tf.constant(np.random.randn(*err.shape).astype(dtype))
    #  print(noise)
    B2[n] = noise * d_nonlin(A[n + 1])
    #  print("B2[n]", B2[n])
    #  print(B[n])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])
        B2[i] = backprop2 * d_nonlin(A[i + 1])

    dW = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    whitened_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    cov_B = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)

    if global_cov_A is None:
        global_cov_A = A[1] @ t(A[1]) / dsize
        global_whitened_A = regularized_inverse(global_cov_A) @ A[1]

    cov_A[1] = global_cov_A
    whitened_A[1] = global_whitened_A

    for i in range(1, n + 1):
        if i > 1:
            cov_A[i] = A[i] @ t(A[i]) / dsize
            whitened_A[i] = regularized_inverse(cov_A[i]) @ A[i]
        cov_B2[i] = B2[i] @ t(B2[i]) / dsize
        cov_B[i] = B[i] @ t(B[i]) / dsize
        if SYNTHETIC_LABELS:
            whitened_B = regularized_inverse(cov_B2[i]) @ B[i]
        else:
            whitened_B = regularized_inverse(cov_B[i]) @ B[i]

        #regularized_inverse(cov_B[i])
        #    print("A", i, cov_A[i], regularized_inverse(cov_A[i]))
        #    print("B", i, cov_B[i], regularized_inverse(cov_B[i]))

        #    pre_dW[i] = (whitened_B @ t(whitened_A[i]))/dsize
        #    print(i, 'A', A[i].numpy())
        #    print(regularized_inverse(cov_A[i]).numpy())
        pre_dW[i] = (whitened_B @ t(whitened_A[i])) / dsize

        dW[i] = (B[i] @ t(A[i])) / dsize

    loss = u.L2(err) / 2 / dsize
    grad = u.flatten(dW[1:])
    kfac_grad = u.flatten(pre_dW[1:])
    return loss, A[n + 1], grad, kfac_grad
Example #20
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True)/dsize

  # B[i] = backprops needed to compute gradient of W[i]
  # B2[i] = backprops from sampled labels needed for natural gradient
  B = [None]*(n+1)
  B2 = [None]*(n+1)
  B[n] = err*d_sigmoid(A[n+1])
  sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  if use_fixed_labels:
    sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)
    
  sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
  B2[n] = sampled_labels*d_sigmoid(A[n+1])
  for i in range(n-1, -1, -1):
    backprop = t(W[i+1]) @ B[i+1]
    backprop2 = t(W[i+1]) @ B2[i+1]
    if i == 1 and not drop_sparsity:
      backprop += beta*d_kl(rho, rho_hat)
      backprop2 += beta*d_kl(rho, rho_hat)
    B[i] = backprop*d_sigmoid(A[i+1])
    B2[i] = backprop2*d_sigmoid(A[i+1])

  # dW[i] = gradient of W[i]
  dW = [None]*(n+1)
  pre_dW = [None]*(n+1)  # preconditioned dW
  pre_dW_stable = [None]*(n+1)  # preconditioned stable dW

  cov_A = [None]*(n+1)    # covariance of activations[i]
  cov_B2 = [None]*(n+1)   # covariance of synthetic backprops[i]
  vars_svd_A = [None]*(n+1)
Example #21
def rotations1_gradient_test():
    #  https://www.wolframcloud.com/objects/ff6ecaf0-fccd-44e3-b26f-970d8fc2a57c
    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations1_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations1_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations1_W0f.csv', delimiter=","))

    fs = np.genfromtxt('data/large_rotations1_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr0 = np.genfromtxt('data/large_rotations1_gradient_lr.csv')
    lr = tf.Variable(lr0, dtype=dtype)

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    expected_losses = np.loadtxt("data/large_rotations1_gradient_losses.csv",
                                 delimiter=",")
    observed_losses = []
    # from accompanying notebook
    # {0.102522, 0.028124, 0.00907214, 0.00418929, 0.00293379,
    for i in range(10):
        observed_losses.append(sess.run([loss])[0])
        sess.run(train_op1)
        sess.run(train_op2)

    u.check_equal(observed_losses, expected_losses)
Example #22
def model_creator(batch_size, name="default", dtype=np.float32):
    """Create MNIST autoencoder model. Dataset is part of model."""

    model = Model(name)

    def get_batch_size(data):
        if isinstance(data, IndexedGrad):
            return int(data.live[0].shape[1])
        else:
            return int(data.shape[1])

    init_dict = {}
    global_vars = []
    local_vars = []

    # TODO: factor out to reuse between scripts
    # TODO: change feed_dict logic to reuse value provided to VarStruct
    # current situation makes reinitialization of a global variable change
    # its value, which is counterintuitive
    def init_var(val, name, is_global=False):
        """Helper to create variables with numpy or TF initial values."""
        if isinstance(val, tf.Tensor):
            var = u.get_variable(name=name, initializer=val, reuse=is_global)
        else:
            val = np.array(val)
            assert u.is_numeric(val), "Non-numeric type."

            var_struct = u.get_var(name=name, initializer=val, reuse=is_global)
            holder = var_struct.val_
            init_dict[holder] = val
            var = var_struct.var

        if is_global:
            global_vars.append(var)
        else:
            local_vars.append(var)

        return var

    # TODO: get rid of purely_relu
    def nonlin(x):
        if purely_relu:
            return tf.nn.relu(x)
        elif purely_linear:
            return tf.identity(x)
        else:
            return tf.sigmoid(x)

    # TODO: rename into "nonlin_d"
    def d_nonlin(y):
        if purely_relu:
            return u.relu_mask(y)
        elif purely_linear:
            return 1
        else:
            return y * (1 - y)

    patches = train_images[:, :args.batch_size]
    test_patches = test_images[:, :args.batch_size]

    if args.dataset == 'cifar':
        input_dim = 3 * 32 * 32
    elif args.dataset == 'mnist':
        input_dim = 28 * 28
    else:
        assert False
    if release_name == 'kfac_tiny':
        fs = [args.batch_size, input_dim, 196, input_dim]
    else:
        fs = [
            args.batch_size, input_dim, 1024, 1024, 1024, 196, 1024, 1024,
            1024, input_dim
        ]

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2

    # Full dataset from which new batches are sampled
    X_full = init_var(train_images, "X_full", is_global=True)

    X = init_var(patches, "X", is_global=False)  # stores local batch per model
    W = [None] * n
    W.insert(0, X)
    A = [None] * (n + 2)
    A[1] = W[0]
    for i in range(1, n + 1):
        init_val = ng_init(f(i), f(i - 1)).astype(dtype)
        W[i] = init_var(init_val, "W_%d" % (i, ), is_global=True)
        A[i + 1] = nonlin(kfac_lib.matmul(W[i], A[i]))
    err = A[n + 1] - A[1]
    model.loss = u.L2(err) / (2 * get_batch_size(err))

    # create test error eval
    layer0 = init_var(test_patches, "X_test", is_global=True)
    layer = layer0
    for i in range(1, n + 1):
        layer = nonlin(W[i] @ layer)
    verr = (layer - layer0)
    model.vloss = u.L2(verr) / (2 * get_batch_size(verr))

    # manually compute backprop to use for sanity checking
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_nonlin(A[n + 1])
    _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    if args.fixed_labels:
        _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)

    _sampled_labels = init_var(_sampled_labels_live,
                               "to_be_deleted",
                               is_global=False)

    B2[n] = _sampled_labels * d_nonlin(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B2[i] = backprop2 * d_nonlin(A[i + 1])

    # cov_A = [None]*(n+1)    # covariance of activations[i]
    # cov_B2 = [None]*(n+1)   # covariance of synthetic backprops[i]


#  vars_svd_A = [None]*(n+1)
#  vars_svd_B2 = [None]*(n+1)
#  dW = [None]*(n+1)
#  pre_dW = [None]*(n+1)   # preconditioned dW
# todo: decouple initial value from covariance update
# # maybe need start with identity and do running average
# for i in range(1,n+1):
#   if regularized_svd:
#     cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size+args.Lambda*u.Identity(f(i-1)), "cov_A%d"%(i,))
#     cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size+args.Lambda*u.Identity(f(i)), "cov_B2%d"%(i,))
#   else:
#     cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size, "cov_A%d"%(i,))
#     cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size, "cov_B2%d"%(i,))
#    vars_svd_A[i] = u.SvdWrapper(cov_A[i],"svd_A_%d"%(i,), do_inverses=False)
#    vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],"svd_B2_%d"%(i,), do_inverses=False)

#    whitened_A = u.cached_inverse(vars_svd_A[i], args.Lambda) @ A[i]
#    whitened_B = u.cached_inverse(vars_svd_B2[i], args.Lambda) @ B[i]
#    dW[i] = (B[i] @ t(A[i]))/args.batch_size
#    pre_dW[i] = (whitened_B @ t(whitened_A))/args.batch_size

    sampled_labels_live = A[n + 1] + tf.random_normal(
        (f(n), f(-1)), dtype=dtype, seed=0)
    if args.fixed_labels:
        sampled_labels_live = A[n + 1] + tf.ones(shape=(f(n), f(-1)),
                                                 dtype=dtype)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              is_global=False)
    err2 = A[n + 1] - sampled_labels
    model.loss2 = u.L2(err2) / (2 * args.batch_size)
    model.global_vars = global_vars
    model.local_vars = local_vars
    model.trainable_vars = W[1:]

    # todo, we have 3 places where model step is tracked, reduce
    model.step = init_var(u.as_int32(0), "step", is_global=False)
    advance_step_op = model.step.assign_add(1)
    assert get_batch_size(X_full) % args.batch_size == 0
    batches_per_dataset = (get_batch_size(X_full) // args.batch_size)
    batch_idx = tf.mod(model.step, batches_per_dataset)
    start_idx = batch_idx * args.batch_size
    advance_batch_op = X.assign(X_full[:,
                                       start_idx:start_idx + args.batch_size])

    def advance_batch():
        #    print("Step for model(%s) is %s"%(model.name, u.eval(model.step)))
        sess = u.get_default_session()
        # TODO: get rid of _sampled_labels
        sessrun([sampled_labels.initializer, _sampled_labels.initializer])
        if args.advance_batch:
            sessrun(advance_batch_op)
        sessrun(advance_step_op)

    model.advance_batch = advance_batch

    # TODO: refactor this to take initial values out of Var struct
    #global_init_op = tf.group(*[v.initializer for v in global_vars])
    global_init_ops = [v.initializer for v in global_vars]
    global_init_op = tf.group(*[v.initializer for v in global_vars])
    global_init_query_ops = [
        tf.logical_not(tf.is_variable_initialized(v)) for v in global_vars
    ]

    def initialize_global_vars(verbose=False, reinitialize=False):
        """If reinitialize is false, will not reinitialize variables already
    initialized."""

        sess = u.get_default_session()
        if not reinitialize:
            uninited = sessrun(global_init_query_ops)
            # use numpy boolean indexing to select list of initializers to run
            to_initialize = list(np.asarray(global_init_ops)[uninited])
        else:
            to_initialize = global_init_ops

        if verbose:
            print("Initializing following:")
            for v in to_initialize:
                print("   " + v.name)

        sessrun(to_initialize, feed_dict=init_dict)

    model.initialize_global_vars = initialize_global_vars

    # didn't quite work (can't initialize var in same run call as deps likely)
    # enforce that batch is initialized before everything
    # except fake labels opa
    # for v in local_vars:
    #   if v != X and v != sampled_labels and v != _sampled_labels:
    #     print("Adding dep %s on %s"%(v.initializer.name, X.initializer.name))
    #     u.add_dep(v.initializer, on_op=X.initializer)

    local_init_op = tf.group(*[v.initializer for v in local_vars],
                             name="%s_localinit" % (model.name))
    print("Local vars:")
    for v in local_vars:
        print(v.name)

    def initialize_local_vars():
        sess = u.get_default_session()
        sessrun(_sampled_labels.initializer, feed_dict=init_dict)
        sessrun(X.initializer, feed_dict=init_dict)
        sessrun(local_init_op, feed_dict=init_dict)

    model.initialize_local_vars = initialize_local_vars

    return model
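
The batch-advancing logic near the end of the model (batch_idx = step mod batches_per_dataset, then a column slice of X_full) is round-robin mini-batching over a dataset stored one example per column. A plain NumPy sketch of the same indexing:

import numpy as np

X_full = np.arange(24.0).reshape(2, 12)     # 12 examples, one per column
batch_size = 4
batches_per_dataset = X_full.shape[1] // batch_size

for step in range(5):                       # steps wrap around the dataset
    batch_idx = step % batches_per_dataset
    start = batch_idx * batch_size
    X = X_full[:, start:start + batch_size]
    print(step, X[0])                       # columns 0-3, 4-7, 8-11, then 0-3, 4-7 again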
Example #23
def rotations2_newton_kfac():
    tf.reset_default_graph()

    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # inverse Hessian blocks
    iblocks = u.empty_grid(n + 1, n + 1)
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            # reuse Hess tensor calculation in order to get off-diag block sizes
            dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                acov = A[i] @ t(A[j])
                bcov = (Bn[i] @ t(Bn[j])) / dsize
                term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
            else:
                term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
            iblocks[i][j] = term

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del iblocks[0]
    for row in iblocks:
        del row[0]

    ihess = u.concat_blocks(iblocks)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    elapsed_times = []
    u.reset_time()
    for i in range(10):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Example #24
def rotations2_newton_bd():
    # override kr with no-shape-inferring version
    def kr(A, B):
        return u.kronecker(A, B, do_shape_inference=False)

    tf.reset_default_graph()
    X0 = np.genfromtxt('data/large_rotations2_X0.csv', delimiter=",")
    Y0 = np.genfromtxt('data/large_rotations2_Y0.csv', delimiter=",")
    W0f = v2c_np(np.genfromtxt('data/large_rotations2_W0f.csv', delimiter=","))
    fs = np.genfromtxt('data/large_rotations2_fs.csv',
                       delimiter=",").astype(np.int32)
    n = len(fs) - 2  # number of layers

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = X0.shape[1]
    assert f(-1) == dsize

    # load W0f and do shape checks (can remove)
    W0s = u.unflatten_np(W0f,
                         fs[1:])  # Wf doesn't have first layer (data matrix)
    W0s.insert(0, X0)
    Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
    Wf = tf.Variable(Wf_holder, name="Wf")
    Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
    init_dict = {Wf_holder: W0f}

    # Create W's
    W = u.unflatten(Wf, fs[1:])
    X = tf.constant(X0)
    Y = tf.constant(Y0)
    W.insert(0, X)
    for (numpy_W, tf_W) in zip(W0s, W):
        u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

    # Create A's
    # A[1] == X
    A = [0] * (n + 2)
    A[0] = u.Identity(dsize)
    for i in range(n + 1):
        A[i + 1] = tf.matmul(W[i], A[i], name="A" + str(i + 1))

    assert W[0].get_shape() == X0.shape
    assert A[n + 1].get_shape() == X0.shape
    assert A[1].get_shape() == X0.shape

    err = Y - A[n + 1]
    loss = tf.reduce_sum(tf.square(err)) / (2 * dsize)
    lr = tf.Variable(0.1, dtype=dtype, name="learning_rate")

    # Create B's
    B = [0] * (n + 1)
    B[n] = -err / dsize
    Bn = [0] * (n + 1)  # Newton-modified backprop
    Bn[n] = u.Identity(f(n))
    for i in range(n - 1, -1, -1):
        B[i] = t(W[i + 1]) @ B[i + 1]
        Bn[i] = t(W[i + 1]) @ Bn[i + 1]

    # Create U's
    U = [list(range(n + 1)) for _ in range(n + 1)]
    for bottom in range(n + 1):
        for top in range(n + 1):
            if bottom > top:
                prod = u.Identity(f(top))
            else:
                prod = u.Identity(f(bottom - 1))
                for i in range(bottom, top + 1):
                    prod = prod @ t(W[i])
            U[bottom][top] = prod

    # Block i, j gives hessian block between layer i and layer j
    blocks = [list(range(n + 1)) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for j in range(1, n + 1):
            term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
            if i == j:
                term2 = tf.zeros((f(i) * f(i - 1), f(i) * f(i - 1)),
                                 dtype=dtype)
            elif i < j:
                term2 = kr(A[i] @ t(B[j]), U[i + 1][j - 1])
            else:
                term2 = kr(t(U[j + 1][i - 1]), B[i] @ t(A[j]))

            blocks[i][j] = term1 + term2 @ Kmat(f(j), f(j - 1))

    # remove leftmost blocks (those are with respect to W[0] which is input)
    del blocks[0]
    for row in blocks:
        del row[0]

    ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

    # create dW's
    dW = [0] * (n + 1)
    for i in range(n + 1):
        dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW" + str(i))
    del dW[0]  # get rid of W[0] update

    dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
    Wf_new = Wf - lr * ihess @ dWf

    train_op1 = Wf_copy.assign(Wf_new)
    train_op2 = Wf.assign(Wf_copy)

    observed_losses = []
    u.reset_time()
    for i in range(20):
        loss0 = sess.run([loss])[0]
        print(loss0)
        observed_losses.append(loss0)
        sess.run(train_op1)
        sess.run(train_op2)
        u.record_time()

    u.summarize_time()
    u.summarize_graph()
Example #25
def cost_and_grad(W0f=None,
                  fs=None,
                  lambda_=3e-3,
                  rho=0.1,
                  beta=3,
                  X0=None,
                  lr=0.1):
    """Construct sparse autoencoder loss and gradient.

  Args:
    W0f: initial value of weights (flattened representation)
    fs: list of sizes [dsize, visible, hidden, visible]
    lambda_: weight-decay coefficient
    rho: global feature sparsity target
    beta: weight on sparsity penalty
    X0: value of X (aka W[0])
    lr: learning rate

  Returns:
    cost, train_op
  """

    np.random.seed(0)
    tf.set_random_seed(0)
    dtype = np.float32

    if not fs:
        fs = [dsize, 28 * 28, 196, 28 * 28]
    if W0f is None:
        W0f = W_uniform(fs[2], fs[3])
    rho = tf.constant(rho, dtype=dtype)

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    dsize = f(-1)
    n = len(fs) - 2

    init_dict = {}

    def init_var(val, name, trainable=True):
        holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
        var = tf.Variable(holder, name=name + "_var", trainable=trainable)
        init_dict[holder] = val
        return var

    Wf = init_var(W0f, "Wf")
    Wf_copy = init_var(W0f, "Wf_copy")
    W = u.unflatten(Wf, fs[1:])
    X = init_var(X0, "X", False)
    W.insert(0, X)

    def sigmoid(x):
        return tf.sigmoid(x)

    def d_sigmoid(y):
        return y * (1 - y)

    def kl(x, y):
        return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

    def d_kl(x, y):
        return (1 - x) / (1 - y) - x / y

    # A[i] = activations needed to compute gradient of W[i]
    # A[n+1] = network output
    A = [None] * (n + 2)
    A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    B = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        if i == 1:
            backprop += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    for i in range(n + 1):
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Cost function
    reconstruction = u.L2(err) / (2 * dsize)
    sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
    L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))  # weight decay on both weight matrices
    cost = reconstruction + sparsity + L2

    grad = u.flatten(dW[1:])
    copy_op = Wf_copy.assign(Wf - lr * grad)
    with tf.control_dependencies([copy_op]):
        train_op = Wf.assign(Wf_copy)

    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
    return cost, train_op
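
The sparsity term uses the Bernoulli KL divergence kl(rho, rho_hat) with d_kl as its derivative in rho_hat. A quick NumPy finite-difference check that the two definitions above are consistent:

import numpy as np

def kl(x, y):
    return x * np.log(x / y) + (1 - x) * np.log((1 - x) / (1 - y))

def d_kl(x, y):
    return (1 - x) / (1 - y) - x / y

rho, rho_hat, eps = 0.1, 0.3, 1e-6
numeric = (kl(rho, rho_hat + eps) - kl(rho, rho_hat - eps)) / (2 * eps)
assert abs(numeric - d_kl(rho, rho_hat)) < 1e-8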
Example #26
def model_creator(batch_size, name='defaultmodel', dtype=np.float32):
    """Create MNIST autoencoder model. Dataset is part of model."""

    global hack_global_init_dict

    model = Model(name)

    # TODO: actually use batch_size
    init_dict = {}  # todo: rename to feed_dict?
    global_vars = []
    local_vars = []

    # TODO: rename to make_var
    def init_var(val, name, is_global=False):
        """Helper to create variables with numpy or TF initial values."""
        if isinstance(val, tf.Tensor):
            var = u.get_variable(name=name, initializer=val, reuse=is_global)
        else:
            val = np.array(val)
            assert u.is_numeric(val), "Non-numeric type."

            var_struct = u.get_var(name=name, initializer=val, reuse=is_global)
            holder = var_struct.val_
            init_dict[holder] = val
            var = var_struct.var

        if is_global:
            global_vars.append(var)
        else:
            local_vars.append(var)

        return var

    # TODO: get rid of purely_relu
    def nonlin(x):
        if purely_relu:
            return tf.nn.relu(x)
        elif purely_linear:
            return tf.identity(x)
        else:
            return tf.sigmoid(x)

    # TODO: rename into "nonlin_d"
    def d_nonlin(y):
        if purely_relu:
            return u.relu_mask(y)
        elif purely_linear:
            return 1
        else:
            return y * (1 - y)

    train_images = load_MNIST.load_MNIST_images(
        'data/train-images-idx3-ubyte').astype(dtype)
    patches = train_images[:, :batch_size]
    fs = [batch_size, 28 * 28, 196, 28 * 28]

    def f(i):
        return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

    n = len(fs) - 2

    X = init_var(patches, "X", is_global=False)
    W = [None] * n
    W.insert(0, X)
    A = [None] * (n + 2)
    A[1] = W[0]
    W0f_old = W_uniform(fs[2],
                        fs[3]).astype(dtype)  # to match previous generation
    W0s_old = u.unflatten(W0f_old, fs[1:])  # perftodo: this creates transposes
    for i in range(1, n + 1):
        #    temp = init_var(ng_init(f(i), f(i-1)), "W_%d"%(i,), is_global=True)
        #    init_val1 = W0s_old[i-1]
        init_val = ng_init(f(i), f(i - 1)).astype(dtype)
        W[i] = init_var(init_val, "W_%d" % (i, ), is_global=True)
        A[i + 1] = nonlin(kfac_lib.matmul(W[i], A[i]))

    err = A[n + 1] - A[1]

    # manually compute backprop to use for sanity checking
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_nonlin(A[n + 1])
    _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
    if use_fixed_labels:
        _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)

    _sampled_labels = init_var(_sampled_labels_live,
                               "to_be_deleted",
                               is_global=False)

    B2[n] = _sampled_labels * d_nonlin(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        B[i] = backprop * d_nonlin(A[i + 1])
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        B2[i] = backprop2 * d_nonlin(A[i + 1])

    cov_A = [None] * (n + 1)  # covariance of activations[i]
    cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
    vars_svd_A = [None] * (n + 1)
    vars_svd_B2 = [None] * (n + 1)
    dW = [None] * (n + 1)
    dW2 = [None] * (n + 1)
    pre_dW = [None] * (n + 1)  # preconditioned dW
    for i in range(1, n + 1):
        if regularized_svd:
            cov_A[i] = init_var(
                A[i] @ t(A[i]) / batch_size + LAMBDA * u.Identity(f(i - 1)),
                "cov_A%d" % (i, ))
            cov_B2[i] = init_var(
                B2[i] @ t(B2[i]) / batch_size + LAMBDA * u.Identity(f(i)),
                "cov_B2%d" % (i, ))
        else:
            cov_A[i] = init_var(A[i] @ t(A[i]) / batch_size, "cov_A%d" % (i, ))
            cov_B2[i] = init_var(B2[i] @ t(B2[i]) / batch_size,
                                 "cov_B2%d" % (i, ))
        vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i, ))
        vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i, ))
        if use_tikhonov:
            whitened_A = u.regularized_inverse3(vars_svd_A[i], L=LAMBDA) @ A[i]
            whitened_B2 = u.regularized_inverse3(vars_svd_B2[i],
                                                 L=LAMBDA) @ B[i]
        else:
            whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
            whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]

        dW[i] = (B[i] @ t(A[i])) / batch_size
        dW2[i] = B[i] @ t(A[i])
        pre_dW[i] = (whitened_B2 @ t(whitened_A)) / batch_size

        #  model.extra['A'] = A
        #  model.extra['B'] = B
        #  model.extra['B2'] = B2
        #  model.extra['cov_A'] = cov_A
        #  model.extra['cov_B2'] = cov_B2
        #  model.extra['vars_svd_A'] = vars_svd_A
        #  model.extra['vars_svd_B2'] = vars_svd_B2
        #  model.extra['W'] = W
        #  model.extra['dW'] = dW
        #  model.extra['dW2'] = dW2
        #  model.extra['pre_dW'] = pre_dW

    model.loss = u.L2(err) / (2 * batch_size)
    sampled_labels_live = A[n + 1] + tf.random_normal(
        (f(n), f(-1)), dtype=dtype, seed=0)
    if use_fixed_labels:
        sampled_labels_live = A[n + 1] + tf.ones(shape=(f(n), f(-1)),
                                                 dtype=dtype)
    sampled_labels = init_var(sampled_labels_live,
                              "sampled_labels",
                              is_global=False)
    err2 = A[n + 1] - sampled_labels
    model.loss2 = u.L2(err2) / (2 * batch_size)
    model.global_vars = global_vars
    model.local_vars = local_vars
    model.trainable_vars = W[1:]

    def advance_batch():
        sess = tf.get_default_session()
        # TODO: get rid of _sampled_labels
        sess.run([sampled_labels.initializer, _sampled_labels.initializer])

    model.advance_batch = advance_batch

    global_init_op = tf.group(*[v.initializer for v in global_vars])

    def initialize_global_vars():
        sess = tf.get_default_session()
        sess.run(global_init_op, feed_dict=init_dict)

    model.initialize_global_vars = initialize_global_vars

    local_init_op = tf.group(*[v.initializer for v in local_vars])

    def initialize_local_vars():
        sess = tf.get_default_session()
        sess.run(X.initializer, feed_dict=init_dict)  # A's depend on X
        sess.run(_sampled_labels.initializer, feed_dict=init_dict)
        sess.run(local_init_op, feed_dict=init_dict)

    model.initialize_local_vars = initialize_local_vars

    hack_global_init_dict = init_dict

    return model
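
The use_tikhonov branch above damps each covariance before inverting, (C + λI)⁻¹, rather than taking a truncated pseudo-inverse; the damping caps the spectrum of the whitening matrix at 1/λ. A small NumPy illustration (the value of LAMBDA here is only illustrative):

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(6, 4)                         # 6 units, only 4 samples -> rank-deficient covariance
C = A @ A.T / A.shape[1]
LAMBDA = 1e-2                               # illustrative damping strength

tikhonov_inv = np.linalg.inv(C + LAMBDA * np.eye(6))    # damped (Tikhonov) inverse
truncated_pinv = np.linalg.pinv(C)                      # truncated pseudo-inverse

# eigenvalues of the damped inverse are 1/(eig_i + LAMBDA), so none exceeds 1/LAMBDA
print(np.linalg.eigvalsh(tikhonov_inv).max(), 1 / LAMBDA)
print(np.linalg.eigvalsh(truncated_pinv).max())         # 1 / (smallest retained eigenvalue of C)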
Example #27
def simple_newton_kfac_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]
    
  # inverse Hessian blocks
  iblocks = u.empty_grid(n+1, n+1)
  for i in range(1, n+1):
    for j in range(1, n+1):
      # reuse Hess tensor calculation in order to get off-diag block sizes
      dummy_term = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize;
      if i == j:
        acov = A[i] @ t(A[j])
        bcov = Bn[i] @ t(Bn[j]) / dsize;
        term = kr(u.pseudo_inverse(acov), u.pseudo_inverse(bcov))
      else:
        term = tf.zeros(shape=dummy_term.get_shape(), dtype=dtype)
      iblocks[i][j]=term
        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del iblocks[0]
  for row in iblocks:
    del row[0]
    
  ihess = u.concat_blocks(iblocks)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonkfac_losses.csv",
                               delimiter= ",")
  observed_losses = []

  # from accompanying notebook
  #  {0.0111498, 0.0000171591, 4.11445*10^-11, 2.33653*10^-22, 
  # 6.88354*10^-33,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
Example #28
def simple_newton_bd_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}
  
  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.5, dtype=dtype, name="learning_rate")
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  Bn = [0]*(n+1)            # Newton-modified backprop
  Bn[n] = u.Identity(f(n))
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    Bn[i] = t(W[i+1]) @ Bn[i+1]

  # Create U's
  U = [list(range(n+1)) for _ in range(n+1)]
  for bottom in range(n+1):
    for top in range(n+1):
      if bottom > top:
        prod = u.Identity(f(top))
      else:
        prod = u.Identity(f(bottom-1))
        for i in range(bottom, top+1):
          prod = prod@t(W[i])
      U[bottom][top] = prod

  # Block i, j gives hessian block between layer i and layer j
  blocks = [list(range(n+1)) for _ in range(n+1)]
  for i in range(1, n+1):
    for j in range(1, n+1):
      term1 = kr(A[i] @ t(A[j]), Bn[i] @ t(Bn[j])) / dsize
      if i == j:
        term2 = tf.zeros((f(i)*f(i-1), f(i)*f(i-1)), dtype=dtype)
      elif i < j:
        term2 = kr(A[i] @ t(B[j]), U[i+1][j-1])
      else:
        term2 = kr(t(U[j+1][i-1]), B[i] @ t(A[j]))
        
      blocks[i][j]=term1 + term2 @ Kmat(f(j), f(j-1))

        
  # remove leftmost blocks (those are with respect to W[0] which is input)
  del blocks[0]
  for row in blocks:
    del row[0]
    
  #hess = u.concat_blocks(blocks)
  ihess = u.concat_blocks(u.block_diagonal_inverse(blocks))
  #  ihess = u.pseudo_inverse(hess)
  
  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * ihess @ dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  
  expected_losses = np.loadtxt("data/rotations_simple_newtonbd_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # from accompanying notebook
  # 0.0111498, 0.0000171591, 4.11445*10^-11, 2.33652*10^-22, 
  # 1.21455*10^-32,
 
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
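u.block_diagonal_inverse is not shown in this listing; the name suggests it inverts each diagonal block and zeroes the cross-layer blocks, which is the block-diagonal approximation to the Newton step used here. A rough NumPy stand-in under that assumption (the function and its behavior are inferred, not taken from the file):

import numpy as np

def block_diagonal_inverse(blocks):
    """Pseudo-invert diagonal blocks, zero out off-diagonal blocks (assumed behavior)."""
    out = [[None] * len(row) for row in blocks]
    for i, row in enumerate(blocks):
        for j, blk in enumerate(row):
            if i == j:
                out[i][j] = np.linalg.pinv(blk)   # keep within-layer curvature
            else:
                out[i][j] = np.zeros_like(blk)    # drop cross-layer curvature
    return out

rng = np.random.RandomState(0)
hess_blocks = [[rng.randn(2, 2) for _ in range(2)] for _ in range(2)]
ihess_bd = np.block(block_diagonal_inverse(hess_blocks))   # analogue of u.concat_blocks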
Beispiel #29
0
def relu_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_relu_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [4,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0, name="X0")
  Y = tf.constant(Y0, name="Y0")
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    if i == 0:
      A[i+1] = X
    else:
      A[i+1] = tf.nn.relu(tf.matmul(W[i], A[i], name="A"+str(i+1)))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(0.1, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = (-err/dsize)*u.relu_mask(A[n+1])
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]
    if i > 0:  # there's no relu on first matrix
      B[i] = B[i]*u.relu_mask(A[i+1])

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_relu_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  
  # From accompanying notebook
  #  {0.407751, 0.0683822, 0.0138657, 0.0039221, 0.00203637, 0.00164892,
  #    0.00156137, 0.00153857, 0.00153051, 0.00152593}
  for i in range(10):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
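u.relu_mask is also not defined in this snippet; presumably it returns an elementwise indicator of the positive entries, so multiplying by it zeroes the backprop wherever the relu was inactive. A small NumPy sanity check of that manual relu backprop against a central finite difference (the helper here is an assumption about u.relu_mask, not its actual code):

import numpy as np

def relu_mask(y):
    """1.0 where the relu output is positive, 0.0 elsewhere (assumed semantics)."""
    return (y > 0).astype(y.dtype)

rng = np.random.RandomState(0)
x = rng.randn(4, 5)
g = rng.randn(4, 5)                        # upstream gradient
manual = g * relu_mask(np.maximum(x, 0))   # backprop through relu with the mask

eps = 1e-6
numeric = g * (np.maximum(x + eps, 0) - np.maximum(x - eps, 0)) / (2 * eps)
assert np.allclose(manual, numeric, atol=1e-5)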
Beispiel #30
0
def model_creator(batch_size, name="default", dtype=np.float32):
  """Create MNIST autoencoder model. Dataset is part of model."""

  model = Model(name)

  def get_batch_size(data):
    if isinstance(data, IndexedGrad):
      return int(data.live[0].shape[1])
    else:
      return int(data.shape[1])

  init_dict = {}
  global_vars = []
  local_vars = []
  
  # TODO: factor out to reuse between scripts
  # TODO: change feed_dict logic to reuse value provided to VarStruct
  # the current situation makes reinitializing a global variable change
  # its value, which is counterintuitive
  def init_var(val, name, is_global=False):
    """Helper to create variables with numpy or TF initial values."""
    if isinstance(val, tf.Tensor):
      var = u.get_variable(name=name, initializer=val, reuse=is_global)
    else:
      val = np.array(val)
      assert u.is_numeric(val), "Non-numeric type."
      
      var_struct = u.get_var(name=name, initializer=val, reuse=is_global)
      holder = var_struct.val_
      init_dict[holder] = val
      var = var_struct.var

    if is_global:
      global_vars.append(var)
    else:
      local_vars.append(var)
      
    return var

  # TODO: get rid of purely_relu
  def nonlin(x):
    if purely_relu:
      return tf.nn.relu(x)
    elif purely_linear:
      return tf.identity(x)
    else:
      return tf.sigmoid(x)

  # TODO: rename into "nonlin_d"
  def d_nonlin(y):
    if purely_relu:
      return u.relu_mask(y)
    elif purely_linear:
      return 1
    else: 
      return y*(1-y)

  patches = train_images[:,:args.batch_size]
  test_patches = test_images[:,:args.batch_size]

  if args.dataset == 'cifar':
    input_dim = 3*32*32
  elif args.dataset == 'mnist':
    input_dim = 28*28
  else:
    assert False
  fs = [args.batch_size, input_dim, 1024, 1024, 1024, 196, 1024, 1024, 1024,
        input_dim]
    
  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  n = len(fs) - 2

  # Full dataset from which new batches are sampled
  X_full = init_var(train_images, "X_full", is_global=True)

  X = init_var(patches, "X", is_global=False)  # stores local batch per model
  W = [None]*n
  W.insert(0, X)
  A = [None]*(n+2)
  A[1] = W[0]
  for i in range(1, n+1):
    init_val = ng_init(f(i), f(i-1)).astype(dtype)
    W[i] = init_var(init_val, "W_%d"%(i,), is_global=True)
    A[i+1] = nonlin(kfac_lib.matmul(W[i], A[i]))
  err = A[n+1] - A[1]
  model.loss = u.L2(err) / (2 * get_batch_size(err))

  # create test error eval
  layer0 = init_var(test_patches, "X_test", is_global=True)
  layer = layer0
  for i in range(1, n+1):
    layer = nonlin(W[i] @ layer)
  verr = (layer - layer0)
  model.vloss = u.L2(verr) / (2 * get_batch_size(verr))

  # manually compute backprop to use for sanity checking
  B = [None]*(n+1)
  B2 = [None]*(n+1)
  B[n] = err*d_nonlin(A[n+1])
  _sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  if args.fixed_labels:
    _sampled_labels_live = tf.ones(shape=(f(n), f(-1)), dtype=dtype)
    
  _sampled_labels = init_var(_sampled_labels_live, "to_be_deleted",
                             is_global=False)

  B2[n] = _sampled_labels*d_nonlin(A[n+1])
  for i in range(n-1, -1, -1):
    backprop = t(W[i+1]) @ B[i+1]
    B[i] = backprop*d_nonlin(A[i+1])
    backprop2 = t(W[i+1]) @ B2[i+1]
    B2[i] = backprop2*d_nonlin(A[i+1])

  cov_A = [None]*(n+1)    # covariance of activations[i]
  cov_B2 = [None]*(n+1)   # covariance of synthetic backprops[i]
  vars_svd_A = [None]*(n+1)
  vars_svd_B2 = [None]*(n+1)
  dW = [None]*(n+1)
  dW2 = [None]*(n+1)
  pre_dW = [None]*(n+1)   # preconditioned dW
  # todo: decouple initial value from covariance update
  # maybe need start with identity and do running average
  for i in range(1,n+1):
    if regularized_svd:
      cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size+args.Lambda*u.Identity(f(i-1)), "cov_A%d"%(i,))
      cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size+args.Lambda*u.Identity(f(i)), "cov_B2%d"%(i,))
    else:
      cov_A[i] = init_var(A[i]@t(A[i])/args.batch_size, "cov_A%d"%(i,))
      cov_B2[i] = init_var(B2[i]@t(B2[i])/args.batch_size, "cov_B2%d"%(i,))
    vars_svd_A[i] = u.SvdWrapper(cov_A[i],"svd_A_%d"%(i,))
    vars_svd_B2[i] = u.SvdWrapper(cov_B2[i],"svd_B2_%d"%(i,))
    if use_tikhonov:
      whitened_A = u.regularized_inverse3(vars_svd_A[i],L=args.Lambda) @ A[i]
      whitened_B2 = u.regularized_inverse3(vars_svd_B2[i],L=args.Lambda) @ B[i]
    else:
      whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
      whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
    
    dW[i] = (B[i] @ t(A[i]))/args.batch_size
    dW2[i] = B[i] @ t(A[i])
    pre_dW[i] = (whitened_B2 @ t(whitened_A))/args.batch_size

    
  sampled_labels_live = A[n+1] + tf.random_normal((f(n), f(-1)),
                                                  dtype=dtype, seed=0)
  if args.fixed_labels:
    sampled_labels_live = A[n+1]+tf.ones(shape=(f(n), f(-1)), dtype=dtype)
  sampled_labels = init_var(sampled_labels_live, "sampled_labels", is_global=False)
  err2 = A[n+1] - sampled_labels
  model.loss2 = u.L2(err2) / (2 * args.batch_size)
  model.global_vars = global_vars
  model.local_vars = local_vars
  model.trainable_vars = W[1:]

  # todo, we have 3 places where model step is tracked, reduce
  model.step = init_var(u.as_int32(0), "step", is_global=False)
  advance_step_op = model.step.assign_add(1)
  assert get_batch_size(X_full) % args.batch_size == 0
  batches_per_dataset = (get_batch_size(X_full) // args.batch_size)
  batch_idx = tf.mod(model.step, batches_per_dataset)
  start_idx = batch_idx * args.batch_size
  advance_batch_op = X.assign(X_full[:,start_idx:start_idx + args.batch_size])
  
  def advance_batch():
    print("Step for model(%s) is %s"%(model.name, u.eval(model.step)))
    sess = u.get_default_session()
    # TODO: get rid of _sampled_labels
    sessrun([sampled_labels.initializer, _sampled_labels.initializer])
    if args.advance_batch:
      with u.timeit("advance_batch"):
        sessrun(advance_batch_op)
    sessrun(advance_step_op)
    
  model.advance_batch = advance_batch

  # TODO: refactor this to take initial values out of Var struct
  #global_init_op = tf.group(*[v.initializer for v in global_vars])
  global_init_ops = [v.initializer for v in global_vars]
  global_init_op = tf.group(*[v.initializer for v in global_vars])
  global_init_query_ops = [tf.logical_not(tf.is_variable_initialized(v))
                           for v in global_vars]
  
  def initialize_global_vars(verbose=False, reinitialize=False):
    """If reinitialize is false, will not reinitialize variables already
    initialized."""
    
    sess = u.get_default_session()
    if not reinitialize:
      uninited = sessrun(global_init_query_ops)
      # use numpy boolean indexing to select list of initializers to run
      to_initialize = list(np.asarray(global_init_ops)[uninited])
    else:
      to_initialize = global_init_ops
      
    if verbose:
      print("Initializing following:")
      for v in to_initialize:
        print("   " + v.name)

    sessrun(to_initialize, feed_dict=init_dict)
  model.initialize_global_vars = initialize_global_vars

  # didn't quite work (likely can't initialize a var in the same run call as its deps)
  # enforce that the batch is initialized before everything
  # except the fake labels
  # for v in local_vars:
  #   if v != X and v != sampled_labels and v != _sampled_labels:
  #     print("Adding dep %s on %s"%(v.initializer.name, X.initializer.name))
  #     u.add_dep(v.initializer, on_op=X.initializer)
      
  local_init_op = tf.group(*[v.initializer for v in local_vars],
                           name="%s_localinit"%(model.name))
  print("Local vars:")
  for v in local_vars:
    print(v.name)
    
  def initialize_local_vars():
    sess = u.get_default_session()
    sessrun(_sampled_labels.initializer, feed_dict=init_dict)
    sessrun(X.initializer, feed_dict=init_dict)
    sessrun(local_init_op, feed_dict=init_dict)
  model.initialize_local_vars = initialize_local_vars

  return model
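The preconditioned update pre_dW[i] = (whitened_B2 @ t(whitened_A)) / batch_size built above is algebraically the usual K-FAC form cov_B2^{-1} dW cov_A^{-1}. A small NumPy sketch of that equivalence, with a plain ridge-regularized inverse standing in for the SVD-based u.pseudo_inverse2 / u.regularized_inverse3 used in the model:

import numpy as np

rng = np.random.RandomState(0)
dsize, f_in, f_out = 50, 4, 3
A  = rng.randn(f_in, dsize)     # activations feeding one layer
B  = rng.randn(f_out, dsize)    # true backprops for that layer
B2 = rng.randn(f_out, dsize)    # synthetic-label backprops (used only for curvature)

cov_A  = A  @ A.T  / dsize + 0.1 * np.eye(f_in)    # ridge keeps the toy factors invertible
cov_B2 = B2 @ B2.T / dsize + 0.1 * np.eye(f_out)

dW   = B @ A.T / dsize
pre1 = np.linalg.inv(cov_B2) @ dW @ np.linalg.inv(cov_A)                   # precondition the gradient
pre2 = (np.linalg.inv(cov_B2) @ B) @ (np.linalg.inv(cov_A) @ A).T / dsize  # whiten the factors first
assert np.allclose(pre1, pre2)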
Beispiel #31
0
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))

  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)

  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update

  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  lr_holder = tf.placeholder(dtype=dtype, shape=())
  lr = tf.Variable(lr_holder, dtype=dtype)

  # run tests
  do_run_iters = 5
  result = newton(1.0)
  expected_result = [8.9023744225439743e-05, 0.060120791316053412, 0.0059295249954177918, 1.9856240803246437e-05, 2.7125563957575423e-10]
Beispiel #32
0
    # A[n+1] = network output
    A = [None] * (n + 2)
    A[0] = u.Identity(dsize, dtype=dtype)
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    B = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        if i == 1:
            backprop += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    for i in range(n + 1):
        dW[i] = (B[i] @ t(A[i])) / dsize

    # Cost function
    reconstruction = u.L2(err) / (2 * dsize)
    sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
    L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))  # weight decay over both weight matrices
    cost = reconstruction + sparsity + L2
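kl, d_kl and d_sigmoid are not shown in this fragment. For a sparse autoencoder they are usually the Bernoulli KL penalty KL(rho || rho_hat) = rho*log(rho/rho_hat) + (1-rho)*log((1-rho)/(1-rho_hat)), its derivative with respect to rho_hat, and the sigmoid derivative expressed through the output as y*(1-y). A hedged sketch of the first two, checked against a finite difference (assumed definitions, not code from the file):

import numpy as np

def kl(rho, rho_hat):
    """Bernoulli KL divergence KL(rho || rho_hat), elementwise."""
    return (rho * np.log(rho / rho_hat)
            + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))

def d_kl(rho, rho_hat):
    """Derivative of kl() with respect to rho_hat."""
    return -rho / rho_hat + (1 - rho) / (1 - rho_hat)

rho, rho_hat, eps = 0.05, 0.2, 1e-6
numeric = (kl(rho, rho_hat + eps) - kl(rho, rho_hat - eps)) / (2 * eps)
assert np.isclose(d_kl(rho, rho_hat), numeric)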
Beispiel #33
0
    A[1] = W[0]
    for i in range(1, n + 1):
        A[i + 1] = sigmoid(W[i] @ A[i])

    # reconstruction error and sparsity error
    err = (A[3] - A[1])
    rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

    # B[i] = backprops needed to compute gradient of W[i]
    # B2[i] = synthetic backprops for natural gradient
    B = [None] * (n + 1)
    B2 = [None] * (n + 1)
    B[n] = err * d_sigmoid(A[n + 1])
    B2[n] = tf.random_normal((f(n), f(-1)), dtype=dtype) * d_sigmoid(A[n + 1])
    for i in range(n - 1, -1, -1):
        backprop = t(W[i + 1]) @ B[i + 1]
        backprop2 = t(W[i + 1]) @ B2[i + 1]
        if i == 1:
            backprop += beta * d_kl(rho, rho_hat)
            backprop2 += beta * d_kl(rho, rho_hat)
        B[i] = backprop * d_sigmoid(A[i + 1])
        B2[i] = backprop2 * d_sigmoid(A[i + 1])

    # dW[i] = gradient of W[i]
    dW = [None] * (n + 1)
    dW2 = [None] * (n + 1)
    Acov = [None] * (n + 1)
    Bcov = [None] * (n + 1)  # empirical covariances
    Bcov2 = [None] * (n + 1)  # natural gradient sampled covariances
    whitenA = [None] * (n + 1)
    whitenB = [None] * (n + 1)
Beispiel #34
0
    A[i+1] = sigmoid(W[i] @ A[i])
    
  # reconstruction error and sparsity error
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True)/dsize

  # B[i] = backprops needed to compute gradient of W[i]
  # B2[i] = backprops from sampled labels needed for natural gradient
  B = [None]*(n+1)
  B2 = [None]*(n+1)
  B[n] = err*d_sigmoid(A[n+1])
  sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
  B2[n] = sampled_labels*d_sigmoid(A[n+1])
  for i in range(n-1, -1, -1):
    backprop = t(W[i+1]) @ B[i+1]
    backprop2 = t(W[i+1]) @ B2[i+1]
    if i == 1 and not drop_sparsity:
      backprop += beta*d_kl(rho, rho_hat)
      backprop2 += beta*d_kl(rho, rho_hat)
    B[i] = backprop*d_sigmoid(A[i+1])
    B2[i] = backprop2*d_sigmoid(A[i+1])

  # dW[i] = gradient of W[i]
  dW = [None]*(n+1)
  pre_dW = [None]*(n+1)  # preconditioned dW
  pre_dW_stable = [None]*(n+1)  # preconditioned stable dW

  cov_A = [None]*(n+1)    # covariance of activations[i]
  cov_B2 = [None]*(n+1)   # covariance of synthetic backprops[i]
  vars_svd_A = [None]*(n+1)
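pre_dW_stable above (and pseudo_inverse_sqrt2 in the optimizer example further down) point at the same trick: when the preconditioned step becomes unstable, use F^{-1/2} instead of F^{-1} for each covariance factor. A minimal eigendecomposition sketch of the two operators on a toy SPD matrix (illustrative only, not the library code):

import numpy as np

rng = np.random.RandomState(0)
M = rng.randn(4, 4)
cov = M @ M.T / 4 + 0.01 * np.eye(4)        # SPD stand-in for a covariance factor

evals, evecs = np.linalg.eigh(cov)
inv      = evecs @ np.diag(1.0 / evals) @ evecs.T            # full inverse factor
inv_sqrt = evecs @ np.diag(1.0 / np.sqrt(evals)) @ evecs.T   # inverse square-root factor

assert np.allclose(inv_sqrt @ inv_sqrt, inv)   # (F^{-1/2})^2 == F^{-1}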
Beispiel #35
0
def kfac_optimizer(model_creator):
    stats_batch_size = 10000
    main_batch_size = 10000

    stats_model, loss, labels = model_creator(stats_batch_size)
    # replace labels_node with synthetic labels

    main_model, _, _ = model_creator(main_batch_size)

    opt = tf.train.GradientDescentOptimizer(0.2)
    grads_and_vars = opt.compute_gradients(loss)

    trainable_vars = tf.trainable_variables()

    # create SVD and preconditioning variables for matmul vars
    for var in trainable_vars:
        if var not in matmul_registry:
            continue
        dW = u.extract_grad(grads_and_vars, var)
        A[var] = get_activations(var)
        B[var] = get_backprops(var)
        B2[var] = get_backprops2(var)  # get backprops with synthetic labels
        dW[var] = B[var] @ t(A[var])  # todo: sort out dsize division
        cov_A[var] = init_var(A[var] @ t(A[var]) / dsize,
                              "cov_A_%s" % (var.name, ))
        cov_B2[var] = init_var(B2[var] @ t(B2[var]) / dsize,
                               "cov_B2_%s" % (var.name, ))

        vars_svd_A[var] = SvdWrapper(cov_A[var], "svd_A_%s" % (var.name, ))
        vars_svd_B2[var] = SvdWrapper(cov_B2[var], "svd_B2_%s" % (var.name, ))
        whitened_A = u.pseudo_inverse2(vars_svd_A[var]) @ A[var]
        whitened_B2 = u.pseudo_inverse2(vars_svd_B2[var]) @ B[var]
        whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[var]) @ A[var]
        whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[var]) @ B[var]

        pre_dW[var] = (whitened_B2 @ t(whitened_A)) / dsize
        pre_dW_stable[var] = (
            whitened_B2_stable @ t(whitened_A_stable)) / dsize
        dW[var] = (B[var] @ t(A[var])) / dsize

    # create update params ops

    # new_grads_and_vars = []
    # for grad, var in grads_and_vars:
    #   if var in kfac_registry:
    #     pre_A, pre_B = kfac_registry[var]
    #     new_grad_live = pre_B @ grad @ t(pre_A)
    #     new_grads_and_vars.append((new_grad, var))
    #     print("Preconditioning %s"%(var.name))
    #   else:
    #     new_grads_and_vars.append((grad, var))
    # train_op = opt.apply_gradients(new_grads_and_vars)

    # Each variable has an associated gradient, pre_gradient, variable save op
    def update_grad():
        ops = [grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad():
        ops = [pre_grad_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def update_pre_grad2():
        ops = [pre_grad2_update_ops[var] for var in trainable_vars]
        sess.run(ops)

    def save_params():
        ops = [var_save_ops[var] for var in trainable_vars]
        sess.run(ops)

    for step in range(num_steps):
        update_covariances()
        if step % whitened_every_n_steps == 0:
            update_svds()

        update_grad()
        update_pre_grad()  # perf todo: update one of these
        update_pre_grad2()  # stable alternative

        lr0, loss0 = sess.run([lr, loss])
        save_params()

        # when grad norm<1, Fisher is unstable, switch to Sqrt(Fisher)
        # TODO: switch to per-matrix normalization
        stabilized_mode = grad_norm.eval() < 1

        if stabilized_mode:
            update_params2()
        else:
            update_params()

        loss1 = loss.eval()
        advance_batch()

        # line search stuff
        target_slope = (-pre_grad_dot_grad.eval() if stabilized_mode else
                        -pre_grad_stable_dot_grad.eval())
        target_delta = lr0 * target_slope
        actual_delta = loss1 - loss0
        actual_slope = actual_delta / lr0
        slope_ratio = actual_slope / target_slope  # between 0 and 1.01

        losses.append(loss0)
        step_lengths.append(lr0)
        ratios.append(slope_ratio)

        if step % report_frequency == 0:
            print(
                "Step %d loss %.2f, target decrease %.3f, actual decrease, %.3f ratio %.2f grad norm: %.2f pregrad norm: %.2f"
                % (step, loss0, target_delta, actual_delta, slope_ratio,
                   grad_norm.eval(), pre_grad_norm.eval()))

        u.record_time()
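The slope-ratio bookkeeping in the loop above is a first-order sanity check: for a step W ← W - lr·p the predicted change in loss is -lr·(p·g), and comparing the realized change against that prediction gives a ratio near 1 when the local model is trustworthy and near 0 when the step overshoots. A toy NumPy illustration on a quadratic loss (all names here are stand-ins, not the tensors from the code above):

import numpy as np

rng = np.random.RandomState(0)
H = np.diag([1.0, 10.0])                 # quadratic loss 0.5 * w' H w
w = rng.randn(2)

def loss(w): return 0.5 * w @ H @ w

g = H @ w                                # gradient at w
p = g                                    # plain gradient direction (no preconditioning)

for lr in (0.01, 0.19):                  # small step vs. nearly unstable step
    target_delta = -lr * (p @ g)         # first-order predicted decrease
    actual_delta = loss(w - lr * p) - loss(w)
    print("lr=%.2f  slope ratio=%.2f" % (lr, actual_delta / target_delta))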
Beispiel #36
0
def simple_gradient_test():
  tf.reset_default_graph()
  X0 = np.genfromtxt('data/rotations_simple_X0.csv',
                     delimiter= ",")
  Y0 = np.genfromtxt('data/rotations_simple_Y0.csv',
                     delimiter= ",")
  W0f = v2c_np(np.genfromtxt('data/rotations_simple_W0f.csv',
                            delimiter= ","))
  assert W0f.shape == (8, 1)
  
  fs = np.genfromtxt('data/rotations_simple_fs.csv',
                      delimiter= ",").astype(np.int32)
  n = len(fs)-2    # number of layers
  u.check_equal(fs, [10,2,2,2])

  def f(i): return fs[i+1]  # W[i] has shape f[i] x f[i-1]
  dsize = X0.shape[1]
  assert f(-1) == dsize
  
  # load W0f and do shape checks (can remove)
  W0s = u.unflatten_np(W0f, fs[1:])  # Wf doesn't have first layer (data matrix)
  W0s.insert(0, X0)
  Wf_holder = tf.placeholder(dtype, shape=W0f.shape)
  Wf = tf.Variable(Wf_holder, name="Wf")
  Wf_copy = tf.Variable(Wf_holder, name="Wf_copy")
  init_dict = {Wf_holder: W0f}

  # Create W's
  W = u.unflatten(Wf, fs[1:])
  X = tf.constant(X0)
  Y = tf.constant(Y0)
  W.insert(0, X)
  for (numpy_W, tf_W) in zip(W0s, W):
    u.check_equal(numpy_W.shape, u.fix_shape(tf_W.shape))

  # Create A's
  # A[1] == X
  A = [0]*(n+2)
  A[0] = u.Identity(dsize)
  for i in range(n+1):
    A[i+1] = tf.matmul(W[i], A[i], name="A"+str(i+1))


  assert W[0].get_shape() == X0.shape
  assert A[n+1].get_shape() == X0.shape
  assert A[1].get_shape() == X0.shape

  err = Y - A[n+1]
  loss = tf.reduce_sum(tf.square(err))/(2*dsize)
  lr = tf.Variable(1.0, dtype=dtype)
  
  # Create B's
  B = [0]*(n+1)
  B[n] = -err/dsize
  for i in range(n-1, -1, -1):
    B[i] = t(W[i+1]) @ B[i+1]

  # create dW's
  dW = [0]*(n+1)
  for i in range(n+1):
    dW[i] = tf.matmul(B[i], tf.transpose(A[i]), name="dW"+str(i))
  del dW[0]  # get rid of W[0] update
  
  dWf = tf.concat([u.vec(dWi) for dWi in dW], axis=0)
  Wf_new = Wf - lr * dWf 

  train_op1 = Wf_copy.assign(Wf_new)
  train_op2 = Wf.assign(Wf_copy)

  sess = tf.Session()
  sess.run(tf.global_variables_initializer(), feed_dict=init_dict)
  
  expected_losses = np.loadtxt("data/rotations_simple_gradient_losses.csv",
                               delimiter= ",")
  observed_losses = []
  # from accompanying notebook
  # {0.0111498, 0.00694816, 0.00429464, 0.00248228, 0.00159361,
  #  0.000957424, 0.000651653, 0.000423802, 0.000306749, 0.00021772,
  for i in range(20):
    observed_losses.append(sess.run([loss])[0])
    sess.run(train_op1)
    sess.run(train_op2)

  u.check_equal(observed_losses, expected_losses)
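The u.vec / u.unflatten pair used throughout these tests is just per-matrix vectorization followed by concatenation, so the flat parameter vector Wf and the list of per-layer matrices carry the same information. A small NumPy sketch of that round trip (the column-major ordering is an assumption about u.vec, chosen to match the column-vector loading via v2c_np above):

import numpy as np

def vec(W):
    """Stack the columns of W into a single column vector."""
    return W.reshape((-1, 1), order="F")

def unflatten(Wf, fs):
    """Split a flat vector back into matrices of shape fs[i+1] x fs[i]."""
    Ws, offset = [], 0
    for i in range(len(fs) - 1):
        rows, cols = fs[i + 1], fs[i]
        chunk = Wf[offset:offset + rows * cols]
        Ws.append(chunk.reshape((rows, cols), order="F"))
        offset += rows * cols
    return Ws

fs = [2, 2, 2]                                            # two 2x2 layers
Ws = [np.arange(4.0).reshape(2, 2) + 10 * i for i in range(2)]
Wf = np.concatenate([vec(W) for W in Ws], axis=0)         # analogue of building Wf / dWf
for W, W_round_trip in zip(Ws, unflatten(Wf, fs)):
    assert np.allclose(W, W_round_trip)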
Beispiel #37
0
  A = [None]*(n+2)
  A[0] = u.Identity(dsize, dtype=dtype)
  A[1] = W[0]
  for i in range(1, n+1):
    A[i+1] = sigmoid(W[i] @ A[i])
    

  # reconstruction error and sparsity error
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True)/dsize

  # B[i] = backprops needed to compute gradient of W[i]
  B = [None]*(n+1)
  B[n] = err*d_sigmoid(A[n+1])
  for i in range(n-1, -1, -1):
    backprop = t(W[i+1]) @ B[i+1]
    if i == 1:
      backprop += beta*d_kl(rho, rho_hat)
    B[i] = backprop*d_sigmoid(A[i+1])

  # dW[i] = gradient of W[i]
  dW = [None]*(n+1)
  for i in range(n+1):
    dW[i] = (B[i] @ t(A[i]))/dsize

  # Cost function
  reconstruction = u.L2(err) / (2 * dsize)
  sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
  L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))  # weight decay over both weight matrices
  cost = reconstruction + sparsity + L2
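The manual backprops B[i] and gradients dW[i] in these sparse-autoencoder fragments follow the same pattern and can be cross-checked numerically. Below is a self-contained NumPy gradient check for the reconstruction term only, on a tiny sigmoid autoencoder with made-up shapes; it mirrors the convention B[n] = err * d_sigmoid(A[n+1]), dW[i] = B[i] @ A[i].T / dsize, but is not code from this file:

import numpy as np

def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
d, h, m = 3, 2, 5                 # visible size, hidden size, batch size
X  = rng.randn(d, m)
W1 = rng.randn(h, d) * 0.5
W2 = rng.randn(d, h) * 0.5

def forward(W1, W2):
    A1 = X
    A2 = sigmoid(W1 @ A1)
    A3 = sigmoid(W2 @ A2)
    return A1, A2, A3

def recon_loss(W1, W2):
    A1, _, A3 = forward(W1, W2)
    return np.sum((A3 - A1) ** 2) / (2 * m)

# manual backprop in the style of the snippets above
A1, A2, A3 = forward(W1, W2)
err = A3 - A1
B2 = err * A3 * (1 - A3)          # d_sigmoid expressed through the output
B1 = (W2.T @ B2) * A2 * (1 - A2)
dW1 = B1 @ A1.T / m

# central finite difference on one entry of W1
eps = 1e-6
W1p = W1.copy(); W1p[0, 0] += eps
W1n = W1.copy(); W1n[0, 0] -= eps
numeric = (recon_loss(W1p, W2) - recon_loss(W1n, W2)) / (2 * eps)
assert np.isclose(dW1[0, 0], numeric)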