def adam_minimax(grad_both, init_params_max, init_params_min, callback=None, num_iters=100,
         step_size_max=0.001, step_size_min=0.001, b1=0.9, b2=0.999, eps=10**-8):
    """Adam modified to do minimiax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback: callback(unflatten_max(x_max), unflatten_min(x_min), i,
                              unflatten_max(g_max), unflatten_min(g_min))

        m_max = (1 - b1) * g_max      + b1 * m_max  # First  moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))    # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        m_min = (1 - b1) * g_min      + b1 * m_min  # First  moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))    # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)
    return unflatten_max(x_max), unflatten_min(x_min)
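
A minimal usage sketch (not part of the original example): grad_both is assembled from two autograd.grad calls over a hypothetical toy saddle-point objective, and flatten is assumed to come from autograd.misc.flatten (autograd.util.flatten in older releases).

import autograd.numpy as np
from autograd import grad
from autograd.misc.flatten import flatten  # autograd.util.flatten in older autograd

def toy_objective(params_max, params_min, i):
    # Hypothetical bilinear saddle problem: maximize over params_max, minimize over params_min.
    return np.dot(params_max, params_min)

grad_max = grad(toy_objective, 0)
grad_min = grad(toy_objective, 1)
grad_both = lambda p_max, p_min, i: (grad_max(p_max, p_min, i),
                                     grad_min(p_max, p_min, i))

p_max, p_min = adam_minimax(grad_both, np.ones(3), np.ones(3), num_iters=200)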
Example #2
def adam_minmin(grad_both, init_params_nn, init_params_nn2, callback=None,
                num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8):
  """Adam applied jointly to two parameter sets, minimizing with respect to both."""
  x_nn, unflatten_nn = flatten(init_params_nn)
  x_nn2, unflatten_nn2 = flatten(init_params_nn2)

  m_nn, v_nn = np.zeros(len(x_nn)), np.zeros(len(x_nn))
  m_nn2, v_nn2 = np.zeros(len(x_nn2)), np.zeros(len(x_nn2))
  for i in range(num_iters):
    g_nn_uf, g_nn2_uf = grad_both(unflatten_nn(x_nn), unflatten_nn2(x_nn2), i)
    g_nn, _ = flatten(g_nn_uf)
    g_nn2, _ = flatten(g_nn2_uf)

    if callback: 
      callback(unflatten_nn(x_nn), unflatten_nn2(x_nn2), i)
    
    step_size = exponential_decay(step_size)  # helper not defined in this snippet; see the sketch below

    # Update parameters
    m_nn = (1 - b1) * g_nn      + b1 * m_nn  # First  moment estimate.
    v_nn = (1 - b2) * (g_nn**2) + b2 * v_nn  # Second moment estimate.
    mhat_nn = m_nn / (1 - b1**(i + 1))    # Bias correction.
    vhat_nn = v_nn / (1 - b2**(i + 1))
    x_nn = x_nn - step_size * mhat_nn / (np.sqrt(vhat_nn) + eps)

    # Update parameters
    m_nn2 = (1 - b1) * g_nn2      + b1 * m_nn2  # First  moment estimate.
    v_nn2 = (1 - b2) * (g_nn2**2) + b2 * v_nn2  # Second moment estimate.
    mhat_nn2 = m_nn2 / (1 - b1**(i + 1))    # Bias correction.
    vhat_nn2 = v_nn2 / (1 - b2**(i + 1))
    x_nn2 = x_nn2 - step_size * mhat_nn2 / (np.sqrt(vhat_nn2) + eps)
  return unflatten_nn(x_nn), unflatten_nn2(x_nn2)
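
adam_minmin above calls an exponential_decay helper that is not shown in this snippet. A plausible stand-in, modeled on the decay helpers defined inside the later adam_minimax examples (an assumption, not the original definition):

def exponential_decay(step_size, floor=0.001, rate=0.999):
    # Geometrically shrink the step size until it reaches the floor.
    if step_size > floor:
        step_size *= rate
    return step_size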
Example #3
def test_flatten():
    val = (npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0,
                                                            npr.randn(2)]))
    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
    assert np.all(vect == vect_2)
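
For context, a small illustration (not from the original source) of the flatten/unflatten round trip this test exercises, assuming autograd's flatten:

import autograd.numpy as np
from autograd.misc.flatten import flatten  # autograd.util.flatten in older autograd

val = {'W': np.eye(2), 'b': np.arange(3.0)}
vect, unflatten = flatten(val)
print(vect.shape)            # (7,): one flat vector covering every leaf array
recovered = unflatten(vect)  # same nested structure and shapes as val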
Example #4
def flatmap(f, container):
    flatten = lambda lst: [item for sublst in lst for item in sublst]
    mappers = {
        np.ndarray: lambda f, arr: f(arr),
        list: lambda f, lst: flatten(map(f, lst)),
        dict: lambda f, dct: flatten(map(f, dct.values()))
    }
    return mappers[type(container)](f, container)
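
An illustrative call (not from the original source): flatmap maps f over the arrays in a list or dict container and concatenates the per-array results; applied to a bare ndarray it just calls f.

import autograd.numpy as np

params = {'W': np.ones((2, 3)), 'b': np.zeros(3)}
print(flatmap(lambda arr: [arr.size], params))  # [6, 3] (dict insertion order)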
def adam_minimax(grad_both,
                 init_params_max,
                 init_params_min,
                 callback=None,
                 num_iters=100,
                 step_size_max=0.001,
                 step_size_min=0.001,
                 b1=0.9,
                 b2=0.999,
                 eps=10**-8):
    """Adam modified to do minimiax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))
    ability = 0
    HANDICAP = 100
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))
            if i % 10 == 0:
                ability = objective(unflatten_max(x_max), unflatten_min(x_min),
                                    i)
        if ability < HANDICAP:
            m_max = (1 - b1) * g_max + b1 * m_max       # First moment estimate.
            v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
            mhat_max = m_max / (1 - b1**(i + 1))        # Bias correction.
            vhat_max = v_max / (1 - b2**(i + 1))
            x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)
        else:
            print('Skipping generator update because objective is too high')

        if ability > -HANDICAP:
            m_min = (1 - b1) * g_min + b1 * m_min       # First moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))        # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)
        else:
            print('Skipping discriminator update because objective is too low')
    return unflatten_max(x_max), unflatten_min(x_min)
def test_flatten_dict():
    val = {'k':  npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0]}

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
    assert np.all(vect == vect_2)
Example #7
def time_flatten():
    val = {'k':  npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
Example #8
def time_flatten():
    val = {
        'k': npr.random((4, 4)),
        'k2': npr.random((3, 3)),
        'k3': 3.0,
        'k4': [1.0, 4.0, 7.0, 9.0],
        'k5': np.array([4., 5., 6.]),
        'k6': np.array([[7., 8.], [9., 10.]])
    }

    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
Example #9
def make_gradfun(run_inference,
                 pgm_prior,
                 data,
                 batch_size,
                 num_samples,
                 natgrad_scale=1.,
                 callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, i):
        #Here nn_potentials are just the sufficient stats of the data
        x = get_batch(i)
        xxT = np.einsum('ij,ik->ijk', x, x)
        n = np.ones(x.shape[0]) if x.ndim == 2 else 1.
        nn_potentials = pack_dense(xxT, x, n, n)
        saved.stats, global_kl, local_kl = run_inference(
            pgm_prior, pgm_params, nn_potentials)
        return (-global_kl - num_batches * local_kl) / num_datapoints  #CHECK

    def gradfun(params, i):
        pgm_params = params
        val = -mc_elbo(pgm_params, i)
        pgm_natgrad = -natgrad_scale / num_datapoints * \
                      (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        #print(flat(pgm_prior), num_batches*flat(saved.stats), -flat(pgm_params))
        grad = unflat(pgm_natgrad)
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
def question4b3(m, train_x, train_y_integers):
    # Number of hidden units
    dims_hid = m
    # Compress all weights into one weight vector using autograd's flatten
    x_train, x_test, y_train_integers, y_test_integers = train_test_split(
        train_x, train_y_integers, test_size=0.2, train_size=0.8)
    y_train = np.zeros((x_train.shape[0], 4))
    y_train[np.arange(x_train.shape[0]), y_train_integers] = 1
    y_test = np.zeros((x_test.shape[0], 4))
    y_test[np.arange(x_test.shape[0]), y_test_integers] = 1

    W = np.random.randn(x_train.shape[1], dims_hid)
    b = np.random.randn(dims_hid)
    V = np.random.randn(dims_hid, 4)
    c = np.random.randn(4)

    all_weights = (W, b, V, c)
    weights, unflatten = flatten(all_weights)
    smooth_grad = 0

    for i in range(1000):
        weight_gradients, returned_values = grad_fun(weights, x_train, y_train,
                                                     unflatten)
        smooth_grad = (1 - momentum) * smooth_grad + momentum * weight_gradients
        weights = weights - epsilon * smooth_grad

    return mean_zero_one_loss(weights, x_test, y_test_integers, unflatten)
Example #11
def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        objective = lambda lr_params: \
            -mc_elbo(pgm_params, lr_params[0], lr_params[1], i)
        val, (loglike_grad, recogn_grad) = vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
        # the function autograd.misc.fixed_points.fixed_point
        pgm_natgrad = -natgrad_scale / num_datapoints * \
            (flat(pgm_prior) + num_batches*flat(saved.stats) - flat(pgm_params))
        grad = unflat(pgm_natgrad), loglike_grad, recogn_grad
        if callback: callback(i, val, params, grad)
        return grad

    return gradfun
def adam_minimax(grad_both,
                 init_params_max,
                 init_params_min,
                 callback=None,
                 num_iters=100,
                 step_size_max=0.001,
                 step_size_min=0.001,
                 b1=0.9,
                 b2=0.999,
                 eps=10**-8):
    """Adam modified to do minimiax optimization, for instance to help with
    training generative adversarial networks."""

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))
    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        m_max = (1 - b1) * g_max + b1 * m_max  # First  moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)
    return unflatten_max(x_max), unflatten_min(x_min)
    def __init__(self, params, predict, inputs, targets):
        """Construct a Model object given a prediction function."""
        self.__params = params
        self.__params_flat, self.unflatten_params = flatten(self.params)
        self.predict = predict
        self.inputs = inputs
        self.targets = targets

        self.gradient = autograd.grad(self.loss)
        self.hessian = autograd.hessian(self.loss)
        self.hess_dot_vec = autograd.hessian_vector_product(self.loss)
        self.grad_rayleigh = autograd.grad(self.rayleigh_quotient)
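
A compact sketch (illustrative, not the author's Model class) of the same autograd operators wired onto a standalone scalar loss:

import autograd
import autograd.numpy as np

loss = lambda w: np.sum(w ** 2) + np.sum(np.sin(w))
gradient = autograd.grad(loss)
hvp = autograd.hessian_vector_product(loss)

w = np.array([1.0, 2.0, 3.0])
print(gradient(w))          # dense gradient of the loss at w
print(hvp(w, np.ones(3)))   # Hessian-vector product without forming the Hessian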
Example #14
def unflatten_tracing():
    val = [
        npr.randn(4), [npr.randn(3, 4), 2.5], (), (2.0, [1.0,
                                                         npr.randn(2)])
    ]
    vect, unflatten = flatten(val)

    def f(vect):
        return unflatten(vect)

    flatten2, _ = make_vjp(f)(vect)
    assert np.all(vect == flatten2(val))
Example #15
def time_grad_flatten():
    val = {'k':  npr.random((4, 4)),
           'k2': npr.random((3, 3)),
           'k3': 3.0,
           'k4': [1.0, 4.0, 7.0, 9.0],
           'k5': np.array([4., 5., 6.]),
           'k6': np.array([[7., 8.], [9., 10.]])}

    vect, unflatten = flatten(val)
    def fun(vec):
        v = unflatten(vec)
        return np.sum(v['k5']) + np.sum(v['k6'])

    grad(fun)(vect)
Example #16
def time_grad_flatten():
    val = {
        'k': npr.random((4, 4)),
        'k2': npr.random((3, 3)),
        'k3': 3.0,
        'k4': [1.0, 4.0, 7.0, 9.0],
        'k5': np.array([4., 5., 6.]),
        'k6': np.array([[7., 8.], [9., 10.]])
    }

    vect, unflatten = flatten(val)

    def fun(vec):
        v = unflatten(vec)
        return np.sum(v['k5']) + np.sum(v['k6'])

    grad(fun)(vect)
Example #17
def init_params(scale, rs=npr.RandomState(0)):
    w = [None] * 4  # four (weights, biases) layer pairs, filled in below

    # LeNet:    20-50-500
    #           10-20-(320)-128-10
    w[0] = (scale * rs.randn(1, 10, 5, 5).astype(dtype),
            scale * rs.randn(1, 10, 1, 1).astype(dtype))
    w[1] = (scale * rs.randn(10, 20, 5, 5).astype(dtype),
            scale * rs.randn(1, 20, 1, 1).astype(dtype))
    w[2] = (scale * rs.randn(320, 128).astype(dtype),
            scale * rs.randn(128).astype(dtype))
    w[3] = (scale * rs.randn(128, 10).astype(dtype),
            scale * rs.randn(10).astype(dtype))

    t1, _ = flatten(w)
    print('[size]: ', t1.shape)
    return w
Example #18
def nnOneLayerTrainEntry():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    # Make inputs approximately zero mean (improves backprop optimization in the NN)
    train_x -= .5
    test_x -= .5

    # Number of output dimensions
    dims_out = 4
    # Number of hidden units
    dims_hid_list = [5, 40, 70]  #5
    # Learning rate
    epsilon = 0.0001
    # Momentum of gradients update
    momentum = 0.1
    # Number of epochs
    nEpochs = 1000  #10
    # Number of train examples
    nTrainSamples = train_x.shape[0]
    # Number of input dimensions
    dims_in = train_x.shape[1]

    # Convert integer labels to one-hot vectors
    # i.e. convert label 2 to 0, 0, 1, 0
    train_y = np.zeros((nTrainSamples, dims_out))
    train_y[np.arange(nTrainSamples), train_y_integers] = 1

    print("trainy shape: ", train_y.shape)

    assert momentum <= 1
    assert epsilon <= 1

    xnEpochsLst = range(1, nEpochs + 1, 1)
    yLossLst = []
    for dims_hid in dims_hid_list:
        trainStart = time.time() * 1000

        # Initializing weights
        W = np.random.randn(dims_in, dims_hid)
        b = np.random.randn(dims_hid)
        V = np.random.randn(dims_hid, dims_out)
        c = np.random.randn(dims_out)
        smooth_grad = 0
        # Compress all weights into one weight vector using autograd's flatten
        all_weights = (W, b, V, c)
        weights, unflatten = flatten(all_weights)
        yLossInns = []
        for epo in xnEpochsLst:  #range(0, nEpochs):
            smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
                epsilon, momentum, train_x, train_y, train_y_integers, weights,
                unflatten, smooth_grad)
            yLossInns.append(meanLogisticloss)
        yLossLst.append(yLossInns)
        #print ("YLossLsttttttt: ", yLossLst)
        print("NN time for different M: ", dims_hid,
              time.time() * 1000 - trainStart)
    labels = ["M = " + str(dims_hid) for dims_hid in dims_hid_list]
    #print('Train yLossInns =', xnEpochsLst, yLossLst)
    plotNN(xnEpochsLst, yLossLst, labels)
Example #19
def stratifyDataTrainTestNN():
    data = read_image_data()
    train_x = data[0]
    train_y_integers = data[1]
    test_x = data[2]

    # Make inputs approximately zero mean (improves backprop optimization in the NN)
    train_x -= .5
    test_x -= .5

    dims_out = 4

    xsplitTrain, xsplitTest, ysplitTrain_integer, ysplitTest_integer = train_test_split(
        train_x,
        train_y_integers,
        test_size=0.2,
        random_state=0,
        stratify=train_y_integers)

    dims_in = xsplitTrain.shape[1]
    nTrainSamples = xsplitTrain.shape[0]
    ysplitTrain = np.zeros((nTrainSamples, dims_out))
    ysplitTrain[np.arange(nTrainSamples), ysplitTrain_integer] = 1

    # Learning rate
    epsilon = 0.0001
    # Momentum of gradients update
    momentum = 0.1
    dims_hid_list = [5, 40, 70]
    nEpochs = 1000
    xnEpochsLst = range(1, nEpochs + 1, 1)

    smallestValidationError = 2**32
    bestParas = []
    best_dims_hid = 0
    for dims_hid in dims_hid_list:

        # Initializing weights
        W = np.random.randn(dims_in, dims_hid)
        b = np.random.randn(dims_hid)
        V = np.random.randn(dims_hid, dims_out)
        c = np.random.randn(dims_out)
        smooth_grad = 0
        # Compress all weights into one weight vector using autograd's flatten
        all_weights = (W, b, V, c)
        weights, unflatten = flatten(all_weights)
        meanZeroOneLoss = 0

        for epo in xnEpochsLst:  #range(0, nEpochs):
            smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
                epsilon, momentum, xsplitTrain, ysplitTrain,
                ysplitTrain_integer, weights, unflatten, smooth_grad)

        #get validation data set zero-one-loss-error
        zeroOnelossEach = mean_zero_one_loss(weights, xsplitTest,
                                             ysplitTest_integer, unflatten)
        print("zeroOnelossEach: ", zeroOnelossEach)
        if zeroOnelossEach < smallestValidationError:
            smallestValidationError = zeroOnelossEach
            bestParas = [weights, unflatten, smooth_grad]
            best_dims_hid = dims_hid

    print("smallestValidationError: ", smallestValidationError, "M = ",
          best_dims_hid)
    #train whole data
    nTrainSamples = train_x.shape[0]
    train_y = np.zeros((nTrainSamples, dims_out))
    train_y[np.arange(nTrainSamples), train_y_integers] = 1

    weights = bestParas[0]
    unflatten = bestParas[1]
    smooth_grad = bestParas[2]

    smooth_grad, weights, meanLogisticloss, meanZeroOneLoss = trainNN(
        epsilon, momentum, train_x, train_y, train_y_integers, weights,
        unflatten, smooth_grad)

    fileTestOutputNN = "../Predictions/best_NN2.csv"

    testDataOutputFile(weights, test_x, unflatten, fileTestOutputNN)
Example #20
def log_gaussian(params, scale):
    flat_params, _ = flatten(params)
    return np.sum(norm.logpdf(flat_params, 0, scale))
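
A brief usage sketch (illustrative): norm is assumed to be autograd.scipy.stats.norm, matching the logpdf call above, and flatten to be autograd's flatten.

import autograd.numpy as np
from autograd.scipy.stats import norm
from autograd.misc.flatten import flatten  # autograd.util.flatten in older autograd

params = {'W': np.zeros((2, 2)), 'b': np.zeros(3)}
print(log_gaussian(params, scale=1.0))  # standard-normal log density summed over all 7 entries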
for dims_hid in dims_hids:

    print("unit: ", dims_hid)
    start = time.time()
    mean_loss = []

    # Initializing weights
    W = np.random.randn(dims_in, dims_hid)
    b = np.random.randn(dims_hid)
    V = np.random.randn(dims_hid, dims_out)
    c = np.random.randn(dims_out)
    smooth_grad = 0

    # Compress all weights into one weight vector using autograd's flatten
    all_weights = (W, b, V, c)
    weights, unflatten = flatten(all_weights)

    for i in range(nEpochs):
        # Compute gradients (partial derivatives) using autograd toolbox
        weight_gradients, returned_values = grad_fun(weights, X_train, train_y,
                                                     unflatten)
        #print('logistic loss: ', returned_values[0], 'Train error =', returned_values[1])
        mean = returned_values[0] / nTrainSamples
        #print('logistic loss: ',mean)
        mean_loss.append(mean)

        # Update weight vector
        smooth_grad = (1 - momentum) * smooth_grad + momentum * weight_gradients
        weights = weights - epsilon * smooth_grad
Example #22
def test_flatten():
    val = (npr.randn(4), [npr.randn(3,4), 2.5], (), (2.0, [1.0, npr.randn(2)]))
    vect, unflatten = flatten(val)
    val_recovered = unflatten(vect)
    vect_2, _ = flatten(val_recovered)
    assert np.all(vect == vect_2)
def test_flatten_complex():
    val = 1 + 1j
    flat, unflatten = flatten(val)
    assert np.all(val == unflatten(flat))
def adam_minimax(grad_both,
                 init_params_max,
                 init_params_min,
                 neighbors_function,
                 callback=None,
                 num_iters=100,
                 step_size_max=0.001,
                 step_size_min=0.001,
                 b1=0.9,
                 b2=0.999,
                 eps=10**-8):
    """Adam modified to do minimiax optimization, for instance to help with
  training generative adversarial networks."""
    def exponential_decay(step_size_max):
        if step_size_max > 0.001:
            step_size_max *= 0.999
        return step_size_max

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    # gp_fold = '/cluster/mshen/prj/gans/out/2017-06-19/c_gan/ajc/gen_params/'
    # iter_nm = 'akb'
    # genZ_params = import_ganZ_gen_params(gp_fold, iter_nm)
    # x_max, unflatten_max = flatten(genZ_params)
    # i = 0
    # g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i, neighbors_function)
    # g_max, _ = flatten(g_max_uf)
    # g_min, _ = flatten(g_min_uf)
    # dnow = datetime.datetime.now(); g_max_uf, g_min_uf = grad_both(unflatten_max(x_max), unflatten_min(x_min), i, neighbors_function); print(datetime.datetime.now() - dnow)
    # import code; code.interact(local=dict(globals(), **locals()))

    for i in range(num_iters):
        print(i, datetime.datetime.now(), alphabetize(i))
        # if i % 5 == 0 and i % 10 != 1:
        # K = 10
        # if i % 5 == 0:
        # K = 10
        # else:
        # K = 1
        K = 3

        if i == 10:
            # Once entropy is done learning, reduce step size
            step_size_max = 0.01
            # import code; code.interact(local=dict(globals(), **locals()))

        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i,
                                       neighbors_function)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        step_size_max = exponential_decay(step_size_max)

        # Update generator (maximizer)
        m_max = (1 - b1) * g_max + b1 * m_max  # First  moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        # Update discriminator (minimizer)
        m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

        for k in range(K - 1):
            if k <= 0:
                step_size_min_temp = step_size_min
            if k > 0:
                step_size_min_temp = step_size_min_temp * 0.50
            g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                           unflatten_min(x_min), i,
                                           neighbors_function)
            g_min, _ = flatten(g_min_uf)

            # Update discriminator (minimizer)
            m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))        # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min_temp * mhat_min / (np.sqrt(vhat_min) + eps)

    return unflatten_max(x_max), unflatten_min(x_min)
Example #25
from __future__ import division, print_function
from toolz import curry
from autograd import value_and_grad as vgrad
from autograd.util import flatten
from util import split_into_batches, get_num_datapoints

callback = lambda i, val, params, grad: print('{}: {}'.format(i, val))
flat = lambda struct: flatten(struct)[0]

@curry
def make_gradfun(run_inference, recognize, loglike, pgm_prior, data,
                 batch_size, num_samples, natgrad_scale=1., callback=callback):
    _, unflat = flatten(pgm_prior)
    num_datapoints = get_num_datapoints(data)
    data_batches, num_batches = split_into_batches(data, batch_size)
    get_batch = lambda i: data_batches[i % num_batches]
    saved = lambda: None

    def mc_elbo(pgm_params, loglike_params, recogn_params, i):
        nn_potentials = recognize(recogn_params, get_batch(i))
        samples, saved.stats, global_kl, local_kl = \
            run_inference(pgm_prior, pgm_params, nn_potentials, num_samples)
        return (num_batches * loglike(loglike_params, samples, get_batch(i))
                - global_kl - num_batches * local_kl) / num_datapoints

    def gradfun(params, i):
        pgm_params, loglike_params, recogn_params = params
        objective = lambda lr_params: \
            -mc_elbo(pgm_params, lr_params[0], lr_params[1], i)
        val, (loglike_grad, recogn_grad) = vgrad(objective)((loglike_params, recogn_params))
        # this expression for pgm_natgrad drops a term that can be computed using
Example #26
def adam_minimax(grad_both,
                 init_params_max,
                 init_params_min,
                 neighbors_function,
                 callback=None,
                 num_iters=100,
                 step_size_max=0.001,
                 step_size_min=0.001,
                 b1=0.9,
                 b2=0.999,
                 eps=10**-8):
    """Adam modified to do minimiax optimization, for instance to help with
  training generative adversarial networks."""
    def exponential_decay(step_size_min, step_size_max):
        if step_size_min > 0.0001:
            step_size_min *= 0.99
        if step_size_max > 0.001:
            step_size_max *= 0.99
        return step_size_min, step_size_max

    x_max, unflatten_max = flatten(init_params_max)
    x_min, unflatten_min = flatten(init_params_min)

    m_max = np.zeros(len(x_max))
    v_max = np.zeros(len(x_max))
    m_min = np.zeros(len(x_min))
    v_min = np.zeros(len(x_min))

    K = 1

    for i in range(num_iters):
        g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                       unflatten_min(x_min), i,
                                       neighbors_function)
        g_max, _ = flatten(g_max_uf)
        g_min, _ = flatten(g_min_uf)

        if callback:
            callback(unflatten_max(x_max), unflatten_min(x_min), i,
                     unflatten_max(g_max), unflatten_min(g_min))

        step_size_min, step_size_max = exponential_decay(
            step_size_min, step_size_max)

        # Update generator (maximizer)
        m_max = (1 - b1) * g_max + b1 * m_max  # First  moment estimate.
        v_max = (1 - b2) * (g_max**2) + b2 * v_max  # Second moment estimate.
        mhat_max = m_max / (1 - b1**(i + 1))  # Bias correction.
        vhat_max = v_max / (1 - b2**(i + 1))
        x_max = x_max + step_size_max * mhat_max / (np.sqrt(vhat_max) + eps)

        # Update discriminator (minimizer)
        m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
        v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
        mhat_min = m_min / (1 - b1**(i + 1))  # Bias correction.
        vhat_min = v_min / (1 - b2**(i + 1))
        x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

        for k in range(K - 1):
            g_max_uf, g_min_uf = grad_both(unflatten_max(x_max),
                                           unflatten_min(x_min), i,
                                           neighbors_function)
            g_min, _ = flatten(g_min_uf)

            # Update discriminator (minimizer)
            m_min = (1 - b1) * g_min + b1 * m_min  # First  moment estimate.
            v_min = (1 - b2) * (g_min**2) + b2 * v_min  # Second moment estimate.
            mhat_min = m_min / (1 - b1**(i + 1))        # Bias correction.
            vhat_min = v_min / (1 - b2**(i + 1))
            x_min = x_min - step_size_min * mhat_min / (np.sqrt(vhat_min) + eps)

    return unflatten_max(x_max), unflatten_min(x_min)
Example #27
def l1_norm(params):
    if isinstance(params, dict):
        return np.sum(np.absolute(flatten(params)[0]))
    return np.sum(np.absolute(flatten(params.value)[0]))
 def params(self, params):
     self.__params = params
     self.__params_flat, self.unflatten_params = flatten(self.__params)
Example #29
def run_variational_inference_gumbel(Ys,
                                     A,
                                     W_true,
                                     Ps_true,
                                     Cs,
                                     etasq,
                                     stepsize=0.1,
                                     init_with_true=True,
                                     num_iters=250,
                                     temp_prior=0.1,
                                     num_sinkhorn=20,
                                     num_mcmc_samples=500,
                                     temp=1):
    def sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn, temp):

        # Sample W
        mu_W, log_sigmasq_W, log_mu_Ps = params
        W_flat = mu_W + np.sqrt(np.exp(log_sigmasq_W)) * npr.randn(*mu_W.shape)

        W = unpack_W(W_flat)
        #W = W_true
        # Sample Ps: run sinkhorn to move mu close to Birkhoff
        Ps = []
        for log_mu_P, unpack_P, C in zip(log_mu_Ps, unpack_Ps, Cs):
            # Unpack the mean, run Sinkhorn, then pack it again
            log_mu_P = unpack_P(log_mu_P)
            a = log_mu_P.shape
            log_mu_P = (
                log_mu_P +
                -np.log(-np.log(np.random.uniform(0, 1, (a[0], a[1]))))) / temp

            log_mu_P = sinkhorn_logspace(log_mu_P - 1e8 * (1 - C),
                                         num_sinkhorn)
            log_mu_P = log_mu_P[C]

            ##Notice how we limit the variance
            P = np.exp(log_mu_P)
            P = unpack_P(P)

            Ps.append(P)

        Ps = np.array(Ps)
        return W, Ps

    def elbo(params, unpack_W, unpack_Ps, Ys, A, Cs, etasq, num_sinkhorn,
             num_mcmc_samples, temp_prior, temp):
        """
        Provides a stochastic estimate of the variational lower bound.
        sigma_Lim: limits for the variance of the re-parameterization of the permutation
        """
        def gumbel_distance(log_mu_Ps, temp_prior, temperature, Cs):
            arr = 0
            for n in range(len(log_mu_Ps)):
                log_mu_P = unpack_Ps[n](log_mu_Ps[n])
                C = Cs[n]
                log_mu_P = log_mu_P[C]
                log_mu_P = log_mu_P[:]
                arr += np.sum(
                    np.log(temp_prior) -
                    0.5772156649 * temp_prior / temperature -
                    log_mu_P * temp_prior / temperature - np.exp(
                        gammaln(1 + temp_prior / temperature) -
                        log_mu_P * temp_prior / temperature) -
                    (np.log(temperature) - 1 - 0.5772156649))
            return arr

        M, T, N = Ys.shape
        assert A.shape == (N, N)
        assert len(unpack_Ps) == M

        mu_W, log_sigmasq_W, log_mu_Ps = params

        L = 0

        for smpl in range(num_mcmc_samples):
            W, Ps = sample_q(params, unpack_W, unpack_Ps, Cs, num_sinkhorn,
                             temp)

            # Compute the ELBO
            L += log_likelihood(Ys, A, W, Ps, etasq) / num_mcmc_samples

            L += gumbel_distance(log_mu_Ps, temp_prior, temp, Cs)
        # Add the entropy terms

        L += gaussian_entropy(log_sigmasq_W)
        fac = 1000
    ## This term adds the KL divergence between the W prior and posterior, with entries of W
    # having prior variance sigma = 1/fac; for details see the appendix of the VAE paper.

        L += - 0.5 * log_sigmasq_W.size * (np.log(2 * np.pi)) -\
             0.5 * fac* np.sum(np.exp(log_sigmasq_W)) - 0.5 * fac * np.sum(
            np.power(mu_W, 2))
        # Normalize objective

        L /= (T * M * N)

        return L

    M, T, N = Ys.shape
    # Initialize variational parameters
    if init_with_true:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps,  unpack_Ps = \
            initialize_params_gumbel(A, Cs,  map_W=W_true)
    else:
        mu_W, log_sigmasq_W, unpack_W, log_mu_Ps, unpack_Ps = \
            initialize_params_gumbel(A, Cs)

    # Make a function to convert an array of params into
    # a set of parameters mu_W, sigmasq_W, [mu_P1, sigmasq_P1, ... ]
    flat_params, unflatten = \
        flatten((mu_W, log_sigmasq_W, log_mu_Ps ))

    objective = \
        lambda flat_params, t: \
            -1 * elbo(unflatten(flat_params), unpack_W, unpack_Ps, Ys, A, Cs, etasq,
                      num_sinkhorn, num_mcmc_samples, temp_prior, temp)

    # Define a callback to monitor optimization progress
    elbos = []
    lls = []
    mses = []

    num_corrects = []
    times = []

    W_samples = []
    Ps_samples = []

    def collect_stats(params, t):

        if t % 10 == 0:
            W_samples.append([])
            Ps_samples.append([])
            for i in range(100):
                W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs,
                                 num_sinkhorn, temp)
                W_samples[-1].append(W)
                Ps_samples[-1].append(Ps)

        times.append(time.time())
        elbos.append(-1 * objective(params, 0))

        # Sample the variational posterior and compute num correct matches
        mu_W, log_sigmasq_W, log_mu_Ps = unflatten(params)

        W, Ps = sample_q(unflatten(params), unpack_W, unpack_Ps, Cs, 10, 1.0)

        matches = []
        for i in range(A.shape[0]):
            matches.extend(np.where(Ps[0, i, :] + Ps_true[0, i, :] == 1)[0])

        mses.append(np.mean((W * A - W_true * A)**2))

        # Round doubly stochastic matrix P to the nearest permutation matrix
        num_correct = np.zeros(M)
        Ps2 = np.zeros((Ps.shape[0], A.shape[0], A.shape[0]))
        for m, P in enumerate(Ps):
            row, col = linear_sum_assignment(-P + 1e8 * (1 - Cs[m]))
            Ps2[m] = perm_to_P(col)
            num_correct[m] = n_correct(perm_to_P(col), Ps_true[m])
        num_corrects.append(num_correct)

        lls.append(log_likelihood(Ys, A, W, Ps2, etasq) / (M * T * N))

    def callback(params, t, g):
        collect_stats(params, t)
        print(
            "Iteration {}.  ELBO: {:.4f} LL: {:.4f} MSE(W): {:.4f}, Num Correct: {}"
            .format(t, elbos[-1], lls[-1], mses[-1], num_corrects[-1]))

    # Run optimizer

    callback(flat_params, -1, None)
    variational_params = adam(grad(objective),
                              flat_params,
                              step_size=stepsize,
                              num_iters=num_iters,
                              callback=callback)

    times = np.array(times)
    times -= times[0]


    return times, np.array(elbos), np.array(lls), np.array(mses), \
           np.array(num_corrects), Ps_samples, W_samples, A, W_true
def l2_norm(params):
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
Example #31
def l2_norm(params):
    """Computes l2 norm of params by flattening them into a vector."""
    flattened, _ = flatten(params)
    return np.dot(flattened, flattened)
Example #32
log1pexp = primitive(lambda x: np.log1p(np.exp(x)))
log1pexp.defgrad(lambda ans, x: lambda g: g / (1 + np.exp(-x)))
normalize = lambda x: x / np.sum(x, axis=-1, keepdims=True)
softmax = lambda x: normalize(np.exp(x - np.max(x, axis=-1, keepdims=True)))

### misc


def rle(stateseq):
    pos, = np.where(np.diff(stateseq) != 0)
    pos = np.concatenate(([0], pos + 1, [len(stateseq)]))
    return stateseq[pos[:-1]], np.diff(pos)
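
An illustrative check (not from the original source): rle run-length encodes a state sequence into its run values and run lengths (np here is the snippet's autograd.numpy import).

vals, lens = rle(np.array([1, 1, 2, 2, 2, 3]))
# vals -> array([1, 2, 3]); lens -> array([2, 3, 1])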


isarray = lambda x: hasattr(x, 'ndim')
flat = lambda x: flatten(x)[0]
partial_flat = lambda a, axes: np.reshape(a, a.shape[:-axes] + (-1, ))
tensordot = lambda a, b, axes=2: np.dot(partial_flat(a, axes),
                                        partial_flat(b, axes).T)
outer = lambda x, y: x[..., :, None] * y[..., None, :]

### functions and monads


def compose(funcs):
    def composition(x):
        for f in funcs:
            x = f(x)
        return x

    return composition
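
A small usage sketch (illustrative): compose applies the given functions left to right.

add_one = lambda x: x + 1
double = lambda x: 2 * x
assert compose([add_one, double])(3) == 8  # double(add_one(3))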
Example #33
def unflatten_tracing():
    val = [npr.randn(4), [npr.randn(3,4), 2.5], (), (2.0, [1.0, npr.randn(2)])]
    vect, unflatten = flatten(val)
    def f(vect): return unflatten(vect)
    flatten2, _ = make_vjp(f)(vect)
    assert np.all(vect == flatten2(val))
 def f(x, y):
     xy, _ = flatten([x, y])
     return np.sum(xy)
 def scalar_args_fun(*new_args):
     full_args = list(args)
     for i, argnum in enumerate(argnums):
         wrt_flat, unflatten = flatten(wrt_args[i])
         full_args[argnum] = unflatten(wrt_flat + new_args[i] * rand_vecs[i])
     return to_scalar(fun(*full_args, **kwargs))
def l2_norm(params):
    flattened, _ = flatten(params)
    return auto_np.dot(flattened, flattened)