Example 1
 def __init__(self, initial_output='\n'):
     self.theano_rng = MRG_RandomStreams(seed=random.randint(0,100000))
     SoftmaxEmitter.__init__(self, initial_output=initial_output)
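Every snippet in this listing follows the same basic pattern: seed an MRG_RandomStreams object once, build symbolic draws from it, and compile those draws into a Theano function. A minimal, self-contained sketch of that pattern (not taken from any of the quoted projects, and assuming a classic Theano install where the sandbox RNG is available):

import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)            # seed once, up front
u = srng.uniform(size=(2, 3))                  # symbolic node; re-sampled on every call
n = srng.normal(size=(2, 3), avg=0., std=1.)
f = theano.function([], [u, n])

u_val, n_val = f()                             # concrete samples, a new draw each call
print(u_val.shape, n_val.shape)                # (2, 3) (2, 3)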
Example 2
if not os.path.exists(outfolder):
    os.makedirs(outfolder)
    sample_path = os.path.join(outfolder, 'sample')
    os.makedirs(sample_path)
logfile = os.path.join(outfolder, 'logfile.log')
shutil.copy(os.path.realpath(__file__), os.path.join(outfolder,
                                                     filename_script))

num_labelled = cfg['nlabeled']
ssl_para_seed = cfg['ssl_para_seed']

print('ssl_para_seed %d, num_labelled %d' % (ssl_para_seed, num_labelled))

rng = np.random.RandomState(ssl_para_seed)
theano_rng = MRG_RandomStreams(rng.randint(2**15))
lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))

# dataset
data_dir = './data/mnist.pkl.gz'
# flags
valid_flag = False

# pre-train
pre_num_epoch = 0 if num_labelled >= 100 else 30
pre_alpha_unlabeled_entropy = .3
pre_alpha_average = .3
pre_lr = 3e-4
pre_batch_size_lc = min(100, num_labelled)
pre_batch_size_uc = 500
# C
Example 3
 def __init__(self, state, rng, parent):
     EncoderDecoderBase.__init__(self, state, rng, parent)
     self.trng = MRG_RandomStreams(self.seed)
     self.init_params()
Example 4
 def __init__(self, seed=123):
     self.rng = MRG_RandomStreams(seed)
     self.y = self.rng.uniform(size=(1, ))
Example 5
import theano
import theano.tensor as tensor

# Global variables used to toggle training, the RNG, etc.
from theano.sandbox.rng_mrg import MRG_RandomStreams
layer_train_rng = MRG_RandomStreams()
layer_train_enable = tensor.bscalar()
layer_train_epoch = tensor.iscalar()
layer_train_it = theano.shared(0)


def get_rng():
    global layer_train_rng
    return layer_train_rng


def set_rng_seed(v):
    global layer_train_rng
    layer_train_rng.seed(v)


def get_train():
    global layer_train_enable
    return layer_train_enable


def get_epoch():
    global layer_train_epoch
    return layer_train_epoch

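As a rough illustration only (this function is not part of the original module), the shared RNG and the train flag above are typically consumed by a dropout-style layer along these lines:

import theano.tensor as tensor

def dropout(x, p_drop=0.5):
    # Hypothetical consumer of the module globals above: drop units only while
    # the symbolic train flag is fed as 1 by the caller.
    rng = get_rng()                       # module-level MRG_RandomStreams
    train = get_train()                   # bscalar train/test flag defined above
    mask = rng.binomial(size=x.shape, p=1.0 - p_drop, dtype=x.dtype)
    dropped = x * mask / (1.0 - p_drop)   # inverted-dropout scaling
    return tensor.switch(train, dropped, x)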
Example 6
 def setup_method(self):
     nr.seed(self.random_seed)
     self.old_tt_rng = tt_rng()
     set_tt_rng(MRG_RandomStreams(self.random_seed))
Example 7
def test_binomial():
    #TODO: test size=None, ndim=X
    #TODO: test size=X, ndim!=X.ndim
    #TODO: test random seed in legal value(!=0 and other)
    #TODO: test sample_size not a multiple of guessed #streams
    #TODO: test size=Var, with shape that change from call to call
    #we test size in a tuple of int and a tensor.shape.
    #we test the param p with int.

    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
            or mode == 'Mode' and config.linker in ['py']):
        sample_size = (10, 50)
        steps = 50
        rtol = 0.02
    else:
        sample_size = (500, 50)
        steps = int(1e3)
        rtol = 0.01

    x = tensor.matrix()
    v = tensor.vector()
    for mean in [0.1, 0.5]:
        for size, const_size, var_input, input in [
            (sample_size, sample_size, [], []),
            (x.shape, sample_size, [x],
             [numpy.zeros(sample_size, dtype=config.floatX)]),
            ((x.shape[0], sample_size[1]), sample_size, [x],
             [numpy.zeros(sample_size, dtype=config.floatX)]),
                # test empty size (scalar)
            ((), (), [], []),
        ]:

            #print ''
            #print 'ON CPU with size=(%s) and mean(%d):' % (str(size), mean)
            R = MRG_RandomStreams(234, use_cuda=False)
            # Note: we specify `nstreams` to avoid a warning.
            u = R.binomial(size=size,
                           p=mean,
                           nstreams=rng_mrg.guess_n_streams(size, warn=False))
            f = theano.function(var_input, u, mode=mode)
            #theano.printing.debugprint(f)
            out = f(*input)
            #print 'random?[:10]\n', out[0, 0:10]
            #print 'random?[-1,-10:]\n', out[-1, -10:]

            # Increase the number of steps if the size implies only a few samples
            if numpy.prod(const_size) < 10:
                steps_ = steps * 100
            else:
                steps_ = steps
            basictest(f,
                      steps_,
                      const_size,
                      prefix='mrg  cpu',
                      inputs=input,
                      allow_01=True,
                      target_avg=mean,
                      mean_rtol=rtol)

            if mode != 'FAST_COMPILE' and cuda_available:
                #print ''
                #print 'ON GPU with size=(%s) and mean(%d):' % (str(size), mean)
                R = MRG_RandomStreams(234, use_cuda=True)
                u = R.binomial(size=size,
                               p=mean,
                               dtype='float32',
                               nstreams=rng_mrg.guess_n_streams(size,
                                                                warn=False))
                #well, it's really that this test w GPU doesn't make sense otw
                assert u.dtype == 'float32'
                f = theano.function(
                    var_input,
                    theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                               borrow=True),
                    mode=mode_with_gpu)
                #theano.printing.debugprint(f)
                gpu_out = numpy.asarray(f(*input))
                #print 'random?[:10]\n', gpu_out[0, 0:10]
                #print 'random?[-1,-10:]\n', gpu_out[-1, -10:]
                basictest(f,
                          steps_,
                          const_size,
                          prefix='mrg  gpu',
                          inputs=input,
                          allow_01=True,
                          target_avg=mean,
                          mean_rtol=rtol)
                numpy.testing.assert_array_almost_equal(out,
                                                        gpu_out,
                                                        decimal=6)

            #print ''
            #print 'ON CPU w NUMPY with size=(%s) and mean(%d):' % (str(size),
            #                                                       mean)
            RR = theano.tensor.shared_randomstreams.RandomStreams(234)

            uu = RR.binomial(size=size, p=mean)
            ff = theano.function(var_input, uu, mode=mode)
            # It's not our problem if numpy generates 0 or 1
            basictest(ff,
                      steps_,
                      const_size,
                      prefix='numpy',
                      allow_01=True,
                      inputs=input,
                      target_avg=mean,
                      mean_rtol=rtol)
Example 8
    def __init__(self, input_var, target_var, n_in, n_out, layers, batch_size,
                 n_batches):
        n_hidden = layers[0]
        n_in = n_in[0]
        # Input to hidden layer weights
        W1_mu = weight_init(n_in, n_hidden, 'W1_mu')  # Weights mean
        W1_log_sigma = weight_init(n_in, n_hidden,
                                   'W1_log_sigma')  # Weights log variance

        # Hidden layer to output weights
        W2_mu = weight_init(n_hidden, n_out, 'W2_mu')  # Weights mean
        W2_log_sigma = weight_init(n_hidden, n_out,
                                   'W2_log_sigma')  # Weights log variance

        # Biases are not random variables (for convenience)
        b1 = theano.shared(value=np.zeros((n_hidden, ),
                                          dtype=theano.config.floatX),
                           name='b1',
                           borrow=True)
        b2 = theano.shared(value=np.zeros((n_out, ),
                                          dtype=theano.config.floatX),
                           name='b2',
                           borrow=True)

        # Network parameters
        params = [W1_mu, W1_log_sigma, W2_mu, W2_log_sigma, b1, b2]

        # Random variables
        srng = MRG_RandomStreams(seed=234)
        rv_hidden = srng.normal(
            (batch_size, n_in, n_hidden))  # Standard normal
        rv_output = srng.normal(
            (batch_size, n_hidden, n_out))  # Standard normal

        # MLP
        # Hidden layer
        #hidden_output = T.nnet.relu(T.batched_dot(input_var, W1_mu + T.log(1.0+T.exp(W1_log_sigma))*rv_hidden) + b1)
        hidden_output = T.nnet.relu(
            T.batched_dot(input_var, W1_mu + T.exp(W1_log_sigma) * rv_hidden) +
            b1)

        # Output layer
        #prediction = T.nnet.softmax(T.batched_dot(hidden_output, W2_mu + T.log(1.0+T.exp(W2_log_sigma))*rv_output) + b2)
        prediction = T.nnet.softmax(
            T.batched_dot(hidden_output, W2_mu +
                          T.exp(W2_log_sigma) * rv_output) + b2)

        # KL divergence between prior and posterior
        # For Gaussian prior and posterior, the formula is exact:
        #DKL_hidden = (1.0 + T.log(2.0*T.log(1.0+T.exp(W1_log_sigma))) - W1_mu**2.0 - 2.0*T.log(1.0+T.exp(W1_log_sigma))).sum()/2.0
        #DKL_output = (1.0 + T.log(2.0*T.log(1.0+T.exp(W2_log_sigma))) - W2_mu**2.0 - 2.0*T.log(1.0+T.exp(W2_log_sigma))).sum()/2.0
        DKL_hidden = (1.0 + 2.0 * W1_log_sigma - W1_mu**2.0 -
                      T.exp(2.0 * W1_log_sigma)).sum() / 2.0
        DKL_output = (1.0 + 2.0 * W2_log_sigma - W2_mu**2.0 -
                      T.exp(2.0 * W2_log_sigma)).sum() / 2.0

        # Negative log likelihood
        nll = T.nnet.categorical_crossentropy(prediction, target_var)
        # Complete variational loss
        loss = nll.mean() - (DKL_hidden + DKL_output) / float(n_batches)
        #loss = nll.mean()
        # ADAM training
        updates = lasagne.updates.adam(loss, params)
        self.train = theano.function([input_var, target_var],
                                     loss,
                                     updates=updates)

        # Test functions
        hidden_output_test = T.nnet.relu(T.dot(input_var, W1_mu) + b1)
        test_prediction = T.nnet.softmax(T.dot(hidden_output_test, W2_mu) + b2)
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var))
        self.test = theano.function([input_var, target_var], [loss, test_acc])
        self.pred = theano.function([input_var], test_prediction)

        # Probability and entropy
        self.probabilities = theano.function([input_var], prediction)
        entropy = T.nnet.categorical_crossentropy(prediction, prediction)
        self.entropy_bayesian = theano.function([input_var], entropy)
        # Fake deterministic entropy to make the code modular (this should not be used for comparisons)
        self.entropy_deterministic = theano.function([input_var],
                                                     0.0 * input_var.sum())
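The two DKL_* terms above are the negated closed-form KL divergence between the diagonal Gaussian weight posterior N(mu, exp(log_sigma)^2) and a standard-normal prior. As a standalone sanity check of that formula (an illustration only, not part of the original class):

import numpy as np

rng = np.random.RandomState(0)
mu, log_sigma = 0.3, -0.5
sigma = np.exp(log_sigma)

# Closed form, matching DKL_hidden/DKL_output above with the sign flipped
# (this is KL(q || p) itself, while the loss subtracts the DKL_* terms).
kl_closed = -0.5 * (1.0 + 2.0 * log_sigma - mu**2 - np.exp(2.0 * log_sigma))

# Monte Carlo estimate of E_q[log q(w) - log p(w)] using the same reparameterization.
w = mu + sigma * rng.randn(1000000)
log_q = -0.5 * np.log(2 * np.pi) - log_sigma - 0.5 * ((w - mu) / sigma) ** 2
log_p = -0.5 * np.log(2 * np.pi) - 0.5 * w**2
print(kl_closed, (log_q - log_p).mean())  # the two values should agree closely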
Example 9
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)

    out = srng.normal((), avg=avg, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))


def test_f16_nonzero(mode=None, op_to_check=rng_mrg.mrg_uniform):
    srng = MRG_RandomStreams(seed=utt.fetch_seed())
    m = srng.uniform(size=(1000, 1000), dtype='float16')
    assert m.dtype == 'float16', m.type
    f = theano.function([], m, mode=mode)
    assert any(
        isinstance(n.op, op_to_check) for n in f.maker.fgraph.apply_nodes)
    m_val = f()
    assert np.all((0 < m_val) & (m_val < 1))


if __name__ == "__main__":
    rng = MRG_RandomStreams(np.random.randint(2147462579))
    print(theano.__file__)
    pvals = theano.tensor.fmatrix()
    for i in range(10):
        t0 = time.time()
        multinomial = rng.multinomial(pvals=pvals)
        print(time.time() - t0)
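The __main__ block above only times graph construction; it never compiles or calls the multinomial. A short sketch of actually drawing samples (sizes and the seed here are arbitrary assumptions):

import numpy as np
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(1234)
pvals = theano.tensor.fmatrix('pvals')          # each row: class probabilities summing to 1
onehot = rng.multinomial(pvals=pvals)           # one one-hot draw per row (n=1 by default)
sample = theano.function([pvals], onehot)

p = np.array([[0.2, 0.3, 0.5]] * 4, dtype='float32')
print(sample(p))                                # 4 x 3 matrix of one-hot rows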
Example 10
def makemodel(
    name="ADGM" if ADGM else "SDGM",
    nls=["rectify"] * 2,
    seed=seed,
    descenter=G.Adam,
    K=K,
    L=L,
):
    #$ tensor_shapes
    """
  Creates the ADGM or SDGM model.

  Xl has dimension (Nl, 1, 1, 1, X)
  Xu has dimension (Nu, 1, 1, 1, X)
  Yl has dimension (Nl, 1, 1, 1, Y)
  Yu has dimension ( 1, 1, 1, Y, Y)
  EAl has dimension (Nl, K, 1, 1, A)
  EAu has dimension (Nu, K, 1, 1, A)
  EZl has dimension (Nl, K, L, 1, Z)
  EZu has dimension (Nu, K, L, Y, Z)
  Al will have dimension (Nl, K, 1, 1, A)
  Au will have dimension (Nu, K, 1, 1, A)
  Zl will have dimension (Nl, K, L, 1, Z)
  Zu will have dimension (Nu, K, L, Y, Z)
  """
    #$

    Print = Log("../log/{}".format(name), "w", quiet=True)
    model = Model(name=name,
                  shuffledata=shuffledata,
                  thresholddata=thresholdX,
                  normalizedata=normalizeX,
                  seed=seed,
                  maxvar=highvaronly)
    model.Print = Print
    model.loadmomentum = loadmomentum
    model.descenter = descenter(gradnorm)
    networks = OrderedDict()
    rng = MRG_RandomStreams()

    X = model.XCols

    model.constants = OrderedDict([
        ("                    ", model.name),
        ("shuffle data?", shuffledata),
        ("data seed", model.seed),
        ("Nu", Nu),
        ("Nl", Nl),
        ("X", X),
        ("Y", Y),
        ("Z", Z),
        ("A", A),
        ("L", L),
        ("K", K),
        ("Kt", Kt),
        ("aJL", aJL),
        ("aJU", aJU),
        ("aJA", aJA),
        ("aJW", aJW),
        ("gradient norm?", gradnorm),
        ("std. normal A?", Anormal),
        ("A to Z?", AtoZ),
        ("gaussian X?", gaussianX),
        ("sample X?", sampleX),
        ("threshold X?", thresholdX),
        ("normalize X?", normalizeX),
        ("high var only?", highvaronly),
        ("NSaves", NSaves),
        ("enable save?", enablesave),
        ("combolength", combolength),
        ("load momentum?", loadmomentum),
        ("juggle momentum?", jugglemomentum),
        ("random juggler?", randomjuggler),
        ("epsilon", epsilon),
    ])
    for name, val in model.constants.items():
        model.Print("{:>20s}".format(name), val)

    #$ px_stack
    # Create the networks for px
    ins = [Y, Z] if ADGM else [A, Y, Z]
    last = "linear" if gaussianX else "sigmoid"
    O = [X, X] if gaussianX else [X]
    fx = Stack(insizes=ins, outsizes=O, hidnls=nls, lastnl=last)
    networks["fx"] = fx
    #$

    #$ pa_stack
    # Create the networks for pa
    ins = [X, Y, Z] if ADGM else [Y, Z]
    fa = Stack(insizes=ins, outsizes=[A, A], hidnls=nls)
    if not Anormal:
        networks["fa"] = fa
    #$

    #$ qz_stack
    # Create the networks for qz
    ins = [A, X, Y] if AtoZ else [X, Y]
    fz = Stack(insizes=ins, outsizes=[Z, Z], hidnls=nls)
    networks["fz"] = fz
    #$

    #$ qax_stack
    # Create the networks for qax
    ins = [X]
    fax = Stack(insizes=ins, outsizes=[A, A], hidnls=nls)
    networks["fax"] = fax
    #$

    #$ qy_stack
    # Create the network for qy.  Outputs are
    # probabilities, so last layer is always
    # softmax.
    ins = [A, X]
    last = "softmax"
    fy = Stack(insizes=ins, outsizes=[Y], hidnls=nls, lastnl=last)
    networks["fy"] = fy
    #$

    #$ model.networks
    # Collect all of the parameters together
    # so we can optimize the objectives with
    # respect to them.
    model.networks = networks
    model.params = []
    for name, net in model.networks.items():
        model.Print("{:>20s}".format(name), net)
        model.params += net.params
    #$

    # For now, throw an error if Nl or Nu are
    # not specified.
    # Eventually, we would like to be able to
    # handle only Nl, only Nu, or both Nl and Nu.
    if Nl is None or Nu is None:
        raise ValueError("Need to specify Nl and Nu")

    #$ shared_inputs
    # Xl, Ylh, and Xu are shared variables on the
    # GPU.  For Xu, we take random batch slices.
    # We assume for now that all (Xl,Yl) are used
    # in each batch.
    Xl2 = model.Xl[:Nl]
    Yl2 = model.Ylh[:Nl]

    bidxs = rng.uniform((Nu, )) * model.Xu.shape[0]
    bidxs = T.cast(bidxs, "int32")
    Xu2 = model.Xu[bidxs]
    #$

    #$ sampleX
    # If X is binary, then sample it on each
    # minibatch.  This idea borrowed from Maaloe's
    # code.  Not sure if it helps.
    #
    # Keep track of Xl2s, Yl2, and Xu2s so we can
    # do theano variable substitution later.
    if not gaussianX and sampleX:
        Xl2s = rng.binomial(n=1,
                            p=Xl2,
                            size=Xl2.shape,
                            dtype=theano.config.floatX)
        Xu2s = rng.binomial(n=1,
                            p=Xu2,
                            size=Xu2.shape,
                            dtype=theano.config.floatX)
    else:
        Xl2s = Xl2
        Xu2s = Xu2
    #$

    #$ dimshuffled
    # Reshape the labeled set matrices
    # to 5th-order tensors.
    Xl = Xl2s.dimshuffle([0, "x", "x", "x", 1])
    Yl = Yl2.dimshuffle([0, "x", "x", "x", 1])

    # Xu is known, but Yu is not known.
    # Create one possible Y per class.
    Xu = Xu2s.dimshuffle([0, "x", "x", "x", 1])
    Yu = T.eye(Y, Y).dimshuffle(["x", "x", "x", 0, 1])
    #$

    #$ noises
    # EZ and EA will be used to approximate
    # the integrals using L samples for Z and
    # K samples for A.
    #
    # Create shared variables for K and L so we
    # can do variable substitutions later.
    K = theano.shared(K, name="samplesA")
    L = theano.shared(L, name="samplesZ")
    EAl = rng.normal((Xl.shape[0], K, 1, 1, A))
    EAu = rng.normal((Xu.shape[0], K, 1, 1, A))
    EZl = rng.normal((Xl.shape[0], K, L, 1, Z))
    EZu = rng.normal((Xu.shape[0], K, L, Y, Z))
    #$

    # Assign inputs to the model.
    # We assume that all data is already on the GPU.
    # Furthermore, we create functions that
    # evaluate the objectives on the test data
    # directly.  Therefore, there are no inputs
    # needed for calling the training function.
    model.inputs = []

    #$ al_au
    # Find the latent variables.
    # Note that multiplying by E effectively tiles
    # all latent variables L or K times.
    #
    # Auxiliary A has to be found first
    # because latent Z is a function of it.
    muaxl, sdaxl = fax([Xl])
    muaxu, sdaxu = fax([Xu])
    Al = muaxl + T.exp(sdaxl) * EAl
    Au = muaxu + T.exp(sdaxu) * EAu
    #$

    #$ zl_zu
    # Compute Z.
    inputl = [Al, Xl, Yl] if AtoZ else [Xl, Yl]
    inputu = [Au, Xu, Yu] if AtoZ else [Xu, Yu]
    muzl, sdzl = fz(inputl)
    muzu, sdzu = fz(inputu)
    Zl = muzl + T.exp(sdzl) * EZl
    Zu = muzu + T.exp(sdzu) * EZu
    #$

    #$ muxl_muxu
    # Find the reconstruction means and
    # standard deviations.
    # Note: sdxl and sdxu are used only if
    #       gaussian is True.  The binary case
    #       ignores those.
    # If ADGM, then X is a function of YZ.
    # If SDGM, then X is a function of AYZ.
    inputl = [Yl, Zl] if ADGM else [Al, Yl, Zl]
    inputu = [Yu, Zu] if ADGM else [Au, Yu, Zu]
    if gaussianX:
        muxl, sdxl = fx(inputl)
        muxu, sdxu = fx(inputu)
    else:
        muxl = fx(inputl)
        muxu = fx(inputu)
    #$

    #$ mual_muau
    # Find mu and sd for A in the generative
    # (reconstruction) direction.
    # If ADGM, then A depends on XYZ.
    # If SDGM, then A depends on YZ.
    inputl = [Xl, Yl, Zl] if ADGM else [Yl, Zl]
    inputu = [Xu, Yu, Zu] if ADGM else [Yu, Zu]
    mual, sdal = fa(inputl)
    muau, sdau = fa(inputu)
    #$

    #$ JL_1
    # Find the component probabilities and the
    # labeled objective, JL.
    l_pz = loggauss(Zl)
    l_qz = loggauss(Zl, muzl, sdzl)

    l_py = T.log(1.0 / Y)

    if gaussianX:
        l_px = loggauss(Xl, muxl, sdxl)
    else:
        l_px = logbernoulli(Xl, muxl)
    #$

    #$ JL_2
    # In Maaloe's first revision, A is disconnected
    # in the generative model, so we assume it
    # to be standard normal.
    #
    # In the more updated version, A is fed into
    # by X, Y, and Z.
    # In SDGM, A is generated by Z and Y.
    normal = zero if Anormal else one
    l_pa = loggauss(Al, normal * mual, normal * sdal)
    l_qa = loggauss(Al, muaxl, sdaxl)
    #$

    #$ JL_3
    JL = l_qz + l_qa
    JL = JL - l_px - l_py - l_pz - l_pa
    JL = batchaverage(exA(exZ(JL)))
    JL = aJL * JL
    #$

    #$ JU_1
    # Find the component probabilities and the
    # unlabeled objective, JU.

    # The output of fy(Au, Xu) is pi.
    # (Nu, K, 1, 1, Y)
    # We need to relocate the last axis.
    # (Nu, K, 1, Y, 1)
    inputu = [Au, Xu]
    pi = fy(inputu).dimshuffle([0, 1, "x", 4, "x"])
    #$

    #$ JU_2
    u_pz = loggauss(Zu)
    u_qz = loggauss(Zu, muzu, sdzu)

    u_py = T.log(1.0 / Y)
    u_qy = T.log(pi)

    u_pa = loggauss(Au, normal * muau, normal * sdau)
    u_qa = loggauss(Au, muaxu, sdaxu)

    if gaussianX:
        u_px = loggauss(Xu, muxu, sdxu)
    else:
        u_px = logbernoulli(Xu, muxu)
    #$

    #$ JU_3
    JU = u_qz + u_qa + u_qy
    JU = JU - u_px - u_py - u_pz - u_pa
    JU = batchaverage(exA(classsum(exZ(JU), pi)))
    JU = aJU * JU
    #$

    #$ JA
    # Make sure that the known labels are correctly
    # assigned.
    # Yl has dimension (Nl, 1, 1, 1, Y)
    # Al,Xl has dimension (Nl, K, 1, 1, A+X)
    # fy(Al,Xl) is (Nl, K, 1, 1, Y)
    #
    # Yl is one-hot.
    # Multiply by Yl and perform a sum over
    # Y to get the one probability out, then neg
    # log it, average it over K, and
    # average it over N.
    inputl = [Al, Xl]
    JA = batchaverage(exA(-T.log(T.sum(fy(inputl) * Yl, axis=-1))))
    JA = aJA * JA
    #$

    # Regularize the weight matrices of the
    # networks so they do not stray far from zero.
    # Copied from Maaloe's github code.
    JW = zero
    for p in model.params:
        if 'W' not in str(p):
            continue
        JW += T.mean(p**two)
    JW = aJW * JW

    JCombined = JL + JU + JA + JW

    # Stick the objectives into the model.
    model.objective = JCombined

    #$ prediction_comments
    # Create a function for predictions!
    # We need to evaluate a bunch of values for A,
    # so Xt is an N by X dimensional matrix and
    # Et is a K by A dimensional matrix.
    # Reshape Xt to (N, 1, X) and
    #         Et to (1, K, A).
    #
    # Then, At = fmuax(Xt) + Et*fsdax(Xt)
    # and has a dimension of (N, K, A).
    #
    # Class probabilities pi are fy(AXt)
    # and have shape (N, K, Y).  Take their
    # log, average over K, then argmax over Y
    # to find class predictions.
    #$

    #$ prediction_function
    Xt2 = T.matrix("Xt")
    Et2 = rng.normal((Kt, A))
    Xt = Xt2.dimshuffle([0, "x", 1])
    Et = Et2.dimshuffle(["x", 0, 1])
    muat, sdat = fax([Xt])
    At = muat + T.exp(sdat) * Et

    inputt = [At, Xt]
    prediction = T.argmax(T.mean(T.log(fy(inputt)), axis=1), axis=-1)

    predict = theano.function(inputs=[Xt2],
                              outputs=prediction,
                              allow_input_downcast=True)
    model.predict = predict
    #$

    #$ classification
    Yt = T.ivector("Yt")
    accuracyT = T.eq(Yt, prediction).mean(dtype=theano.config.floatX)

    model.accuracyT = theano.function(inputs=[],
                                      outputs=accuracyT,
                                      givens={
                                          Xt2: model.Xt,
                                          Yt: model.Yt
                                      },
                                      allow_input_downcast=True)
    #$

    model.accuracyL = theano.function(inputs=[],
                                      outputs=accuracyT,
                                      givens={
                                          Xt2: model.Xl,
                                          Yt: model.Yl
                                      },
                                      allow_input_downcast=True)

    # Create a stats function that outputs
    # extra information.
    model.adds = [
        JL,
        JU,
        JA,
        JW,
        T.mean(l_qa),
        T.mean(u_qa),
        T.mean(u_qy),
        T.mean(l_qz),
        T.mean(u_qz),
        -T.mean(l_px.max(axis=AxisY)),
        -T.mean(u_px.max(axis=AxisY)),
        -T.mean(l_pa),
        -T.mean(u_pa),
    ]
    model.headings = [
        "J",
        "JL",
        "JU",
        "JA",
        "JW",
        "l q(a)",
        "u q(a)",
        "u q(y)",
        "l q(z)",
        "u q(z)",
        "l -p(x)",
        "u -p(x)",
        "l -p(a)",
        "u -p(a)",
    ]
    model.outputs = [model.objective] + model.adds
    model.stats = theano.function(inputs=[],
                                  outputs=model.outputs,
                                  givens={
                                      Xl2s: model.Xl[:1000],
                                      Yl2: model.Ylh[:1000],
                                      Xu2s: model.Xu[:1000],
                                      K: 1
                                  },
                                  allow_input_downcast=True)

    return model
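One detail worth noting from the shared_inputs block: unlabeled minibatches are taken by scaling a uniform draw by the dataset size and casting to int, so a fresh random slice of the shared data is picked on every call. Stripped down to just that trick (shapes and names here are assumptions):

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

rng = MRG_RandomStreams(1)
Xu = theano.shared(np.random.randn(10000, 784).astype('float32'))  # full dataset on device
Nu = 100                                                           # minibatch size

bidxs = T.cast(rng.uniform((Nu,)) * Xu.shape[0], 'int32')  # Nu random row indices per call
batch = Xu[bidxs]                                          # (Nu, 784) slice, resampled each call
get_batch = theano.function([], batch)
print(get_batch().shape)                                   # (100, 784)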
Example 11
    def __init__(self, 
        glimpse_shape, glimpse_times, 
        dim_hidden, dim_fc, dim_out, 
        reward_base, 
        rng_std=1.0, activation=T.tanh, bptt_truncate=-1, 
        lmbd=0.1 # gdupdate + lmbd*rlupdate
        ): 
        if reward_base == None: 
            reward_base = np.zeros((glimpse_times)).astype('float32')
            reward_base[-1] = 1.0
        x = T.ftensor3('x')  # N * W * H 
        y = T.ivector('y')  # label 
        lr = T.fscalar('lr')
        reward_base = theano.shared(name='reward_base', value=np.array(reward_base).astype(theano.config.floatX), borrow=True) # Time (vector)
        reward_bias = T.fvector('reward_bias')
        rng = MRG_RandomStreams(np.random.randint(9999999))
#       rng = theano.tensor.shared_randomstreams.RandomStreams(np.random.randint(9999999))
    
        i = InputLayer(x)
        au = AttentionUnit(x, glimpse_shape, glimpse_times, dim_hidden, rng, rng_std, activation, bptt_truncate)
#       All hidden states are put into decoder
#       layers = [i, au, InputLayer(au.output[:,:,:].flatten(2))]
#       dim_fc = [glimpse_times*dim_hidden] + dim_fc + [dim_out]
#       Only the last hidden states
        layers = [i, au, InputLayer(au.output[:,-1,:])]
        dim_fc = [dim_hidden] + dim_fc + [dim_out]
        for Idim, Odim in zip(dim_fc[:-1], dim_fc[1:]):
            fc = FullConnectLayer(layers[-1].output, Idim, Odim, activation, 'FC')
            layers.append(fc)
        sm = SoftmaxLayer(layers[-1].output)
        layers.append(sm)

        output = sm.output       # N * classes 
        hidoutput = au.output    # N * dim_output 
        location = au.location   # N * T * dim_hidden
        prediction = output.argmax(1) # N

        # calc
        equalvec = T.eq(prediction, y) # [0, 1, 0, 0, 1 ...]
        correct = T.cast(T.sum(equalvec), 'float32')
#       noequalvec = T.neq(prediction, y)
#       nocorrect = T.cast(T.sum(noequalvec), 'float32')
        logLoss = T.log(output)[T.arange(y.shape[0]), y] # 
        reward_biased = T.outer(equalvec, reward_base)-reward_bias.dimshuffle('x', 0)
            # N * Time
            # (R_t - b_t), where b = E[R]
        
        # gradient descent
        gdobjective = logLoss.sum()/x.shape[0]  # correct * dim_output (only has value on the correctly predicted sample)
        gdparams = reduce(lambda x, y: x+y.params, layers, []) 
        gdupdates = map(lambda x: (x, x+lr*T.grad(gdobjective, x)), gdparams)

        # reinforce learning
        rlobjective = (reward_biased.dimshuffle(0, 1, 'x') * T.log(au.location_p)).sum() / x.shape[0]
            # location_p: N * Time * 2
            # location_logp: N * Time
            # reward_biased: N * 2
        rlparams = au.reinforceParams 
        rlupdates = map(lambda x: (x, x+lr*lmbd*T.grad(rlobjective, x)), rlparams)

        # Hidden state stays unchanged over time
        deltas = T.stack(*[((au.output[:,i,:].mean(0)-au.output[:,i+1,:].mean(0))**2).sum()  for i in xrange(glimpse_times-1)])
            # N * Time * dim_hidden
         
        print 'compile step()'
        self.step = theano.function([x, y, lr, reward_bias], [gdobjective, rlobjective, correct, T.outer(equalvec, reward_base)], updates=gdupdates+rlupdates)
    #       print 'compile gdstep()'
    #       self.gdstep = theano.function([x, y, lr], [gdobjective, correct, location], updates=gdupdates)
    #       print 'compile rlstep()'
    #       self.rlstep = theano.function([x, y, lr], [rlobjective], updates=rlupdates)
        print 'compile predict()'
        self.predict = theano.function([x], prediction)
#       print 'compile forward()'
#       self.forward = theano.function([x], map(lambda x: x.output, layers)) #[layers[-3].output, fc.output])
#       print 'compile error()'
#       self.error = theano.function([x, y], gdobjective)
        print 'compile locate()'
        self.locate = theano.function([x], [au.location_mean, location]) #[layers[-3].output, fc.output])
        print 'compile debug()'
        self.debug = theano.function([x, y, lr, reward_bias], [deltas, au.location_p], on_unused_input='warn')

        # self.xxx
        self.glimpse_times = glimpse_times
Example 12
from theano.sandbox.rng_mrg import MRG_RandomStreams

'''
    whether to use Xavier initialization, as described in
        http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
'''
USE_XAVIER_INIT = False


'''
    Default random generators
'''
#random.seed(5817)
default_rng = np.random.RandomState(random.randint(0,9999))
#default_srng = T.shared_randomstreams.RandomStreams(default_rng.randint(9999))
default_mrng = MRG_RandomStreams(default_rng.randint(9999))
default_srng = default_mrng

'''
    Activation functions
'''
ReLU = lambda x: x * (x > 0)
sigmoid = T.nnet.sigmoid
tanh = T.tanh
softmax = T.nnet.softmax
linear = lambda x: x

def get_activation_by_name(name):
    if name.lower() == "relu":
        return ReLU
    elif name.lower() == "sigmoid":
Example 13
    ----------
    gen: generator that implements __next__ (py3) or next (py2) method
        and yields np.arrays with same types
    default: np.array with the same type as generator produces

    Returns
    -------
    TensorVariable
        It has 2 new methods
        - var.set_gen(gen): sets new generator
        - var.set_default(value): sets new default value (None erases default value)
    """
    return GeneratorOp(gen, default)()


_tt_rng = MRG_RandomStreams()


def tt_rng(random_seed=None):
    """
    Get the package-level random number generator or new with specified seed.

    Parameters
    ----------
    random_seed: int
        If not None
        returns *new* theano random generator without replacing package global one

    Returns
    -------
    `theano.sandbox.rng_mrg.MRG_RandomStreams` instance
Example 14
def test_match_grad_valid_conv():

    # Tests that weightActs is the gradient of FilterActs
    # with respect to the weights.

    for partial_sum in [0, 1, 4]:
        rng = np.random.RandomState([2012, 10, 9])

        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        images = shared(rng.uniform(
            -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                        name='images')
        filters = shared(rng.uniform(-1., 1.,
                                     (channels, filter_rows, filter_cols,
                                      num_filters)).astype('float32'),
                         name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters)
        output = host_from_gpu(output)

        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

        output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

        output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

        theano_rng = MRG_RandomStreams(2013 + 1 + 31)

        coeffs = theano_rng.normal(avg=0.,
                                   std=1.,
                                   size=output_conv2d.shape,
                                   dtype='float32')

        cost_conv2d = (coeffs * output_conv2d).sum()

        weights_grad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * output).sum()
        hid_acts_grad = T.grad(cost, output)

        weights_grad = host_from_gpu(
            WeightActs(partial_sum=partial_sum)(gpu_images,
                                                gpu_from_host(hid_acts_grad)))

        f = function(
            [], [output, output_conv2d, weights_grad, weights_grad_conv2d])

        output, output_conv2d, weights_grad, weights_grad_conv2d = f()

        if np.abs(output - output_conv2d).max() > 8e-6:
            assert type(output) == type(output_conv2d)
            assert output.dtype == output_conv2d.dtype
            if output.shape != output_conv2d.shape:
                print 'cuda-convnet shape: ', output.shape
                print 'theano shape: ', output_conv2d.shape
                assert False
            err = np.abs(output - output_conv2d)
            print 'absolute error range: ', (err.min(), err.max())
            print 'mean absolute error: ', err.mean()
            print 'cuda-convnet value range: ', (output.min(), output.max())
            print 'theano value range: ', (output_conv2d.min(),
                                           output_conv2d.max())
            assert False

        warnings.warn(
            """test_match_grad_valid_conv success criterion is not very strict. Can we verify that this is OK?
                         One possibility is that theano is numerically unstable and Alex's code is better.
                         Probably theano CPU 64 bit is OK but it's worth checking the others."""
        )

        if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6:
            if type(weights_grad) != type(weights_grad_conv2d):
                raise AssertionError("weights_grad is of type " +
                                     str(weights_grad))
            assert weights_grad.dtype == weights_grad_conv2d.dtype
            if weights_grad.shape != weights_grad_conv2d.shape:
                print 'cuda-convnet shape: ', weights_grad.shape
                print 'theano shape: ', weights_grad_conv2d.shape
                assert False
            err = np.abs(weights_grad - weights_grad_conv2d)
            print 'absolute error range: ', (err.min(), err.max())
            print 'mean absolute error: ', err.mean()
            print 'cuda-convnet value range: ', (weights_grad.min(),
                                                 weights_grad.max())
            print 'theano value range: ', (weights_grad_conv2d.min(),
                                           weights_grad_conv2d.max())
            assert False
Example 15
        Each column corresponds to a different unit

    Returns:
        dW: a matrix of the derivatives of the expected gradient
            of the energy
    """

    return T.grad(energy(W, V, H).mean(), W, consider_constant=[V, H])


if __name__ == "__main__":
    m = 2
    nv = 3
    nh = 4
    h0 = T.alloc(1., m, nh)
    rng_factory = MRG_RandomStreams(42)
    W = rng_factory.normal(size=(nv, nh), dtype=h0.dtype)
    pv = T.nnet.sigmoid(T.dot(h0, W.T))
    v = rng_factory.binomial(p=pv, size=pv.shape, dtype=W.dtype)
    ph = T.nnet.sigmoid(T.dot(v, W))
    h = rng_factory.binomial(p=ph, size=ph.shape, dtype=W.dtype)

    class _ElemwiseNoGradient(theano.tensor.Elemwise):
        def grad(self, inputs, output_gradients):
            raise TypeError("You shouldn't be differentiating through "
                    "the sampling process.")
            return [ theano.gradient.DisconnectedType()() ]
    block_gradient = _ElemwiseNoGradient(theano.scalar.identity)

    v = block_gradient(v)
    h = block_gradient(h)
Example 16
def test_normal0():

    steps = 50
    std = 2.
    if (config.mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
            or config.mode == 'Mode' and config.linker in ['py']):
        sample_size = (25, 30)
        default_rtol = .02
    else:
        sample_size = (999, 50)
        default_rtol = .01
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()

    for size, const_size, var_input, input, avg, rtol, std_tol in [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [np.zeros(sample_size,
                   dtype=config.floatX)], -5., default_rtol, default_rtol),
            # test odd value
        (x.shape, sample_size_odd, [x],
         [np.zeros(sample_size_odd,
                   dtype=config.floatX)], -5., default_rtol, default_rtol),
        (sample_size, sample_size, [], [],
         np.arange(np.prod(sample_size), dtype='float32').reshape(sample_size),
         10. * std / np.sqrt(steps), default_rtol),
            # test empty size (scalar)
        ((), (), [], [], -5., default_rtol, 0.02),
            # test with few samples at the same time
        ((1, ), (1, ), [], [], -5., default_rtol, 0.02),
        ((3, ), (3, ), [], [], -5., default_rtol, 0.02),
    ]:

        R = MRG_RandomStreams(234)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size,
                     avg=avg,
                     std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n)
        f(*input)

        # Increase the number of steps if size implies only a few samples
        if np.prod(const_size) < 10:
            steps_ = steps * 50
        else:
            steps_ = steps
        basictest(f,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='mrg ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol,
                  std_tol=std_tol)

        sys.stdout.flush()

        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        nn = RR.normal(size=size, avg=avg, std=std)
        ff = theano.function(var_input, nn)

        basictest(ff,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='numpy ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol)
Example 17
    def ready(self):
        encoder = self.encoder
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = encoder.dropout

        # len*batch
        x = self.x = encoder.x
        z = self.z = encoder.z

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(2):
            if layer_type == "rcnn":
                l = RCNN(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)
            elif layer_type == "lstm":
                l = LSTM(
                    n_in=n_e,  # if i == 0 else n_d,
                    n_out=n_d,
                    activation=activation)
            layers.append(l)

        # len * batch
        #masks = T.cast(T.neq(x, padding_id), theano.config.floatX)
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, "x"))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        flipped_embs = embs[::-1]

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h2 = layers[1].forward_all(flipped_embs)
        h_final = T.concatenate([h1, h2[::-1]], axis=2)
        h_final = apply_dropout(h_final, dropout)
        size = n_d * 2

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        probs2 = probs.reshape(x.shape)
        self.MRG_rng = MRG_RandomStreams()
        z_pred = self.z_pred = T.cast(
            self.MRG_rng.binomial(size=probs2.shape, p=probs2), "int8")

        # we are computing approximated gradient by sampling z;
        # so should mark sampled z not part of the gradient propagation path
        #
        self.z_pred = theano.gradient.disconnected_grad(z_pred)

        z2 = z.dimshuffle((0, 1, "x"))
        logpz = -T.nnet.binary_crossentropy(probs, z2) * masks
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                      axis=0,
                      dtype=theano.config.floatX)

        loss_mat = encoder.loss_mat
        if args.aspect < 0:
            loss_vec = T.mean(loss_mat, axis=1)
        else:
            assert args.aspect < self.nclasses
            loss_vec = loss_mat[:, args.aspect]
        self.loss_vec = loss_vec

        coherent_factor = args.sparsity * args.coherent
        loss = self.loss = T.mean(loss_vec)
        sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
                                             T.mean(zdiff) * coherent_factor
        cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
        cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
        self.obj = T.mean(cost_vec)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg

        cost = self.cost = cost_logpz * 10 + l2_cost
        print "cost.dtype", cost.dtype

        self.cost_e = loss * 10 + encoder.l2_cost
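The core sampling pattern above (hard 0/1 rationales drawn from predicted probabilities, then cut out of the gradient graph so learning has to go through log p(z) instead) can be reduced to a few lines. A minimal sketch, with illustrative names that are not from the project:

import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=42)
probs = T.matrix('probs')                                  # len * batch selection probabilities
z = T.cast(srng.binomial(size=probs.shape, p=probs), 'int8')
z = theano.gradient.disconnected_grad(z)                   # no gradient flows through the sample
logpz = -T.nnet.binary_crossentropy(probs, T.cast(z, probs.dtype))  # log p(z | probs)
sample_and_score = theano.function([probs], [z, logpz])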
Example 18
def test_undefined_grad():
    srng = MRG_RandomStreams(seed=1234)

    # checking uniform distribution
    low = tensor.scalar()
    out = srng.uniform((), low=low)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, low)

    high = tensor.scalar()
    out = srng.uniform((), low=0, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, high)

    out = srng.uniform((), low=low, high=high)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (low, high))

    # checking binomial distribution
    prob = tensor.scalar()
    out = srng.binomial((), p=prob)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, prob)

    # checking multinomial distribution
    prob1 = tensor.scalar()
    prob2 = tensor.scalar()
    p = [theano.tensor.as_tensor_variable([prob1, 0.5, 0.25])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), prob1)

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.multinomial(size=None, pvals=p, n=4)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad,
                  theano.tensor.sum(out), (prob1, prob2))

    # checking choice
    p = [theano.tensor.as_tensor_variable([prob1, prob2, 0.1, 0.2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, prob2])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  (prob1, prob2))

    p = [theano.tensor.as_tensor_variable([prob1, 0.2, 0.3])]
    out = srng.choice(a=None, size=1, p=p, replace=False)[0]
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out[0],
                  prob1)

    # checking normal distribution
    avg = tensor.scalar()
    out = srng.normal((), avg=avg)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, avg)

    std = tensor.scalar()
    out = srng.normal((), avg=0, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out, std)

    out = srng.normal((), avg=avg, std=std)
    assert_raises(theano.gradient.NullTypeGradError, theano.grad, out,
                  (avg, std))
Example 19
def test_uniform():
    #TODO: test param low, high
    #TODO: test size=None
    #TODO: test ndim!=size.ndim
    #TODO: test bad seed
    #TODO: test size=Var, with shape that change from call to call
    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
            or mode == 'Mode' and config.linker in ['py']):
        sample_size = (10, 100)
        steps = 50
    else:
        sample_size = (500, 50)
        steps = int(1e3)

    x = tensor.matrix()
    for size, const_size, var_input, input in [
        (sample_size, sample_size, [], []),
        (x.shape, sample_size, [x],
         [numpy.zeros(sample_size, dtype=config.floatX)]),
        ((x.shape[0], sample_size[1]), sample_size, [x],
         [numpy.zeros(sample_size, dtype=config.floatX)]),
            # test empty size (scalar)
        ((), (), [], []),
    ]:

        #### TEST CPU IMPLEMENTATION ####
        # The python and C implementation are tested with DebugMode
        #print ''
        #print 'ON CPU with size=(%s):' % str(size)
        x = tensor.matrix()
        R = MRG_RandomStreams(234, use_cuda=False)
        # Note: we specify `nstreams` to avoid a warning.
        # TODO Look for all occurrences of `guess_n_streams` and `30 * 256`
        # for such situations: it would be better to instead filter the
        # warning using the warning module.
        u = R.uniform(size=size,
                      nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, u, mode=mode)
        assert any([
            isinstance(node.op, theano.sandbox.rng_mrg.mrg_uniform)
            for node in f.maker.fgraph.toposort()
        ])
        #theano.printing.debugprint(f)
        cpu_out = f(*input)

        #print 'CPU: random?[:10], random?[-10:]'
        #print cpu_out[0, 0:10]
        #print cpu_out[-1, -10:]

        # Increase the number of steps if the size implies only a few samples
        if numpy.prod(const_size) < 10:
            steps_ = steps * 100
        else:
            steps_ = steps
        basictest(f, steps_, const_size, prefix='mrg cpu', inputs=input)

        if mode != 'FAST_COMPILE' and cuda_available:
            #print ''
            #print 'ON GPU with size=(%s):' % str(size)
            R = MRG_RandomStreams(234, use_cuda=True)
            u = R.uniform(size=size,
                          dtype='float32',
                          nstreams=rng_mrg.guess_n_streams(size, warn=False))
            # well, it's really that this test w GPU doesn't make sense otw
            assert u.dtype == 'float32'
            f = theano.function(
                var_input,
                theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(u),
                           borrow=True),
                mode=mode_with_gpu)
            assert any([
                isinstance(node.op, theano.sandbox.rng_mrg.GPU_mrg_uniform)
                for node in f.maker.fgraph.toposort()
            ])
            #theano.printing.debugprint(f)
            gpu_out = numpy.asarray(f(*input))

            #print 'GPU: random?[:10], random?[-10:]'
            #print gpu_out[0, 0:10]
            #print gpu_out[-1, -10:]
            basictest(f, steps_, const_size, prefix='mrg  gpu', inputs=input)

            numpy.testing.assert_array_almost_equal(cpu_out,
                                                    gpu_out,
                                                    decimal=6)

        #print ''
        #print 'ON CPU w Numpy with size=(%s):' % str(size)
        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        uu = RR.uniform(size=size)
        ff = theano.function(var_input, uu, mode=mode)
        # It's not our problem if numpy generates 0 or 1
        basictest(ff,
                  steps_,
                  const_size,
                  prefix='numpy',
                  allow_01=True,
                  inputs=input)
Example 20
def _pokemon_wgan_gp():
    import os
    os.environ["FUEL_DATA_PATH"] = os.getcwd() + "/data/"
    batch_size = 20
    data_train = PokemonGenYellowNormal(which_sets=['train'],
                                        sources=['features'])

    train_stream = Flatten(DataStream.default_stream(
        data_train, iteration_scheme=SequentialScheme(
            data_train.num_examples, batch_size)))

    features_size = 56 * 56 * 1

    inits = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.)
    }

    # print train_stream.get_epoch_iterator(as_dict=True).next()
    # raise

    inputs = T.matrix('features')
    inputs = ((inputs / 255.) * 2. - 1.)

    rng = MRG_RandomStreams(123)

    prior = Z_prior(dim=512)
    gen = Generator(input_dim=512, dims=[512, 512, 512, 512,
                                         features_size],
                    alpha=0.1, **inits)

    dis = Discriminator(dims=[features_size, 512, 512 , 512, 512],
                        alpha=0.1, **inits)

    gan = GAN(dis=dis, gen=gen, prior=prior)
    gan.initialize()

    # gradient penalty
    fake_samples, _ = gan.sampling(inputs.shape[0])
    e = rng.uniform(size=(inputs.shape[0], 1))

    mixed_input = (e * fake_samples) + (1 - e) * inputs

    output_d_mixed = gan._dis.apply(mixed_input)

    grad_mixed = T.grad(T.sum(output_d_mixed), mixed_input)

    norm_grad_mixed = T.sqrt(T.sum(T.square(grad_mixed), axis=1))
    grad_penalty = T.mean(T.square(norm_grad_mixed -1))

    y_hat1, y_hat0, z = gan.apply(inputs)

    d_loss_real = y_hat1.mean()
    d_loss_fake = y_hat0.mean()
    d_loss = - d_loss_real + d_loss_fake + 10 * grad_penalty
    g_loss = - d_loss_fake


    dis_obj = d_loss
    gen_obj = g_loss

    model = Model([y_hat0, y_hat1])

    em_loss = -d_loss_real + d_loss_fake

    em_loss.name = "Earth Move loss"
    dis_obj.name = 'Discriminator loss'
    gen_obj.name = 'Generator loss'

    cg = ComputationGraph([gen_obj, dis_obj])

    gen_filter = VariableFilter(roles=[PARAMETER],
                                bricks=gen.linear_transformations)

    dis_filter = VariableFilter(roles=[PARAMETER],
                                bricks=dis.linear_transformations)

    gen_params = gen_filter(cg.variables)
    dis_params = dis_filter(cg.variables)

# Prepare the dropout
    _inputs = []
    for brick_ in [gen]:
        _inputs.extend(VariableFilter(roles=[INPUT],
                    bricks=brick_.linear_transformations)(cg.variables))

    cg_dropout = apply_dropout(cg, _inputs, 0.02)

    gen_obj = cg_dropout.outputs[0]
    dis_obj = cg_dropout.outputs[1]

    gan.dis_params = dis_params
    gan.gen_params = gen_params

    # gradient penalty

    algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj,
                              model=gan, dis_iter=5, gradient_clip=None,
                              step_rule=RMSProp(learning_rate=1e-4),
                              gen_consider_constant=z)

    neg_sample = gan.sampling(size=25)

    from blocks.monitoring.aggregation import mean

    monitor = TrainingDataMonitoring(variables=[mean(gen_obj), mean(dis_obj),
                                                mean(em_loss)],
                                     prefix="train", after_batch=True)

    subdir = './exp/' + 'pokemon-wgan-gp' + "-" + time.strftime("%Y%m%d-%H%M%S")

    check_point = Checkpoint("{}/{}".format(subdir, 'CIFAR10'),
                                every_n_epochs=100,
                                save_separately=['log', 'model'])

    neg_sampling = GenerateNegtiveSample(neg_sample,
                                         img_size=(25, 56, 56),
                                         every_n_epochs=10)

    if not os.path.exists(subdir):
        os.makedirs(subdir)

    main_loop = MainLoop(algorithm=algo, model=model,
                         data_stream=train_stream,
                         extensions=[Printing(), ProgressBar(), monitor,
                                     check_point, neg_sampling])

    main_loop.run()
Example 21
def test_normal0():

    steps = 50
    std = 2.
    if (mode in ['DEBUG_MODE', 'DebugMode', 'FAST_COMPILE']
            or mode == 'Mode' and config.linker in ['py']):
        sample_size = (25, 30)
        default_rtol = .02
    else:
        sample_size = (999, 50)
        default_rtol = .01
    sample_size_odd = (sample_size[0], sample_size[1] - 1)
    x = tensor.matrix()

    for size, const_size, var_input, input, avg, rtol, std_tol in [
        (sample_size, sample_size, [], [], -5., default_rtol, default_rtol),
        (x.shape, sample_size, [x],
         [numpy.zeros(sample_size,
                      dtype=config.floatX)], -5., default_rtol, default_rtol),
        ((x.shape[0], sample_size[1]), sample_size, [x],
         [numpy.zeros(sample_size,
                      dtype=config.floatX)], -5., default_rtol, default_rtol),
            #test odd value
        (sample_size_odd, sample_size_odd, [], [], -5., default_rtol,
         default_rtol),
            #test odd value
        (x.shape, sample_size_odd, [x],
         [numpy.zeros(sample_size_odd,
                      dtype=config.floatX)], -5., default_rtol, default_rtol),
        (sample_size, sample_size, [], [],
         numpy.arange(numpy.prod(sample_size),
                      dtype='float32').reshape(sample_size),
         10. * std / numpy.sqrt(steps), default_rtol),
            # test empty size (scalar)
        ((), (), [], [], -5., default_rtol, 0.02),
            # test with few samples at the same time
        ((1, ), (1, ), [], [], -5., default_rtol, 0.02),
        ((2, ), (2, ), [], [], -5., default_rtol, 0.02),
        ((3, ), (3, ), [], [], -5., default_rtol, 0.02),
    ]:
        #print ''
        #print 'ON CPU:'

        R = MRG_RandomStreams(234, use_cuda=False)
        # Note: we specify `nstreams` to avoid a warning.
        n = R.normal(size=size,
                     avg=avg,
                     std=std,
                     nstreams=rng_mrg.guess_n_streams(size, warn=False))
        f = theano.function(var_input, n, mode=mode)
        #theano.printing.debugprint(f)
        out = f(*input)
        #print 'random?[:10]\n', out[0, 0:10]

        # Increase the number of steps if size implies only a few samples
        if numpy.prod(const_size) < 10:
            steps_ = steps * 50
        else:
            steps_ = steps
        basictest(f,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='mrg ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol,
                  std_tol=std_tol)

        sys.stdout.flush()

        if mode != 'FAST_COMPILE' and cuda_available:
            #print ''
            #print 'ON GPU:'
            R = MRG_RandomStreams(234, use_cuda=True)
            n = R.normal(size=size,
                         avg=avg,
                         std=std,
                         dtype='float32',
                         nstreams=rng_mrg.guess_n_streams(size, warn=False))
            #well, it's really that this test w GPU doesn't make sense otw
            assert n.dtype == 'float32'
            f = theano.function(
                var_input,
                theano.Out(theano.sandbox.cuda.basic_ops.gpu_from_host(n),
                           borrow=True),
                mode=mode_with_gpu)

            #theano.printing.debugprint(f)
            sys.stdout.flush()
            gpu_out = numpy.asarray(f(*input))
            #print 'random?[:10]\n', gpu_out[0, 0:10]
            #print '----'
            sys.stdout.flush()
            basictest(f,
                      steps_,
                      const_size,
                      target_avg=avg,
                      target_std=std,
                      prefix='gpu mrg ',
                      allow_01=True,
                      inputs=input,
                      mean_rtol=rtol,
                      std_tol=std_tol)
            # Need to allow some rounding error as there are float
            # computations that are done on the GPU vs the CPU
            assert numpy.allclose(out, gpu_out, rtol=5e-6, atol=5e-6)

        #print ''
        #print 'ON CPU w NUMPY:'
        RR = theano.tensor.shared_randomstreams.RandomStreams(234)

        nn = RR.normal(size=size, avg=avg, std=std)
        ff = theano.function(var_input, nn)

        basictest(ff,
                  steps_,
                  const_size,
                  target_avg=avg,
                  target_std=std,
                  prefix='numpy ',
                  allow_01=True,
                  inputs=input,
                  mean_rtol=rtol)
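
# A minimal standalone sketch of what the test above exercises: drawing normal
# samples with MRG_RandomStreams on the CPU (seed and sizes are arbitrary).
import theano
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=234)
n = srng.normal(size=(4, 3), avg=-5., std=2.)  # symbolic normal draw
f = theano.function([], n)
f()  # a fresh (4, 3) sample with mean ~ -5 and std ~ 2 on every call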
Example 22
def test_gaussian_vis_layer_sample_conv():
    """
    Verifies that GaussianVisLayer.sample returns an expression
    whose value passes check_gaussian_samples.

    In this case the layer lives in a Conv2DSpace

    """
    assert hasattr(np, 'exp')

    n = None
    num_samples = 1000
    tol = .042  # tolerated variance
    beta = 1 / tol  # precision parameter
    rows = 3
    cols = 3
    channels = 3
    # axes for batch, rows, cols, channels, can be given in any order
    axes = ['b', 0, 1, 'c']
    random.shuffle(axes)
    axes = tuple(axes)
    print('axes:', axes)

    class DummyLayer(object):
        """
        A layer that we build for the test that just uses a state
        as its downward message.
        """
        def downward_state(self, state):
            return state

        def downward_message(self, state):
            return state

    vis = GaussianVisLayer(nvis=None,
                           rows=rows,
                           cols=cols,
                           channels=channels,
                           init_beta=beta,
                           axes=axes)
    hid = DummyLayer()

    rng = np.random.RandomState([2012, 11, 1, 259])

    mean = rng.uniform(1e-6, 1. - 1e-6, (rows, cols, channels))

    ofs = rng.randn(rows, cols, channels)

    vis.set_biases(ofs.astype(config.floatX))

    #z = inverse_sigmoid_numpy(mean) - ofs
    z = mean - ofs

    z_var = sharedX(np.zeros((num_samples, rows, cols, channels)) + z)

    theano_rng = MRG_RandomStreams(2012 + 11 + 1)

    sample = vis.sample(state_above=z_var,
                        layer_above=hid,
                        theano_rng=theano_rng)

    sample = sample.eval()

    check_gaussian_samples(sample, num_samples, n, rows, cols, channels, mean,
                           tol)
Example 23
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

from mozi.layers.template import Template
from mozi.weight_init import GaussianWeight
from mozi.utils.theano_utils import shared_zeros

floatX = theano.config.floatX
theano_rand = MRG_RandomStreams()


class VariationalAutoencoder(Template):
    def __init__(self,
                 input_dim,
                 bottlenet_dim,
                 z_dim,
                 weight_init=GaussianWeight(mean=0, std=0.01)):

        self.input_dim = input_dim
        self.bottlenet_dim = bottlenet_dim

        # encoder
        self.W_e = weight_init((input_dim, bottlenet_dim), name='W_e')
        self.b_e = shared_zeros(shape=bottlenet_dim, name='b_e')
        self.W_miu = weight_init((bottlenet_dim, z_dim), name='W_miu')
        self.b_miu = shared_zeros(shape=z_dim, name='b_miu')
        self.W_sig = weight_init((bottlenet_dim, z_dim), name='W_sig')
        self.b_sig = shared_zeros(shape=z_dim, name='b_sig')
        # decoder
        self.W1_d = weight_init((z_dim, bottlenet_dim), name='W1_d')
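
# The excerpt above is truncated before the rest of the decoder. A minimal,
# illustrative sketch (not the actual mozi code) of the reparameterization
# step such a VAE typically performs with the module-level MRG stream:
def _sample_z(miu, log_sig):
    # z = mu + sigma * epsilon, with epsilon ~ N(0, 1) drawn from theano_rand
    eps = theano_rand.normal(size=miu.shape, avg=0., std=1., dtype=floatX)
    return miu + T.exp(log_sig) * eps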
Example 24
def test_bvmp_mf_sample_consistent():

    # A test of the BinaryVectorMaxPool class
    # Verifies that the mean field update is consistent with
    # the sampling function

    # Specifically, in a DBM consisting of (v, h1, h2), the
    # lack of intra-layer connections means that
    # P(h1|v, h2) is factorial so mf_update tells us the true
    # conditional.
    # We can thus use mf_update to compute the expected value
    # of a sample of h1 from v and h2, and check that samples
    # drawn using the layer's sample method converge to that
    # value.

    rng = np.random.RandomState([2012, 11, 1, 1016])
    theano_rng = MRG_RandomStreams(2012 + 11 + 1 + 1036)
    num_samples = 1000
    tol = .042

    def do_test(pool_size_1):

        # Make DBM and read out its pieces
        dbm = make_random_basic_binary_dbm(
            rng=rng,
            pool_size_1=pool_size_1,
        )

        v = dbm.visible_layer
        h1, h2 = dbm.hidden_layers

        num_p = h1.get_output_space().dim

        # Choose which unit we will test
        p_idx = rng.randint(num_p)

        # Randomly pick a v, h1[-p_idx], and h2 to condition on
        # (Random numbers are generated via dbm.rng)
        layer_to_state = dbm.make_layer_to_state(1)
        v_state = layer_to_state[v]
        h1_state = layer_to_state[h1]
        h2_state = layer_to_state[h2]

        # Debugging checks
        num_h = h1.detector_layer_dim
        assert num_p * pool_size_1 == num_h
        pv, hv = h1_state
        assert pv.get_value().shape == (1, num_p)
        assert hv.get_value().shape == (1, num_h)

        # Infer P(h1[i] | h2, v) using mean field
        expected_p, expected_h = h1.mf_update(
            state_below=v.upward_state(v_state),
            state_above=h2.downward_state(h2_state),
            layer_above=h2)

        expected_p = expected_p[0, :]
        expected_h = expected_h[0, :]

        expected_p, expected_h = function([], [expected_p, expected_h])()

        # copy all the states out into a batch size of num_samples
        cause_copy = sharedX(np.zeros((num_samples, ))).dimshuffle(0, 'x')
        v_state = v_state[0, :] + cause_copy
        p, h = h1_state
        h1_state = (p[0, :] + cause_copy, h[0, :] + cause_copy)
        p, h = h2_state
        h2_state = (p[0, :] + cause_copy, h[0, :] + cause_copy)

        h1_samples = h1.sample(state_below=v.upward_state(v_state),
                               state_above=h2.downward_state(h2_state),
                               layer_above=h2,
                               theano_rng=theano_rng)

        h1_samples = function([], h1_samples)()

        check_bvmp_samples(h1_samples, num_samples, num_h, pool_size_1,
                           (expected_p, expected_h), tol)

    # 1 is an important corner case
    # We must also run with a larger number to test the general case
    for pool_size in [1, 2, 5]:
        do_test(pool_size)
Example 25
"""High-level modular Theano-based network components."""

from collections import OrderedDict
from functools import partial

import numpy as np
import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

from spinn.util import NUM_TRANSITION_TYPES


numpy_random = np.random.RandomState(1234)
theano_random = MRG_RandomStreams(numpy_random.randint(999999))


def UniformInitializer(range):
    return lambda shape, **kwargs: np.random.uniform(-range, range, shape)

def HeKaimingInitializer():
    def HeKaimingInit(shape, real_shape=None):
        # Calculate fan-in / fan-out using real shape if given as override
        fan = real_shape or shape

        return np.random.normal(scale=np.sqrt(4.0/(fan[0] + fan[1])),
                                size=shape)
    return HeKaimingInit


def NormalInitializer(std):
    # body assumed to mirror UniformInitializer above (the excerpt is truncated here)
    return lambda shape, **kwargs: np.random.normal(0.0, std, shape)
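
# A small usage sketch of the initializers above (shapes are arbitrary):
#   HeKaimingInitializer()((300, 600))   -> normal init with scale sqrt(4 / 900)
#   UniformInitializer(0.1)((300, 600))  -> uniform init in [-0.1, 0.1)
#   NormalInitializer(0.01)((300, 600))  -> normal init with std 0.01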
Example 26
def test_DBN(finetune_lr, pretraining_epochs, pretrain_lr, cdk, usepersistent,
             training_epochs, L1_reg, L2_reg, hidden_layers_sizes, dataset,
             batch_size, output_folder, shuffle, scaling, dropout, first_layer,
             dumppath):
    """
    Demonstrates how to train and test a Deep Belief Network.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage
    :type pretraining_epochs: int
    :param pretraining_epochs: number of epoch to do pretraining
    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training
    :type cdk: int
    :param cdk: number of Gibbs steps in CD/PCD
    :type training_epochs: int
    :param training_epochs: maximal number of iterations to run the optimizer
    :type dataset: string
    :param dataset: path to the pickled dataset
    :type batch_size: int
    :param batch_size: the size of a minibatch
    """
    print locals()

    datasets = loadmat(dataset=dataset,
                       shuffle=shuffle,
                       datasel=datasel,
                       scaling=scaling,
                       robust=robust,
                       h5py=1)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    print "%d training examples" % train_set_x.get_value(borrow=True).shape[0]
    print "%d feature dimensions" % train_set_x.get_value(borrow=True).shape[1]

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)
    print '... building the model'
    # construct the Deep Belief Network
    nclass = max(train_set_y.eval()) + 1
    dbn = DBN(numpy_rng=numpy_rng,
              n_ins=train_set_x.get_value(borrow=True).shape[1],
              hidden_layers_sizes=hidden_layers_sizes,
              n_outs=nclass,
              L1_reg=L1_reg,
              L2_reg=L2_reg,
              first_layer=first_layer)
    print 'n_ins:%d' % train_set_x.get_value(borrow=True).shape[1]
    print 'n_outs:%d' % nclass

    # SP contains an ordered list of (pos), ordered by chord class number [0,ydim-1]
    SP = balanced_seg.balanced(nclass, train_set_y)

    # getting pre-training and fine-tuning functions
    # save images of the weights(receptive fields) in this output folder
    # if not os.path.isdir(output_folder):
    # os.makedirs(output_folder)
    # os.chdir(output_folder)

    print '... getting the pretraining functions'
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size,
                                                cdk=cdk,
                                                usepersistent=usepersistent)
    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'

    train_fn, train_model, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

    trng = MRG_RandomStreams(1234)
    use_noise = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))
    if dropout:
        # dbn.x = dropout_layer(use_noise, dbn.x, trng, 0.8)
        for i in range(dbn.n_layers):
            dbn.sigmoid_layers[i].output = dropout_layer(
                use_noise, dbn.sigmoid_layers[i].output, trng, 0.5)

    # start-snippet-2
    #########################
    # PRETRAINING THE MODEL #
    #########################

    print '... pre-training the model'
    plotting_time = 0.
    start_time = timeit.default_timer()
    ## Pre-train layer-wise
    for i in xrange(dbn.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            if pretrain_dropout:
                use_noise.set_value(1.)  # use dropout at pre-training
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr))
            print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
            print numpy.mean(c)
            '''
            for j in range(dbn.n_layers):
                if j == 0:
                    # Plot filters after each training epoch
                    plotting_start = timeit.default_timer()
                    # Construct image from the weight matrix
                    this_layer = dbn.rbm_layers[j]
                    this_field = this_layer.W.get_value(borrow=True).T
                    print "field shape (%d,%d)"%this_field.shape
                    image = Image.fromarray(
                        tile_raster_images(
                            X=this_field[0:100], # take only the first 100 fields (100 * n_visible)
                            #the img_shape and tile_shape depends on n_visible and n_hidden of this_layer
                            # if n_visible = 144 (12,12), if n_visible = 1512 (36,42)
                            img_shape=(12, 12),
                            tile_shape=(10, 10),
                            tile_spacing=(1, 1)
                        )
                    )
                    image.save('filters_at_epoch_%i.png' % epoch)
                    plotting_stop = timeit.default_timer()
                    plotting_time += (plotting_stop - plotting_start)
            '''

    end_time = timeit.default_timer()
    # end-snippet-2
    print >> sys.stderr, ('The pretraining code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((end_time - start_time) / 60.))
    ########################
    # FINETUNING THE MODEL #
    ########################

    print '... finetuning the model'
    # early-stopping parameters
    patience = 10 * n_train_batches  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.999  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    # while (epoch < training_epochs) and (not done_looping):
    while (epoch < training_epochs):
        if earlystop and done_looping:
            print 'early-stopping'
            break
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            use_noise.set_value(1.)  # use dropout at training time
            # FIXME: n_train_batches is a fake item
            bc_idx = balanced_seg.get_bc_idx(SP, nclass)
            minibatch_avg_cost = train_fn(bc_idx)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                use_noise.set_value(0.)  # stop dropout at validation/test time
                validation_losses = validate_model()
                training_losses = train_model()
                this_validation_loss = numpy.mean(validation_losses)
                this_training_loss = numpy.mean(training_losses)

                # also monitor the training losses
                print('epoch %i, minibatch %i/%i, training error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_training_loss * 100.))

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    with open(dumppath, "wb") as f:
                        cPickle.dump(dbn.params, f)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    '''
                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
                    '''

            if patience <= iter:
                done_looping = True
                if earlystop:
                    break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, '
           'obtained at iteration %i, '
           'with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The fine tuning code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((end_time - start_time) / 60.))
Example 27
def apply_adaptive_noise(
    computation_graph,
    cost,
    variables,
    num_examples,
    parameters=None,
    init_sigma=1e-6,
    model_cost_coefficient=1.0,
    seed=None,
    gradients=None,
):
    """Add adaptive noise to parameters of a model.

    Each of the given variables will be replaced by a normal
    distribution with learned mean and standard deviation.

    A model cost is computed based on the precision of the distributions
    associated with each variable. It is added to the given cost used to
    train the model.

    See: A. Graves "Practical Variational Inference for Neural Networks",
         NIPS 2011

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    cost : :class:`~tensor.TensorVariable`
        The cost without weight noise. It should be a member of the
        computation_graph.
    variables : :class:`~tensor.TensorVariable`
        Variables to add noise to.
    num_examples : int
        Number of training examples. The cost of the model is divided by
        the number of training examples; see
        A. Graves, "Practical Variational Inference for Neural Networks",
        for justification.
    parameters : list of :class:`~tensor.TensorVariable`
        Parameters of the model; if gradients are given, the list will not
        be used. Otherwise, it will be used to compute the gradients.
    init_sigma : float
        Initial standard deviation of the noise variables.
    model_cost_coefficient : float
        The weight of the model cost.
    seed : int, optional
        The seed with which
        :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` is initialized,
        is set to 1 by default.
    gradients : dict, optional
        Adaptive weight noise introduces new parameters for which new cost
        and gradients must be computed. Unless the gradients parameter is
        given, theano.grad will be used to compute the gradients.
    Returns
    -------

    cost : :class:`~tensor.TensorVariable`
        The new cost
    computation_graph : instance of :class:`ComputationGraph`
        new graph with added noise.
    gradients : dict
        a dictionary of gradients for all parameters: the original ones
        and the adaptive noise ones
    noise_brick : :class:~lvsr.graph.NoiseBrick
        the brick that holds all noise parameters and whose .apply method
        can be used to find variables added by adaptive noise
    """
    if not seed:
        seed = config.default_seed
    rng = MRG_RandomStreams(seed)

    try:
        cost_index = computation_graph.outputs.index(cost)
    except ValueError:
        raise ValueError("cost is not part of the computation_graph")

    if gradients is None:
        if parameters is None:
            raise ValueError("Either gradients or parameters must be given")
        logger.info("Taking the cost gradient")
        gradients = dict(equizip(parameters, tensor.grad(cost, parameters)))
    else:
        if parameters is not None:
            logger.warn("Both gradients and parameters given, will ignore"
                        "parameters")
        parameters = gradients.keys()

    gradients = OrderedDict(gradients)

    log_sigma_scale = 2048.0

    P_noisy = variables  # We will add noise to these
    Beta = []  # will hold means, log_stdev and stdevs
    P_with_noise = []  # will hold params with added noise

    # These don't change
    P_clean = list(set(parameters).difference(P_noisy))

    noise_brick = NoiseBrick()

    for p in P_noisy:
        p_u = p
        p_val = p.get_value(borrow=True)
        p_ls2 = theano.shared(
            (numpy.zeros_like(p_val) +
             numpy.log(init_sigma) * 2. / log_sigma_scale).astype(
                 dtype=numpy.float32))
        p_ls2.name = __get_name(p_u)
        noise_brick.parameters.append(p_ls2)
        p_s2 = tensor.exp(p_ls2 * log_sigma_scale)
        Beta.append((p_u, p_ls2, p_s2))

        p_noisy = p_u + rng.normal(size=p_val.shape) * tensor.sqrt(p_s2)
        p_noisy = tensor.patternbroadcast(p_noisy, p.type.broadcastable)
        P_with_noise.append(p_noisy)

    #  compute the prior mean and variance
    temp_sum = 0.0
    temp_param_count = 0.0
    for p_u, unused_p_ls2, unused_p_s2 in Beta:
        temp_sum = temp_sum + p_u.sum()
        temp_param_count = temp_param_count + p_u.shape.prod()

    prior_u = tensor.cast(temp_sum / temp_param_count, 'float32')

    temp_sum = 0.0
    for p_u, unused_ls2, p_s2 in Beta:
        temp_sum = temp_sum + (p_s2).sum() + (((p_u - prior_u)**2).sum())

    prior_s2 = tensor.cast(temp_sum / temp_param_count, 'float32')

    #  convert everything to use the noisy parameters
    full_computation_graph = ComputationGraph(computation_graph.outputs +
                                              gradients.values())
    full_computation_graph = full_computation_graph.replace(
        dict(zip(P_noisy, P_with_noise)))

    LC = 0.0  # model cost
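    # For each noisy parameter this accumulates the KL divergence between the
    # learned Gaussian posterior N(p_u, p_s2) and the shared Gaussian prior
    # N(prior_u, prior_s2), as in Graves (2011); note that
    # log(p_s2) = p_ls2 * log_sigma_scale.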
    for p_u, p_ls2, p_s2 in Beta:
        LC = (LC + 0.5 *
              ((tensor.log(prior_s2) - p_ls2 * log_sigma_scale).sum()) + 1.0 /
              (2.0 * prior_s2) *
              (((p_u - prior_u)**2) + p_s2 - prior_s2).sum())

    LC = LC / num_examples * model_cost_coefficient

    train_cost = noise_brick.apply(
        full_computation_graph.outputs[cost_index].copy(), LC, prior_u,
        prior_s2)

    gradients = OrderedDict(
        zip(gradients.keys(),
            full_computation_graph.outputs[-len(gradients):]))

    #
    # Delete the gradients from the computational graph
    #
    del full_computation_graph.outputs[-len(gradients):]

    new_grads = {p: gradients.pop(p) for p in P_clean}

    #
    # Warning!!!
    # This only works for batch size 1 (we need the sum of squares
    # to be the square of the sum!)
    #
    diag_hessian_estimate = {p: g**2 for p, g in gradients.iteritems()}
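    # (The squared gradient is used here as a per-parameter estimate of the
    #  diagonal of the Hessian, hence the name diag_hessian_estimate.)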

    for p_u, p_ls2, p_s2 in Beta:
        p_grad = gradients[p_u]
        p_u_grad = (model_cost_coefficient * (p_u - prior_u) /
                    (num_examples * prior_s2) + p_grad)

        p_ls2_grad = (
            numpy.float32(model_cost_coefficient * 0.5 / num_examples *
                          log_sigma_scale) * (p_s2 / prior_s2 - 1.0) +
            (0.5 * log_sigma_scale) * p_s2 * diag_hessian_estimate[p_u])
        new_grads[p_u] = p_u_grad
        new_grads[p_ls2] = p_ls2_grad

    return train_cost, full_computation_graph, new_grads, noise_brick
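
# A minimal usage sketch of apply_adaptive_noise (assuming the Blocks/lvsr
# setup this function relies on; the toy model and all names below are
# illustrative, not from the original project):
import numpy
import theano
from theano import tensor
from blocks.graph import ComputationGraph

W = theano.shared(numpy.zeros((3, 4), dtype='float32'), name='W')
x = tensor.matrix('x')
toy_cost = (tensor.dot(x, W) ** 2).sum()
cg = ComputationGraph([toy_cost])

train_cost, noisy_cg, grads, noise_brick = apply_adaptive_noise(
    computation_graph=cg, cost=toy_cost, variables=[W],
    num_examples=1000, parameters=[W])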
Example 28
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 L1_reg=0,
                 L2_reg=0,
                 first_layer='grbm',
                 model=None):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.sandbox.rng_mrg.MRG_RandomStreams
        :param theano_rng: Theano random generator; if None is given, one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.L1 = 0
        self.L2_sqr = 0

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
        # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[i - 1].output

            if model is None:
                W = None
                b = None
            else:
                W = model[i * 2]
                b = model[i * 2 + 1]

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        W=W,
                                        b=b,
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.L1 += (abs(sigmoid_layer.W).sum())
            self.L2_sqr += ((sigmoid_layer.W**2).sum())

            # it's arguably a philosophical question... but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            if i == 0:  # first layer GBRBM - dealing with continuous values
                if first_layer == 'grbm':
                    rbm_layer = GRBM(numpy_rng=numpy_rng,
                                     theano_rng=theano_rng,
                                     input=layer_input,
                                     n_visible=input_size,
                                     n_hidden=hidden_layers_sizes[i],
                                     W=sigmoid_layer.W,
                                     hbias=sigmoid_layer.b)
                if first_layer == 'rbm':
                    rbm_layer = RBM(numpy_rng=numpy_rng,
                                    theano_rng=theano_rng,
                                    input=layer_input,
                                    n_visible=input_size,
                                    n_hidden=hidden_layers_sizes[i],
                                    W=sigmoid_layer.W,
                                    hbias=sigmoid_layer.b)
            # elif i == self.n_layers-1: # last layer GGRBM
            # rbm_layer = GRBM(numpy_rng=numpy_rng,
            # theano_rng=theano_rng,
            # input=layer_input,
            # n_visible=input_size,
            # n_hidden=hidden_layers_sizes[i],
            # W=sigmoid_layer.W,
            # hbias=sigmoid_layer.b)
            else:  # subsequent layers BBRBM - binary RBMs to cope with regularization
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if model is None:
            W = None
            b = None
        else:
            W = model[-2]
            b = model[-1]
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            W=W,
            b=b,
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        self.L1 += (abs(self.logLayer.W).sum())

        self.L2_sqr += ((self.logLayer.W**2).sum())

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = (self.logLayer.negative_log_likelihood(self.y) +
                              L1_reg * self.L1 + L2_reg * self.L2_sqr)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        self.predprobs = self.logLayer.p_y_given_x
        self.preds = self.logLayer.y_pred
Example 29
def apply_dropout(computation_graph,
                  variables,
                  drop_prob,
                  rng=None,
                  seed=None):
    """Returns a graph to variables in a computational graph.

    Parameters
    ----------
    computation_graph : instance of :class:`ComputationGraph`
        The computation graph.
    variables : list of :class:`~tensor.TensorVariable`
        Variables to be dropped out.
    drop_prob : float
        Probability of dropping out. If you want to apply the dropout
        with different probabilities for different layers, call it
        several times.
    rng : :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams`
        Random number generator.
    seed : int
        Random seed to be used if `rng` was not specified.

    Notes
    -----
    For more information, see [DROPOUT]_.

    .. [DROPOUT] Hinton et al. *Improving neural networks by preventing
       co-adaptation of feature detectors*, arXiv:1207.0580.

    Examples
    --------
    >>> import numpy
    >>> from theano import tensor, function
    >>> from blocks.bricks import MLP, Identity
    >>> from blocks.filter import VariableFilter
    >>> from blocks.initialization import Constant
    >>> from blocks.roles import INPUT
    >>> linear = MLP([Identity(), Identity()], [2, 10, 2],
    ...              weights_init=Constant(1), biases_init=Constant(2))
    >>> x = tensor.matrix('x')
    >>> y = linear.apply(x)
    >>> cg = ComputationGraph(y)

    We are going to drop out all the input variables

    >>> inputs = VariableFilter(roles=[INPUT])(cg.variables)

    Here we apply dropout with default setting to our computation graph

    >>> cg_dropout = apply_dropout(cg, inputs, 0.5)

    Dropped out variables have role `DROPOUT` and are tagged with
    `replacement_of` tag. Let's filter these variables and check if they
    have the links to original ones.

    >>> dropped_out = VariableFilter(roles=[DROPOUT])(cg_dropout.variables)
    >>> inputs_referenced = [var.tag.replacement_of for var in dropped_out]
    >>> set(inputs) == set(inputs_referenced)
    True

    Compiling theano functions to forward propagate in original and dropped
    out graphs

    >>> fprop = function(cg.inputs, cg.outputs[0])
    >>> fprop_dropout = function(cg_dropout.inputs, cg_dropout.outputs[0])

    Initialize an MLP and apply these functions

    >>> linear.initialize()
    >>> fprop(numpy.ones((3, 2),
    ...       dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[ 42.,  42.],
           [ 42.,  42.],
           [ 42.,  42.]]...
    >>> fprop_dropout(numpy.ones((3, 2),
    ...               dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[ 0.,  0.],
           [ 0.,  0.],
           [ 0.,  0.]]...

    And after the second run the answer is different

    >>> fprop_dropout(numpy.ones((3, 2),
    ...               dtype=theano.config.floatX))  # doctest:+ELLIPSIS
    array([[   0.,   52.],
           [ 100.,    0.],
           [   0.,    0.]]...

    """
    if not rng and not seed:
        seed = config.default_seed
    if not rng:
        rng = MRG_RandomStreams(seed)

    replacements = [
        (var, var * rng.binomial(var.shape, p=1 - drop_prob, dtype=floatX) /
         (1 - drop_prob)) for var in variables
    ]
    for variable, replacement in replacements:
        add_role(replacement, DROPOUT)
        replacement.tag.replacement_of = variable

    return computation_graph.replace(replacements)
Example 30
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        layer_type = args.layer.lower()
        for i in xrange(1):
            l = CNN(n_in=n_e,
                    n_out=n_d,
                    activation=activation,
                    order=args.order)
            layers.append(l)

        # len * batch
        masks = T.cast(T.neq(x, padding_id), "int8").dimshuffle((0, 1, 'x'))

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)
        self.word_embs = embs

        # len*batch*n_d
        h1 = layers[0].forward_all(embs)
        h_final = h1
        size = n_d
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=1,
                                                 activation=sigmoid)

        # len*batch*1
        probs = output_layer.forward(h_final)

        # len*batch
        self.MRG_rng = MRG_RandomStreams()
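        # draw a hard binary mask z: each element is sampled independently
        # from Bernoulli(probs), i.e. one keep/drop decision per position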
        z_pred_dim3 = self.MRG_rng.binomial(size=probs.shape,
                                            p=probs,
                                            dtype="int8")
        z_pred = z_pred_dim3.reshape(x.shape)

        # we are computing an approximated gradient by sampling z,
        # so the sampled z must be marked as not being part of the
        # gradient propagation path
        #
        z_pred = self.z_pred = theano.gradient.disconnected_grad(z_pred)
        print "z_pred", z_pred.ndim

        #logpz = - T.nnet.binary_crossentropy(probs, z_pred_dim3) * masks
        logpz = -T.nnet.binary_crossentropy(probs, z_pred_dim3)
        logpz = self.logpz = logpz.reshape(x.shape)
        probs = self.probs = probs.reshape(x.shape)

        # batch
        z = z_pred
        self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
        self.zdiff = T.sum(T.abs_(z[1:] - z[:-1]),
                           axis=0,
                           dtype=theano.config.floatX)

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost