def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                                name='W', borrow=True)
        # initialize the biases b as a vector of n_out 0s
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute vector of class-membership probabilities in symbolic form
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute prediction as class whose probability is maximal in
        # symbolic form
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the model
        self.params = [self.W, self.b]
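A brief usage sketch for context; the enclosing class name (LogisticRegression) and the MNIST-style sizes below are assumptions, not part of the snippet:

# Hypothetical usage, assuming the enclosing class is LogisticRegression
# and theano / theano.tensor as T are imported as above.
x = T.matrix('x')                                  # one minibatch of inputs
classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)
predict = theano.function([x], classifier.y_pred)  # argmax class per row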
Example #2
    def architecture(self, cons, code_layer):
        """Build up the architecture by theano"""
        for i in range(len(self.layers)-1):
            # Initialize shared variables
            init_w = cons*np.random.randn(self.layers[i], self.layers[i+1])
            self.weights.append(th.shared(init_w))
            init_bias = cons*np.random.randn(self.layers[i+1])
            self.biases.append(th.shared(init_bias))

            # Building architecture
            a_before = T.dot(self.a_n[i], self.weights[i]) + \
                self.biases[i].dimshuffle('x', 0)
            a_next = self.activ(a_before)
            self.a_n.append(a_next)

        # auxiliary shared variables (one per parameter), passed to the update rule
        for param in (self.weights+self.biases):
            self.auxiliary.append(th.shared(np.zeros(param.get_value().shape)))

        self.encode = th.function([self.x], self.a_n[code_layer])
        self.decode = th.function([self.a_n[code_layer]], self.a_n[-1])

        # Calculate the cost and gradients
        Cost = (T.sum((self.a_n[-1]-self.y_hat)**2))/self.batch
        params = self.weights + self.biases
        grads = T.grad(Cost, params, disconnected_inputs='ignore')

        # Update parameters
        update_query = self.update(params, grads, self.auxiliary)
        self.gradient_2 = th.function(inputs=[self.x, self.y_hat],
                                      updates=update_query, outputs=Cost)
Example #3
    def __init__(self, x, y, in_size, out_size, prefix='lr_'):

        self.W = theano.shared(
            value=np.random.uniform(
                low=-np.sqrt(6. / (in_size + out_size)),
                high=np.sqrt(6. / (in_size + out_size)),
                size=(in_size, out_size)
            ).astype(theano.config.floatX),
            name='W',
            borrow=True
        )

        self.b = theano.shared(
            value=np.random.uniform(
                low=-np.sqrt(6. / (in_size + out_size)),
                high=np.sqrt(6. / (in_size + out_size)),
                size=(out_size,)
            ).astype(theano.config.floatX),
            name='b',
            borrow=True
        )

        self.y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b)

        self.y_d = T.argmax(self.y_given_x, axis=1)

        self.loss = -T.mean(T.log(self.y_given_x)[T.arange(y.shape[0]), y])

        self.error = T.mean(T.neq(self.y_d, y))

        self.params = {prefix+'W': self.W, prefix+'b': self.b}
Example #4
def adam(loss, all_params, learn_rate=0.001, b1=0.9, b2=0.999, e=1e-8, gamma=1-1e-8):
    """ADAM update rules

    Kingma, Diederik, and Jimmy Ba. "Adam: A Method for Stochastic Optimization." arXiv preprint arXiv:1412.6980 (2014). http://arxiv.org/pdf/1412.6980v4.pdf
    """
    updates = []
    all_grads = theano.grad(loss, all_params)
    alpha = learn_rate
    t = theano.shared(np.float32(1.))
    b1_t = b1 * gamma ** (t - 1.)   # decay the first moment running average coefficient

    for theta_prev, g in zip(all_params, all_grads):
        m_prev = theano.shared(np.zeros(theta_prev.get_value().shape, dtype=theano.config.floatX))
        v_prev = theano.shared(np.zeros(theta_prev.get_value().shape, dtype=theano.config.floatX))

        m = b1_t * m_prev + (1. - b1_t) * g  # update biased first moment estimate
        v = b2 * v_prev + (1. - b2) * g ** 2  # update biased second raw moment estimate
        m_hat = m / (1. - b1 ** t)  # compute bias-corrected first moment estimate
        v_hat = v / (1. - b2 ** t)  # compute bias-corrected second raw moment estimate
        theta = theta_prev - (alpha * m_hat) / (T.sqrt(v_hat) + e)  # update parameters

        updates.append((m_prev, m))
        updates.append((v_prev, v))
        updates.append((theta_prev, theta))
    updates.append((t, t + 1.))
    return updates
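A minimal sketch of how the returned update list is typically compiled into a training step; the toy quadratic objective below is hypothetical and assumes the numpy/theano imports used above:

# Hypothetical toy usage: minimise (w - 3)^2 with the adam() updates above.
w = theano.shared(np.asarray(0., dtype=theano.config.floatX), name='w')
loss = (w - 3.) ** 2
train_step = theano.function([], loss, updates=adam(loss, [w]))
for _ in range(500):
    train_step()
# w.get_value() is now close to 3.0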
Example #5
    def __init__(self, n_in, n_out, W_init=None, b_init=None,
                 activation=T.tanh):
        self.activation = activation
        if W_init is None:
            rng = numpy.random.RandomState(1234)
            W_values = numpy.asarray(rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W_init = theano.shared(value=W_values, name='W', borrow=True)

        if b_init is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b_init = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W_init
        self.b = b_init
        # parameters of the model
        self.params = [self.W, self.b]
Example #6
 def __init__(self, adapted_iterator, n_batches, max_mem=8):
     self.adapted_iterator = adapted_iterator
     self.batch_size = adapted_iterator.get_batch_size()
     self.max_mem_bytes = max_mem * 1024 * 1024
     self.n_batches = n_batches
     self.arity = self.adapted_iterator.get_arity()
      adapted_batch_size = self.max_mem_bytes // adapted_iterator.get_datapoint_sizes().sum()  # Number of datapoints that can be held in max_mem
     adapted_batch_size = (adapted_batch_size // self.batch_size) * self.batch_size  # Make it hold an exact integer multiple of self.batch_size
     if self.n_batches is None:
         adapted_iterator.n_batches = None
     else:
         adapted_iterator.n_batches = np.inf  # The adapted iterator must loop forever, limiting the number of batches is now done here.
     adapted_iterator.set_batch_size(adapted_batch_size, True)  # True means get smaller final minibatch
     # Create buffers for each of the elements
     self.buffers = []
     self.minibatch = []
     for i, dimensionality in enumerate(self.adapted_iterator.get_datapoint_dimensionalities()):
         self.buffers.append(theano.shared(np.zeros((adapted_batch_size, dimensionality),
                                                    dtype=self.adapted_iterator.dataset.get_type(i)),
                                          name="buffer_%d" % i))  # The big buffer that holds many batches
         self.minibatch.append(theano.shared(value=np.zeros((self.batch_size, dimensionality),
                                                            dtype=self.adapted_iterator.dataset.get_type(i)),
                                             name="minibatch_%d" % i))
     self.buffer_index = 0
     self.datapoints_in_buffer = 0
Example #7
 def shared(data):
     """ Place the data into shared variables. This allows Theano to copy
     the data to the GPU, if one is available.
     """
     shared_x = theano.shared(numpy.asarray(data[:,0].tolist(), dtype=theano.config.floatX), borrow=True)
     shared_y = theano.shared(numpy.asarray(data[:,1].tolist(), dtype=theano.config.floatX), borrow=True)
     return shared_x, T.cast(shared_y, "int32")
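Roughly how this helper would be called; the toy object array below (rows of feature-vector/label pairs) is hypothetical:

# Hypothetical usage: each row of `data` holds (feature_vector, integer_label).
data = numpy.array([([0.0, 1.0], 0), ([1.0, 0.0], 1)], dtype=object)
train_x, train_y = shared(data)   # train_x is a float shared variable, train_y is cast to int32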
Example #8
    def __init__(self, input, n_in, n_out):
        """ロジスティック回帰モデルの初期化

        input: ミニバッチ単位のデータ行列(n_samples, n_in)
        n_in : 入力の次元数
        n_out: 出力の次元数
        """
        # initialize the weight matrix
        self.W = theano.shared(value=np.zeros((n_in, n_out),
                                              dtype=theano.config.floatX),
                               name='W',
                               borrow=True)

        # initialize the bias vector
        self.b = theano.shared(value=np.zeros((n_out,),
                                              dtype=theano.config.floatX),
                               name='b',
                               borrow=True)

        # symbolic expression for the probability of each sample belonging to each class
        # the whole batch is computed at once as a matrix operation
        # the output is an (n_samples, n_out) matrix
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        # compute the index of the class with the highest probability
        # the output is an (n_samples,) vector
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)

        # parameters of the logistic regression model
        self.params = [self.W, self.b]
Example #9
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        self.input = input[0]

        # initialize weights into this layer
        if W is None:
            W_values = np.asarray(
                rng.uniform(
                    size=(n_in, n_out),
                    low=-np.sqrt(6. / (n_in + n_out)),
                    high=np.sqrt(6. / (n_in + n_out)),
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)

        # initialize bias term weights into this layer
        if b is None:
            b_values = np.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)

        self.W = W
        self.b = b

        lin_output = T.dot(self.input, self.W) + self.b
        self.output = (
            lin_output if activation is None
            else activation(lin_output)
        )

        self.params = [self.W, self.b]
Example #10
 def __init__(self, class_dim, word_dim, hidden_dim, sen_len, batch_size, truncate=-1):
     # Assign instance variables
     self.class_dim = class_dim
     self.word_dim = word_dim
     self.hidden_dim = hidden_dim
     self.sen_len = sen_len
     self.batch_size = batch_size
     self.truncate = truncate
     params = {}
     # Initialize the network parameters
     params["E"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))          #Ebdding Matirx
     params["W"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (4, hidden_dim, hidden_dim * 4)) #W[0-1].dot(x), W[2-3].(i,f,o,c)
     params["B"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (2, hidden_dim * 4))             #B[0-1] for W[0-1]
     params["lrW"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (2, hidden_dim, class_dim))         #LR W and b
     params["lrb"] = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (class_dim))
     
      # Assign parameters' names
     self.param_names = {"orign":["E", "W", "B", "lrW", "lrb"], 
                        "cache":["mE", "mW", "mB", "mlrW", "mlrb"]}
     # Theano: Created shared variables
     self.params = {}
     # Model's shared variables
     for _n in self.param_names["orign"]:
         self.params[_n] = theano.shared(value=params[_n].astype(theano.config.floatX), name=_n)
     # Shared variables for RMSProp
     for _n in self.param_names["cache"]:
         self.params[_n] = theano.shared(value=np.zeros(params[_n[1:]].shape).astype(theano.config.floatX), name=_n)
     # Build model graph
     self.__theano_build__()
Example #11
 def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
              activation_fn=sigmoid):
     """`filter_shape` is a tuple of length 4, whose entries are the number
     of filters, the number of input feature maps, the filter height, and the
     filter width.
     `image_shape` is a tuple of length 4, whose entries are the
     mini-batch size, the number of input feature maps, the image
     height, and the image width.
     `poolsize` is a tuple of length 2, whose entries are the y and
     x pooling sizes.
     """
     self.filter_shape = filter_shape
     self.image_shape = image_shape
     self.poolsize = poolsize
     self.activation_fn=activation_fn
     # initialize weights and biases
     n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
     self.w = theano.shared(
         np.asarray(
             np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
             dtype=theano.config.floatX),
         borrow=True)
     self.b = theano.shared(
         np.asarray(
             np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
             dtype=theano.config.floatX),
         borrow=True)
     self.params = [self.w, self.b]
Example #12
    def __init__(self, input, n_in, n_out):
        self.W = theano.shared(
            value = numpy.zeros(
                (n_in, n_out),
                dtype = theano.config.floatX
            ),
            name = 'W',
            borrow = True
        )

        self.b = theano.shared(
            value = numpy.zeros(
                (n_out,),
                dtype = theano.config.floatX
            ),
            name = 'b',
            borrow = True
        )

        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)

        self.y_pred = T.argmax(self.p_y_given_x, axis = 1)

        self.params = [self.W, self.b]

        self.input = input
Example #13
    def init_conv_filters(self, numpy_rng, D, poolsize):
        ''' Convolutional Filters '''
        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = np.prod(self.filter_shape[1:])

        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" pooling size
        fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]) /
                   np.prod(poolsize))

        # initialize weights with random weights
        W_bound = np.sqrt(6. / (fan_in + fan_out))

        self.W = theano.shared(
                init_conv_weights(-W_bound, W_bound, \
                        self.filter_shape, numpy_rng),borrow=True, name='W_conv')

        #b_values = np.zeros((self.filter_shape[0],), dtype=theano.config.floatX)
        #self.b = theano.shared(value=b_values, borrow=True, name='b_conv')

        c_values = np.zeros((self.filter_shape[1],), dtype=theano.config.floatX)
        self.c = theano.shared(value=c_values, borrow=True, name='b_conv')

        self.params = [self.W, self.c]
Example #14
    def __init__(self, input_variable, rng, n_in=None, n_out=None, weights=None,
                 biases=None, activation=T.tanh):
        self.input_variable = input_variable
        if not weights:
            assert n_in is not None
            assert n_out is not None
            W_values = np.asarray(rng.uniform(
                low=-np.sqrt(6. / (n_in + n_out)),
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4

            W = theano.shared(value=W_values, name='W', borrow=True)
            b_values = np.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b', borrow=True)
        else:
            W = weights
            b = biases

        self.W = W
        self.b = b

        linear_output = T.dot(self.input_variable, self.W) + self.b
        self.output = (linear_output if activation is None
                       else activation(linear_output))
        self.params = [self.W, self.b]
Example #15
def stack_and_shared(input):
    """
    This will take a list of input variables, turn them into theano shared variables, and return them stacked
    in a single tensor.

    Parameters
    ----------
    input : list or object
        List of input variables to stack into a single shared tensor.

    Returns
    -------
    tensor
        Symbolic tensor of the input variables stacked, or None if input was None.
    """
    if input is None:
        return None
    elif isinstance(input, list):
        shared_ins = []
        for _in in input:
            try:
                shared_ins.append(theano.shared(_in))
            except TypeError as _:
                shared_ins.append(_in)
        return T.stack(shared_ins)
    else:
        try:
            _output = [theano.shared(input)]
        except TypeError as _:
            _output = [input]
        return T.stack(_output)
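For example (the arrays are made up), a list of NumPy arrays is wrapped in shared variables and stacked along a new leading axis:

# Hypothetical usage of stack_and_shared.
import numpy as np
stacked = stack_and_shared([np.ones(3), np.zeros(3)])
print(stacked.eval())   # a (2, 3) array: ones in row 0, zeros in row 1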
Example #16
def adam(lr, tparams, grads, inp, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup)

    lr0 = 0.0002
    b1 = 0.1
    b2 = 0.001
    e = 1e-8

    updates = []

    i = theano.shared(numpy.float32(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr0 * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates, on_unused_input='ignore')

    return f_grad_shared, f_update
Example #17
    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows,#cols)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(numpy.asarray(
            rng.uniform(low=-W_bound, high=W_bound, size=filter_shape),
            dtype=theano.config.floatX),
                               borrow=True)

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(input=input, filters=self.W,
                filter_shape=filter_shape, image_shape=image_shape)

        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(input=conv_out,
                                            ds=poolsize, ignore_border=True)

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        self.params = [self.W, self.b]
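A usage sketch; the batch size, image size and filter counts below are illustrative assumptions:

# Hypothetical usage: 20 5x5 filters over a batch of 500 single-channel 28x28 images.
rng = numpy.random.RandomState(1234)
x = T.tensor4('x')
layer0 = LeNetConvPoolLayer(rng, input=x,
                            filter_shape=(20, 1, 5, 5),
                            image_shape=(500, 1, 28, 28),
                            poolsize=(2, 2))
# layer0.output has shape (500, 20, 12, 12) after the 2x2 max-pooling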
Example #18
def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'):
    """
    adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = numpy.zeros_like(param.get_value())
        exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step =  -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        if (param.get_value(borrow=True).ndim == 2) and (param.name!='Words'):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            tmp=stepped_param * scale
            tmp=T.cast(tmp,'float32')
            #print param.type,tmp.type
            updates[param] = tmp
        else:
            updates[param] = stepped_param
            #print param.type,stepped_param.type
    return updates 
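A hypothetical usage sketch (the softmax-regression cost and sizes are made up); it assumes the imports used above plus the project's as_floatX helper, and floatX == float32 since the clipping branch above casts to float32:

# Hypothetical usage: softmax regression trained with the Adadelta updates above.
x = T.matrix('x')
y = T.ivector('y')
W = theano.shared(numpy.zeros((20, 5), dtype='float32'), name='W')
b = theano.shared(numpy.zeros((5,), dtype='float32'), name='b')
p_y = T.nnet.softmax(T.dot(x, W) + b)
cost = -T.mean(T.log(p_y)[T.arange(y.shape[0]), y])
train_fn = theano.function([x, y], cost,
                           updates=sgd_updates_adadelta([W, b], cost))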
Example #19
    def __init__(self, rng, input, n_in, n_out, W = None, b = None):

        self.input = input

        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
        if W is None:
            W_value = rng.normal(0.0, 1.0/numpy.sqrt(n_in), size=(n_in, n_out))
            W = theano.shared(value=numpy.asarray(W_value, dtype=theano.config.floatX), name='W', borrow=True)

        if b is None:
            b = theano.shared(value=numpy.zeros((n_out,),
                                        dtype=theano.config.floatX),
                                   name='b', borrow=True)

        self.W = W
        self.b = b

        self.delta_W = theano.shared(value = numpy.zeros((n_in,n_out),
                                     dtype=theano.config.floatX), name='delta_W')

        self.delta_b = theano.shared(value = numpy.zeros_like(self.b.get_value(borrow=True),
                                     dtype=theano.config.floatX), name='delta_b')

        self.output = T.dot(self.input, self.W) + self.b

        self.params = [self.W, self.b]
        self.delta_params = [self.delta_W, self.delta_b]
Example #20
def adadelta(lr,tparams,grads,x,mask,y,cost):
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    # parameter update pairs built from the update directions
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #21
def sigmoid_layer(input, n_in, n_out, rng):
    w_init = rng.uniform(
        low=-4 * np.sqrt(6.0 / (n_in + n_out)), high=4 * np.sqrt(6.0 / (n_in + n_out)), size=(n_in, n_out)
    )
    W = theano.shared(np.asarray(w_init, dtype=theano.config.floatX), name="W", borrow=True)
    b = theano.shared(np.zeros((n_out,), dtype=theano.config.floatX), name="b", borrow=True)
    return T.nnet.sigmoid(T.dot(input, W) + b), [W, b]
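For instance (the layer sizes are arbitrary), two such layers can be chained and their parameters collected:

# Hypothetical usage: a two-layer sigmoid network over a symbolic input matrix.
rng = np.random.RandomState(0)
x = T.matrix('x')
h1, params1 = sigmoid_layer(x, n_in=784, n_out=256, rng=rng)
out, params2 = sigmoid_layer(h1, n_in=256, n_out=10, rng=rng)
params = params1 + params2   # [W1, b1, W2, b2], ready for T.grad and an update rule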
Example #22
    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
                 activation=T.tanh):
        if W is None:
            W_values = numpy.asarray(
                rng.uniform(
                    low=-numpy.sqrt(6. / (n_in + n_out)),
                    high=numpy.sqrt(6. / (n_in + n_out)),
                    size=(n_in, n_out)
                ),
                dtype=theano.config.floatX
            )
            if activation == theano.tensor.nnet.sigmoid:
                W_values *= 4
                
            W_branches = theano.shared(value=W_values, name='W_branches', borrow=True)
                
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b_1 = theano.shared(value=b_values, name='b_1', borrow=True)

        self.W_branches = W_branches
        self.b_1 = b_1

        sub_branch_type = ""
        z_i = T.concatenate(self.W_branches[sub_branch_type] + self.W_branches[sub_branch_dist])
        
        # self.output = 

        self.params = [self.W_branches, self.b_1]
Example #23
 def setup(self, prev_layer):
     self.input_layer = prev_layer
     self.input = prev_layer.output
     self.W = theano.shared(np.random.random((self.input_layer.output_shape, self.output_shape)).astype(theano.config.floatX)*.01)
     self.b = theano.shared(np.zeros(self.output_shape,dtype=theano.config.floatX))
     self.params = (self.W, self.b)
     self.output = self.activation(T.dot(self.input, self.W) + self.b.dimshuffle('x', 0))
Example #24
def adadelta(lr, tparams, grads, inp, cost, extra_ups=[], extra_outs=[],
             exclude_params=set([])):
    '''Adadelta'''
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.), name='%s_grad'%k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rup2'%k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.), name='%s_rgrad2'%k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
        for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(
        inp, [cost]+extra_outs, updates=zgup+rg2up+extra_ups, profile=profile)

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
        for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tools.itemlist(tparams), updir)
        if p.name not in exclude_params]

    if not isinstance(lr, list): lr = [lr]
    f_update = theano.function(lr, [], updates=ru2up+param_up,
        on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #25
  def __init__(self, kernel, max_iter = 10, max_diff = None):
    """

    :param kernel: a function with a signature (expected, observed) -> a similarity measure
    that accepts symbolic theano expressions and returns them accordingly.
    See `crayimage.hotornot.em.kernels` for examples.
    :param max_iter: maximal number of iterations
    :param max_diff: stop iterations if maximal difference in weights from the previous iteration is smaller than `max_diff`.
    If None the check is not performed.
    """
    self.original_shape = None

    self.kernel = kernel
    self.max_iter = max_iter
    self.max_diff = max_diff

    self.X = theano.shared(
      np.zeros(shape=(0, 0), dtype='float32')
    )

    self.weights = theano.shared(
      np.ones(shape=(0, ), dtype='float32')
    )

    canonical = T.sum(self.weights[:, None] * self.X, axis=0) / T.sum(self.weights)

    weights_updates = self.kernel(canonical, self.X)
    weights_diff = T.max(abs(weights_updates - self.weights))

    upd = {
      self.weights : weights_updates
    }

    self.iteration = theano.function([], weights_diff if max_diff is not None else [], updates=upd)
    self.get_canonical = theano.function([], canonical)
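As a concrete illustration of the documented kernel signature, a hypothetical Gaussian similarity between each observation and the expected (canonical) row might look like this:

# Hypothetical kernel: symbolic (expected, observed) -> per-row similarity in (0, 1].
def gaussian_kernel(expected, observed, bandwidth=1.0):
    sq_dist = T.sum((observed - expected.dimshuffle('x', 0)) ** 2, axis=1)
    return T.exp(-sq_dist / (2.0 * bandwidth ** 2))
# passed to the constructor above as kernel=gaussian_kernel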
Example #26
    def __init__(self, network, **kwargs):
        # due to the way that theano handles updates, we cannot update a
        # parameter twice during the same function call. so, instead of handling
        # everything in the updates for self.f_learn(...), we split the
        # parameter updates into two function calls. the first "prepares" the
        # parameters for the gradient computation by moving the entire model one
        # step according to the current velocity. then the second computes the
        # gradient at that new model position and performs the usual velocity
        # and parameter updates.

        self.params = network.params(**kwargs)
        self.momentum = kwargs.get('momentum', 0.5)

        # set up space for temporary variables used during learning.
        self._steps = []
        self._velocities = []
        for param in self.params:
            v = param.get_value()
            n = param.name
            self._steps.append(theano.shared(np.zeros_like(v), name=n + '_step'))
            self._velocities.append(theano.shared(np.zeros_like(v), name=n + '_vel'))

        # step 1. move to the position in parameter space where we want to
        # compute our gradient.
        prepare = []
        for param, step, velocity in zip(self.params, self._steps, self._velocities):
            prepare.append((step, self.momentum * velocity))
            prepare.append((param, param + step))

        logging.info('compiling NAG adjustment function')
        self.f_prepare = theano.function([], [], updates=prepare)

        super(NAG, self).__init__(network, **kwargs)
Example #27
    def _init_params(self):
        self.W_hhs = []
        self.W_shortp = []
        for dx in xrange(self.n_layers):
            W_hh = self.init_fn[dx](self.n_hids[(dx-1)%self.n_layers],
                                        self.n_hids[dx],
                                        self.sparsity[dx],
                                        self.scale[dx],
                                        rng=self.rng)
            self.W_hhs.append(theano.shared(value=W_hh, name="W%d_%s" %
                                       (dx,self.name)))

            if dx > 0:
                W_shp = self.init_fn[dx](self.n_hids[self.n_layers-1],
                                         self.n_hids[dx],
                                         self.sparsity[dx],
                                         self.scale[dx],
                                         rng=self.rng)
                self.W_shortp.append(theano.shared(value=W_shp,
                                               name='W_s%d_%s'%(dx,self.name)))
        self.params = [x for x in self.W_hhs] +\
                [x for x in self.W_shortp]

        self.params_grad_scale = [self.grad_scale for x in self.params]
        self.restricted_params = [x for x in self.params]

        if self.weight_noise:
            self.nW_hhs = [theano.shared(x.get_value()*0, name='noise_'+x.name) for x in self.W_hhs]
            self.nW_shortp = [theano.shared(x.get_value()*0, name='noise_'+x.name) for x in self.W_shortp]

            self.noise_params = [x for x in self.nW_hhs] + [x for x in self.nW_shortp]
            self.noise_params_shape_fn = [constant_shape(x.get_value().shape) for x in self.noise_params]
Example #28
File: base.py Project: glouppe/carl
def check_parameter(name, value):
    parameters = set()
    constants = set()
    observeds = set()

    if isinstance(value, SharedVariable):
        parameters.add(value)
    elif isinstance(value, T.TensorConstant):
        constants.add(value)
    elif isinstance(value, T.TensorVariable):
        inputs = graph.inputs([value])

        for var in inputs:
            if isinstance(var, SharedVariable):
                parameters.add(var)
            elif isinstance(var, T.TensorConstant):
                constants.add(var)
            elif isinstance(var, T.TensorVariable):
                if not var.name:
                    raise ValueError("Observed variables must be named.")
                observeds.add(var)
    else:
        # XXX allow for lists and convert them to ndarray

        if isinstance(value, np.ndarray):
            value = theano.shared(value, name=name)
        else:
            value = theano.shared(float(value), name=name)

        parameters.add(value)

    return value, parameters, constants, observeds
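A brief hypothetical usage note: a plain number is wrapped in a named shared variable and reported as a parameter, while symbolic values are classified by walking their graph:

# Hypothetical usage of check_parameter.
mu, params, consts, obs = check_parameter("mu", 0.5)
# mu is now theano.shared(0.5, name="mu"); params == {mu}; consts and obs are empty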
Example #29
    def optimizer(loss, param):
        updates = OrderedDict()
        if not isinstance(param, (list, tuple)):
            param = [param]

        for param_ in param:
            i = theano.shared(np.array(0, dtype=theano.config.floatX))
            i_int = i.astype('int64')
            value = param_.get_value(borrow=True)
            accu = theano.shared(
                np.zeros(value.shape + (n_win,), dtype=value.dtype))
            grad = tt.grad(loss, param_)

            # Append squared gradient vector to accu_new
            accu_new = tt.set_subtensor(accu[:, i_int], grad ** 2)
            i_new = tt.switch((i + 1) < n_win, i + 1, 0)

            updates[accu] = accu_new
            updates[i] = i_new

            accu_sum = accu_new.sum(axis=1)
            updates[param_] = param_ - (learning_rate * grad /
                                        tt.sqrt(accu_sum + epsilon))

        return updates
Example #30
 def generate_beta_arr(self, step1_beta):
     """
     Generate the noise covariances, beta_t, for the forward trajectory.
     """
     # lower bound on beta
     min_beta_val = 1e-6
     min_beta_values = np.ones((self.trajectory_length,))*min_beta_val
     min_beta_values[0] += step1_beta
     min_beta = theano.shared(value=min_beta_values.astype(theano.config.floatX),
         name='min beta')
     # (potentially learned) function for how beta changes with timestep
     # TODO add beta_perturb_coefficients to the parameters to be learned
     beta_perturb_coefficients_values = np.zeros((self.n_temporal_basis,))
     beta_perturb_coefficients = theano.shared(
         value=beta_perturb_coefficients_values.astype(theano.config.floatX),
         name='beta perturb coefficients')
     beta_perturb = T.dot(self.temporal_basis.T, beta_perturb_coefficients)
     # baseline behavior of beta with time -- destroy a constant fraction
     # of the original data variance each time step
     # NOTE 2 below means a fraction ~1/T of the variance will be left at the end of the
     # trajectory
     beta_baseline = 1./np.linspace(self.trajectory_length, 2., self.trajectory_length)
     beta_baseline_offset = util.logit_np(beta_baseline).astype(theano.config.floatX)
     # and the actual beta_t, restricted to be between min_beta and 1-[small value]
     beta_arr = T.nnet.sigmoid(beta_perturb + beta_baseline_offset)
     beta_arr = min_beta + beta_arr * (1 - min_beta - 1e-5)
     beta_arr = beta_arr.reshape((self.trajectory_length, 1))
     return beta_arr
Example #31
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        diag_c=0.,
        lrate=0.01,
        n_words=100000,
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq updates
        dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz',
        valid_dataset='../data/dev/newstest2011.en.tok',
        dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz.pkl',
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
          x, x_mask, \
          opt_ret, \
          cost = \
          build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    print 'Building f_grad...',
    f_grad = theano.function(inps, grads, profile=profile)
    print 'Done'

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                #import ipdb; ipdb.set_trace()

                if best_p != None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = gen_sample(tparams,
                                               f_next,
                                               model_options,
                                               trng=trng,
                                               maxlen=30,
                                               argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= numpy.array(
                        history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print 'Valid ', valid_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
Example #32
    def create_recursive_unit(self):
        self.W_i = theano.shared(
            self.init_matrix([self.hidden_dim, self.emb_dim]))
        self.U_i = theano.shared(
            self.init_matrix([self.hidden_dim, self.hidden_dim]))
        self.b_i = theano.shared(self.init_vector([self.hidden_dim]))
        self.W_f = theano.shared(
            self.init_matrix([self.hidden_dim, self.emb_dim]))
        self.U_f = theano.shared(
            self.init_matrix([self.hidden_dim, self.hidden_dim]))
        self.b_f = theano.shared(self.init_vector([self.hidden_dim]))
        self.W_o = theano.shared(
            self.init_matrix([self.hidden_dim, self.emb_dim]))
        self.U_o = theano.shared(
            self.init_matrix([self.hidden_dim, self.hidden_dim]))
        self.b_o = theano.shared(self.init_vector([self.hidden_dim]))
        self.W_u = theano.shared(
            self.init_matrix([self.hidden_dim, self.emb_dim]))
        self.U_u = theano.shared(
            self.init_matrix([self.hidden_dim, self.hidden_dim]))
        self.b_u = theano.shared(self.init_vector([self.hidden_dim]))
        self.params.extend([
            self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f,
            self.W_o, self.U_o, self.b_o, self.W_u, self.U_u, self.b_u
        ])

        def unit(parent_x, child_h, child_c, child_exists):
            h_tilde = T.sum(child_h, axis=0)
            i = T.nnet.sigmoid(
                T.dot(self.W_i, parent_x) + T.dot(self.U_i, h_tilde) +
                self.b_i)
            o = T.nnet.sigmoid(
                T.dot(self.W_o, parent_x) + T.dot(self.U_o, h_tilde) +
                self.b_o)
            u = T.tanh(
                T.dot(self.W_u, parent_x) + T.dot(self.U_u, h_tilde) +
                self.b_u)
            f = (T.nnet.sigmoid(
                T.dot(self.W_f, parent_x).dimshuffle('x', 0) +
                T.dot(child_h, self.U_f.T) + self.b_f.dimshuffle('x', 0)) *
                 child_exists.dimshuffle(0, 'x'))

            c = i * u + T.sum(f * child_c, axis=0)
            h = o * T.tanh(c)
            return h, c

        return unit
Example #33
 def __init__(self,
              num_emb,
              tag_size,
              emb_dim,
              hidden_dim,
              output_dim,
              degree=2,
              learning_rate=0.01,
              momentum=0.9,
              trainable_embeddings=True,
              labels_on_nonroot_nodes=False,
              irregular_tree=False,
              pairwise=True):
     assert emb_dim > 1 and hidden_dim > 1
     self.num_emb = num_emb
     self.tag_size = tag_size
     self.emb_dim = emb_dim
     self.hidden_dim = hidden_dim
     self.output_dim = output_dim
     self.degree = degree
     self.learning_rate = learning_rate
     self.L2_ratio = L2_RATIO
     self.Pairwise = pairwise
     self.params = []
     np.random.seed(SEED)
     #self.embeddings = theano.shared(self.init_matrix([self.num_emb, self.emb_dim]))
     self.embeddings = theano.shared(
         self.init_matrix([self.num_emb, self.emb_dim]))
     self.params.append(self.embeddings)
     self.recursive_unit = self.create_recursive_unit()
     self.leaf_unit = self.create_leaf_unit()
     #self.output_fn = self.create_output_fn()
     self.score_fn = self.create_score_fn()
     self.x1 = T.ivector(name='x1')  # word indices
     self.x2 = T.ivector(name='x2')  # word indices
     self.tag_1 = T.ivector(name='tag1')  # word indices
     self.tag_2 = T.ivector(name='tag2')  # word indices
     self.x1_2 = T.ivector(name='x1_2')  # word indices
     self.x2_1 = T.ivector(name='x2_1')  # word indices
     self.num_words = self.x1.shape[0]
     self.emb_x1 = self.embeddings[self.x1]
     self.emb_x1 = self.emb_x1 * T.neq(self.x1, -1).dimshuffle(
         0, 'x')  # zero-out non-existent embeddings
     self.emb_x2 = self.embeddings[self.x2]
     self.emb_x2 = self.emb_x2 * T.neq(self.x2, -1).dimshuffle(
         0, 'x')  # zero-out non-existent embeddings
     self.tree_1 = T.imatrix(name='tree1')  # shape [None, self.degree]
     self.tree_2 = T.imatrix(name='tree2')  # shape [None, self.degree]
     self.tree_3 = T.imatrix(name='tree3')  # shape [None, self.degree]
     self.tree_4 = T.imatrix(name='tree4')  # shape [None, self.degree]
     self.tree_states_1, self.score1 = self.compute_tree(
         self.emb_x1, self.tree_1[:, :-1])
     self.tree_states_2, self.score2 = self.compute_tree(
         self.emb_x2, self.tree_2[:, :-1])
     #self._compute_emb = theano.function([self.x1,self.tree_1],self.tree_states_1)
     if self.Pairwise:
         self.forget_unit = self.create_forget_gate_fun()
         self._train_pairwise, self._predict_pair = self.create_pairwise_rank(
         )
     else:
         self._predict, self._train_pointwise = self.create_pointwise_rank()
Example #34
import theano
import theano.tensor as T

# symbolic matrix and vector, and their product
A = T.matrix('A')
v = T.vector('v')
w = A.dot(v)

matrix_times_vector = theano.function(inputs = [A, v], outputs = [w])

# let's import numpy so we can create real arrays
import numpy as np
A_val = np.array([[1, 2], [3, 4]])
v_val = np.array([5, 6])

w_val = matrix_times_vector(A_val, v_val)
print(w_val)

# let's create a shared variable so we can do gradient descent
# this adds another layer of complexity to the theano function
# the first argument is its initial value, the second is its name
x = theano.shared(20.0, 'x')

# a cost function that has a minimum value
cost = x*x + x + 1

# in theano, you don't have to compute gradients yourself!
x_update = x - 0.3 * T.grad(cost, x)

# x is not an "input", it's a thing you update
# in later examples, data and labels would go into the inputs
# and model params would go in the updates
# updates takes in a list of tuples, each tuple has 2 things in it:
# 1) the shared variable to update, 2) the update expression
train = theano.function(inputs = [], outputs = cost, updates = [(x, x_update)])

# write your own loop to call the training function.
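One possible loop calling the training function above (the iteration count is arbitrary):

for i in range(25):
    cost_val = train()      # one gradient-descent step on x
    print(cost_val)
# x.get_value() converges to the minimum of x*x + x + 1, i.e. x = -0.5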
Example #35
    def __init__(
        self,
        X_data: np.ndarray,
        Y_data: np.ndarray,
        data_type: str = 'float32',
        n_iter = 200000,
        learning_rate = 0.001,
        total_grad_norm_constraint = 200,
        verbose = True,
        var_names=None, var_names_read=None,
        obs_names=None, fact_names=None, sample_id=None,
        n_factors = 7,
        cutoff_poisson = 1000,
        h_alpha = 1
    ):
        
        ############# Initialise parameters ################
        super().__init__(X_data, 0,
                         data_type, n_iter, 
                         learning_rate, total_grad_norm_constraint,
                         verbose, var_names, var_names_read,
                         obs_names, fact_names, sample_id)
        self.Y_data = Y_data
        self.y_data = theano.shared(Y_data.astype(self.data_type))
        self.n_rois = Y_data.shape[0]
        self.l_r = np.array([np.sum(X_data[i,:]) for i in range(self.n_rois)]).reshape(self.n_rois,1)/self.n_genes
        self.n_factors = n_factors
        self.n_npro = Y_data.shape[1]
        self.cutoff_poisson = cutoff_poisson
        self.poisson_residual = self.X_data < self.cutoff_poisson
        self.gamma_residual = self.X_data > self.cutoff_poisson
        self.X_data1 = self.X_data[self.poisson_residual]
        self.X_data2 = self.X_data[self.gamma_residual]
        self.genes = var_names
        self.sample_names = obs_names
        self.h_alpha = h_alpha
        
        ############# Define the model ################
        self.model = pm.Model()
        with self.model:
            
            ### Negative Probe Counts ###
            
            # Prior for distribution of negative probe count levels:
            self.b_n_hyper = pm.Gamma('b_n_hyper', alpha = np.array((3,1)), beta = np.array((1,1)), shape = 2)
            self.b_n = pm.Gamma('b_n', mu = self.b_n_hyper[0], sigma = self.b_n_hyper[1], shape = (1,self.n_npro))
            self.y_rn = self.b_n*self.l_r
            
            ### Gene Counts ###
            
            # Background for gene probes, drawn from the same distribution as negative probes:
            self.b_g = pm.Gamma('b_g', mu = self.b_n_hyper[0], sigma = self.b_n_hyper[1], shape = (1,self.n_genes))

            # Gene expression modeled as combination of non-negative factors:
            self.h_hyp = pm.Gamma('h_hyp', 1, 1, shape = 1)
            self.h = pm.Gamma('h', alpha = 1, beta = self.h_hyp, shape=(self.n_genes, self.n_factors))
            self.w_hyp = pm.Gamma('w_hyp', np.array((1,1)), np.array((1,1)), shape=(self.n_factors,2))
            self.w = pm.Gamma('w', mu=self.w_hyp[:,0], sigma=self.w_hyp[:,1], shape=(self.n_rois, self.n_factors))
            self.a_gr =  pm.Deterministic('a_gr', pm.math.dot(self.w, self.h.T))
            
            # Expected gene counts are sum of gene expression and background counts, scaled by library size:
            self.x_rg = (self.a_gr + self.b_g)*self.l_r
            
            self.data_target = pm.DensityDist('data_target', self.get_logDensity, observed=tt.concatenate([self.y_data, self.x_data], axis = 1))
Example #36
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))
Example #37
    def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2)):
        """
        Allocate a LeNetConvPoolLayer with shared variable internal parameters.

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.dtensor4
        :param input: symbolic image tensor, of shape image_shape

        :type filter_shape: tuple or list of length 4
        :param filter_shape: (number of filters, num input feature maps,
                              filter height, filter width)

        :type image_shape: tuple or list of length 4
        :param image_shape: (batch size, num input feature maps,
                             image height, image width)

        :type poolsize: tuple or list of length 2
        :param poolsize: the downsampling (pooling) factor (#rows, #cols)
        """

        assert image_shape[1] == filter_shape[1]
        self.input = input

        # there are "num input feature maps * filter height * filter width"
        # inputs to each hidden unit
        fan_in = numpy.prod(filter_shape[1:])
        # each unit in the lower layer receives a gradient from:
        # "num output feature maps * filter height * filter width" /
        #   pooling size
        fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) /
                   numpy.prod(poolsize))
        # initialize weights with random weights
        W_bound = numpy.sqrt(6. / (fan_in + fan_out))
        self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound,
                                                         high=W_bound,
                                                         size=filter_shape),
                                             dtype=theano.config.floatX),
                               borrow=True)

        # the bias is a 1D tensor -- one bias per output feature map
        b_values = numpy.zeros((filter_shape[0], ), dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, borrow=True)

        # convolve input feature maps with filters
        conv_out = conv.conv2d(input=input,
                               filters=self.W,
                               filter_shape=filter_shape,
                               image_shape=image_shape)

        # downsample each feature map individually, using maxpooling
        pooled_out = downsample.max_pool_2d(input=conv_out,
                                            ds=poolsize,
                                            ignore_border=True)

        # add the bias term. Since the bias is a vector (1D array), we first
        # reshape it to a tensor of shape (1, n_filters, 1, 1). Each bias will
        # thus be broadcasted across mini-batches and feature map
        # width & height
        self.output = T.tanh(pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))

        # store parameters of this layer
        self.params = [self.W, self.b]

        # keep track of model input
        self.input = input
Example #38
def init_tparams(params):
    tparams = OrderedDict()
    for k, v in params.iteritems():
        tparams[k] = theano.shared(v, name=k)
    return tparams
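A small hypothetical usage sketch, assuming OrderedDict and numpy are imported as in the surrounding examples:

# Hypothetical usage: wrap NumPy parameter arrays as named shared variables.
params = OrderedDict([('W', numpy.zeros((3, 3), dtype='float32')),
                      ('b', numpy.zeros((3,), dtype='float32'))])
tparams = init_tparams(params)   # tparams['W'].name == 'W'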
Example #39
    def __init__(self, We_initial, params):
        initial_We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
        We = theano.shared(np.asarray(We_initial, dtype=config.floatX))
        if params.npc > 0:
            pc = theano.shared(np.asarray(params.pc, dtype=config.floatX))

        g1batchindices = T.imatrix()
        g1mask = T.matrix()
        scores = T.matrix()

        l_in = lasagne.layers.InputLayer((None, None))
        l_mask = lasagne.layers.InputLayer(shape=(None, None))
        l_emb = lasagne.layers.EmbeddingLayer(
            l_in,
            input_size=We.get_value().shape[0],
            output_size=We.get_value().shape[1],
            W=We)
        l_average = lasagne_average_layer([l_emb, l_mask])
        l_out = lasagne.layers.DenseLayer(l_average,
                                          params.layersize,
                                          nonlinearity=params.nonlinearity)
        embg = lasagne.layers.get_output(l_out, {
            l_in: g1batchindices,
            l_mask: g1mask
        })
        if params.npc <= 0:
            print "#pc <=0, do not remove pc"
        elif params.npc == 1:
            print "#pc == 1"
            proj = embg.dot(pc.transpose())
            embg = embg - theano.tensor.outer(proj, pc)
        else:
            print "#pc > 1"
            proj = embg.dot(pc.transpose())
            embg = embg - theano.tensor.dot(proj, pc)

        l_in2 = lasagne.layers.InputLayer((None, params.layersize))
        l_sigmoid = lasagne.layers.DenseLayer(
            l_in2, params.memsize, nonlinearity=lasagne.nonlinearities.sigmoid)

        l_softmax = lasagne.layers.DenseLayer(l_sigmoid,
                                              2,
                                              nonlinearity=T.nnet.softmax)
        X = lasagne.layers.get_output(l_softmax, {l_in2: embg})
        cost = T.nnet.categorical_crossentropy(X, scores)
        prediction = T.argmax(X, axis=1)

        self.network_params = lasagne.layers.get_all_params(
            l_out, trainable=True) + lasagne.layers.get_all_params(
                l_softmax, trainable=True)
        self.network_params.pop(
            0)  # do not include the word embedding as network parameters
        self.all_params = lasagne.layers.get_all_params(
            l_out, trainable=True) + lasagne.layers.get_all_params(
                l_softmax, trainable=True)

        reg = self.getRegTerm(params, We, initial_We)
        self.trainable = self.getTrainableParams(params)
        cost = T.mean(cost) + reg

        self.feedforward_function = theano.function([g1batchindices, g1mask],
                                                    embg)
        self.scoring_function = theano.function([g1batchindices, g1mask],
                                                prediction)
        self.cost_function = theano.function([scores, g1batchindices, g1mask],
                                             cost)

        grads = theano.gradient.grad(cost, self.trainable)
        if params.clip:
            grads = [
                lasagne.updates.norm_constraint(grad, params.clip,
                                                range(grad.ndim))
                for grad in grads
            ]
        updates = params.learner(grads, self.trainable, params.eta)
        self.train_function = theano.function([scores, g1batchindices, g1mask],
                                              cost,
                                              updates=updates)
Example #40
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams
Example #41
    def __init__(self,
                 Nlayers = 1,               # number of layers
                 Ndirs = 1,                 # unidirectional or bidirectional
                 Nx = 100,                  # input size
                 Nh = 100,                  # hidden layer size
                 Ny = 100,                  # output size
                 Ah = "relu",               # hidden unit activation (e.g. relu, tanh, lstm)
                 Ay = "linear",             # output unit activation (e.g. linear, sigmoid, softmax)
                 predictPer = "frame",      # frame or sequence
                 loss = None,               # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
                 L1reg = 0.0,               # L1 regularization
                 L2reg = 0.0,               # L2 regularization
                 momentum = 0.0,            # SGD momentum
                 seed = 15213,              # random seed for initializing the weights
                 frontEnd = None,           # a lambda function for transforming the input
                 filename = None,           # initialize from file
                 initParams = None,         # initialize from given dict
                ):

        if filename is not None:            # load parameters from file
            with smart_open(filename, "rb") as f:
                initParams = dill.load(f)
        if initParams is not None:          # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.iteritems():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
            # Note: locals()[k] = v does not work here, so assign the attributes statically below
            Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \
                = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd
        else:                           # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = ["Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"]
            for name in self.paramNames:
                value = locals()[name]
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wrec", rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam("Wup", rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay))
            if Ah != "lstm":
                self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs)))
            else:
                self.addParam("Bhid", numpy.tile(numpy.hstack([full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3))]), (1, Ndirs)))
            self.addParam("Bout", zeros(Ny))
            self.addParam("h0", zeros((Nlayers, Ndirs, Nh)))
            if Ah == "lstm":
                self.addParam("c0", zeros((Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params]

        # Build computation graph
        input = T.ftensor3()
        mask = T.imatrix()
        mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
        mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
                      T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]
        # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
            f_t = T.nnet.sigmoid(a[:, :Nh])
            i_t = T.nnet.sigmoid(a[:, Nh : Nh * 2])
            o_t = T.nnet.sigmoid(a[:, Nh * 2 : Nh * 3])
            c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)
        for i in range(Nlayers):
            h = (x.dimshuffle((1, 0, 2)).dot(self.Win) if i == 0 else h.dot(self.Wup[i-1])) + self.Bhid[i]
            rep = lambda x: T.extra_ops.repeat(x.reshape((1, -1)), h.shape[1], axis = 0)
            if Ah != "lstm":
                h = T.concatenate([theano.scan(
                        fn = step_rnn,
                        sequences = [h[:, :, Nh * d : Nh * (d+1)], mask_float[d]],
                        outputs_info = [rep(self.h0[i, d])],
                        non_sequences = [self.Wrec[i, d], rep(self.h0[i, d])],
                        go_backwards = (d == 1),
                    )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2)
            else:
                h = T.concatenate([theano.scan(
                        fn = step_lstm,
                        sequences = [h[:, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]],
                        outputs_info = [rep(self.c0[i, d]), rep(self.h0[i, d])],
                        non_sequences = [self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d])],
                        go_backwards = (d == 1),
                    )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 2)
        h = h.dimshuffle((1, 0, 2))
        if predictPer == "sequence":
            h = T.concatenate([h[mask_int[1 - d]][:, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 1)
        output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)

        # Compute loss function
        if loss is None:
            loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay]
        if loss == "ctc":
            label = T.imatrix()
            cost = ctc_cost(output, mask, label)
        else:
            if predictPer == "sequence":
                label = T.fmatrix()
                y = output
                t = label
            elif predictPer == "frame":
                label = T.ftensor3()
                indices = (mask >= 0).nonzero()
                y = output[indices]
                t = label[indices]
            cost = T.mean({
                "ce":               -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1),
                "ce_group":         -T.log((y * t).sum(axis = 1)),
                "mse":              T.mean((y - t) ** 2, axis = 1),
                "hinge":            T.mean(relu(1 - y * (t * 2 - 1)), axis = 1),
                "squared_hinge":    T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1),
            }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg

        # Compute updates for network parameters
        updates = []
        lrate = T.fscalar()
        clip = T.fscalar()
        grad = T.grad(cost, self.params)
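        # clip each gradient element-wise to the range [-clip, clip] before applying updates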
        grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
        if momentum > 0:
            for w, d, g in zip(self.params, self.dparams, grad_clipped):
                updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
        else:
            for w, g in zip(self.params, grad_clipped):
                updates.append((w, w - lrate * g))

        # Create functions to be called from outside
        self.train = theano.function(
                         inputs = [input, mask, label, lrate, clip],
                         outputs = cost,
                         updates = updates,
                     )

        self.predict = theano.function(inputs = [input, mask], outputs = output)
Example #42
def adadelta(tparams,
             grads,
             x,
             mask,
             iVector,
             jVector,
             cost,
             options,
             d=None,
             y=None):
    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    running_up2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
        for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    if options['demoSize'] > 0 and options['numYcodes'] > 0:
        f_grad_shared = theano.function([x, d, y, mask, iVector, jVector],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')
    elif options['demoSize'] == 0 and options['numYcodes'] > 0:
        f_grad_shared = theano.function([x, y, mask, iVector, jVector],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')
    elif options['demoSize'] > 0 and options['numYcodes'] == 0:
        f_grad_shared = theano.function([x, d, mask, iVector, jVector],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, mask, iVector, jVector],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')

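    # AdaDelta step: update = -RMS(past updates) / RMS(past gradients) * gradient,
    # with RMS(v) = sqrt(running_average(v**2) + 1e-6); the running averages use decay 0.95.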
    updir = [
        -T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #43
	def __init__(self, Mi, Mo, activation=T.nnet.relu): 
		self.Mi = Mi 
		self.Mo = Mo 
		self.f  = activation

		# Input gate weights
		Wxi = init_weight(Mi, Mo)
		Whi = init_weight(Mo, Mo)
		Wci = init_weight(Mo, Mo)
		bi 	= np.zeros(Mo)

		# Forget gate weights
		Wxf = init_weight(Mi, Mo)
		Whf = init_weight(Mo, Mo)
		Wcf = init_weight(Mo, Mo)
		bf 	= np.zeros(Mo)

		# Cell gate
		Wxc = init_weight(Mi, Mo)
		Whc = init_weight(Mo, Mo)
		bc 	= np.zeros(Mo)

		# Output gate
		Wxo = init_weight(Mi, Mo)
		Who = init_weight(Mo, Mo)
		Wco = init_weight(Mo, Mo)
		bo 	= np.zeros(Mo)


		c0 = np.zeros(Mo)
		h0 = np.zeros(Mo)

		#theano variables
		self.Wxi 	= theano.shared(Wxi)
		self.Whi 	= theano.shared(Whi)
		self.Wci 	= theano.shared(Wci)
		self.bi 	= theano.shared(bi)

		self.Wxf 	= theano.shared(Wxf)
		self.Whf 	= theano.shared(Whf)
		self.Wcf 	= theano.shared(Wcf)
		self.bf 	= theano.shared(bf)

		self.Wxc = theano.shared(Wxc)
		self.Whc = theano.shared(Whc)
		self.bc  = theano.shared(bc)

		self.Wxo 	= theano.shared(Wxo)
		self.Who 	= theano.shared(Who)
		self.Wco 	= theano.shared(Wco)
		self.bo 	= theano.shared(bo)

		self.h0 	= theano.shared(h0)
		self.c0 	= theano.shared(c0)
		

		self.params = [self.Wxi, self.Whi, self.Wci,  self.bi, self.Wxf, self.Whf, self.Wcf,  self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who, self.Wco,  self.bo , self.h0, self.c0 ]
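		# Sketch (an assumption, not shown in this snippet) of the per-step recurrence
		# these peephole-LSTM parameters imply:
		#   i_t = sigmoid(x_t.dot(Wxi) + h_tm1.dot(Whi) + c_tm1.dot(Wci) + bi)
		#   f_t = sigmoid(x_t.dot(Wxf) + h_tm1.dot(Whf) + c_tm1.dot(Wcf) + bf)
		#   c_t = f_t * c_tm1 + i_t * tanh(x_t.dot(Wxc) + h_tm1.dot(Whc) + bc)
		#   o_t = sigmoid(x_t.dot(Wxo) + h_tm1.dot(Who) + c_t.dot(Wco) + bo)
		#   h_t = o_t * tanh(c_t)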
def main():
    # step 1: get the data and define all the usual variables
    Xtrain, Xtest, Ytrain, Ytest = get_normalized_data()

    max_iter = 20
    print_period = 10

    lr = 0.0004
    reg = 0.01

    Xtrain = Xtrain.astype(np.float32)
    Ytrain = Ytrain.astype(np.float32)
    Xtest = Xtest.astype(np.float32)
    Ytest = Ytest.astype(np.float32)
    Ytrain_ind = y2indicator(Ytrain).astype(np.float32)
    Ytest_ind = y2indicator(Ytest).astype(np.float32)

    N, D = Xtrain.shape
    batch_sz = 500
    n_batches = N // batch_sz

    M = 300
    K = 10
    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # step 2: define theano variables and expressions
    thX = T.matrix('X')
    thT = T.matrix('T')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')

    # we can use the built-in theano functions to do relu and softmax
    thZ = relu(
        thX.dot(W1) +
        b1)  # relu is new in version 0.7.1 but just in case you don't have it
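    # (sketch: if your Theano lacks relu, an elementwise fallback such as
    #  relu = lambda a: T.switch(a > 0, a, 0) behaves the same for this use)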
    thY = T.nnet.softmax(thZ.dot(W2) + b2)

    # define the cost function and prediction
    cost = -(thT * T.log(thY)).sum() + reg * ((W1 * W1).sum() +
                                              (b1 * b1).sum() +
                                              (W2 * W2).sum() +
                                              (b2 * b2).sum())
    prediction = T.argmax(thY, axis=1)

    # step 3: training expressions and functions
    # we can just include regularization as part of the cost because it is also automatically differentiated!
    update_W1 = W1 - lr * T.grad(cost, W1)
    update_b1 = b1 - lr * T.grad(cost, b1)
    update_W2 = W2 - lr * T.grad(cost, W2)
    update_b2 = b2 - lr * T.grad(cost, b2)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1), (W2, update_W2),
                 (b2, update_b2)],
    )

    # create another function for this because we want it over the whole dataset
    get_prediction = theano.function(
        inputs=[thX, thT],
        outputs=[cost, prediction],
    )

    costs = []
    for i in range(max_iter):
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz), ]
            Ybatch = Ytrain_ind[j * batch_sz:(j * batch_sz + batch_sz), ]

            train(Xbatch, Ybatch)
            if j % print_period == 0:
                cost_val, prediction_val = get_prediction(Xtest, Ytest_ind)
                err = error_rate(prediction_val, Ytest)
                print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" %
                      (i, j, cost_val, err))
                costs.append(cost_val)

    plt.plot(costs)
    plt.show()
Example #45
    def __init__(self, n_in, n_out):
        """
            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when concatenating networks together:
            the first network becomes a part of the second. You can still access the first
            network by itself, but an update on the second network will affect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        batch_size = 32
        state_length = n_in
        action_length = n_out
        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, state_length)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, state_length)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.dmatrix("Action")
        Action.tag.test_value = np.random.rand(batch_size, action_length)
        # create a small convolutional neural network
        inputLayerActA = lasagne.layers.InputLayer((None, state_length), State)
        l_hid2ActA = lasagne.layers.DenseLayer(
            inputLayerActA,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActA = lasagne.layers.DenseLayer(
            l_hid2ActA,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActA = lasagne.layers.DenseLayer(
            l_hid3ActA,
            num_units=n_out,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerA = lasagne.layers.InputLayer((None, state_length), State)

        concatLayer = lasagne.layers.ConcatLayer(
            [inputLayerA, self._l_outActA])
        l_hid2A = lasagne.layers.DenseLayer(
            concatLayer,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3A = lasagne.layers.DenseLayer(
            l_hid2A,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outA = lasagne.layers.DenseLayer(
            l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))

        # self.updateTargetModel()
        inputLayerActB = lasagne.layers.InputLayer((None, state_length), State)
        l_hid2ActB = lasagne.layers.DenseLayer(
            inputLayerActB,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActB = lasagne.layers.DenseLayer(
            l_hid2ActB,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActB = lasagne.layers.DenseLayer(
            l_hid3ActB,
            num_units=n_out,
            nonlinearity=lasagne.nonlinearities.linear)

        inputLayerB = lasagne.layers.InputLayer((None, state_length), State)
        concatLayerB = lasagne.layers.ConcatLayer(
            [inputLayerB, self._l_outActB])
        l_hid2B = lasagne.layers.DenseLayer(
            concatLayerB,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3B = lasagne.layers.DenseLayer(
            l_hid2B,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outB = lasagne.layers.DenseLayer(
            l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear)

        # print "Initial W " + str(self._w_o.get_value())

        self._learning_rate = 0.001
        self._discount_factor = 0.8
        self._rho = 0.95
        self._rms_epsilon = 0.001

        self._weight_update_steps = 5
        self._updates = 0

        self._states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(
            np.zeros((batch_size, n_out), dtype=theano.config.floatX), )

        self._q_valsActA = lasagne.layers.get_output(self._l_outActA, State)
        self._q_valsActB = lasagne.layers.get_output(self._l_outActB,
                                                     ResultState)
        self._q_valsActB2 = lasagne.layers.get_output(self._l_outActB, State)
        inputs_ = {
            State: self._states_shared,
            Action: self._q_valsActA,
        }
        self._q_valsA = lasagne.layers.get_output(self._l_outA, inputs_)
        inputs_ = {
            ResultState: self._next_states_shared,
            Action: self._q_valsActB,
        }
        self._q_valsB = lasagne.layers.get_output(self._l_outB, inputs_)

        self._q_func = self._q_valsA
        self._q_funcAct = self._q_valsActA
        self._q_funcB = self._q_valsB
        self._q_funcActB = self._q_valsActB

        # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True)

        self._target = (Reward + self._discount_factor * self._q_valsB)
        self._diff = self._target - self._q_valsA
        self._loss = 0.5 * self._diff**2 + (
            1e-4 * lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2))
        self._loss = T.mean(self._loss)
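        # i.e. mean squared TD error: L = E[0.5 * (r + gamma * Q_B(s', a') - Q_A(s, a))**2] + L2 penalty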

        self._params = lasagne.layers.helper.get_all_params(self._l_outA)[-6:]
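        # [-6:] keeps only the last six parameters, i.e. (W, b) of the three critic
        # DenseLayers (l_hid2A, l_hid3A, self._l_outA); the actor layers reachable
        # through the concat layer are excluded so critic updates leave the actor alone.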
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._l_outActA)
        self._givens_ = {
            State: self._states_shared,
            # ResultState: self._next_states_shared,
            Reward: self._rewards_shared,
            # Action: self._actions_shared,
        }
        self._actGivens = {
            State: self._states_shared,
            # ResultState: self._next_states_shared,
            # Reward: self._rewards_shared,
            # Action: self._actions_shared,
        }

        # SGD update
        #updates_ = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        # minimize Value function error
        self._updates_ = lasagne.updates.rmsprop(
            T.mean(self._q_func) +
            (1e-4 * lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2)), self._params,
            self._learning_rate * -T.mean(self._diff), self._rho,
            self._rms_epsilon)

        # actDiff1 = (Action - self._q_valsActB) #TODO is this correct?
        # actDiff = (actDiff1 - (Action - self._q_valsActA))
        # actDiff = ((Action - self._q_valsActB2)) # Target network does not work well here?
        #self._actDiff = ((Action - self._q_valsActA)) # Target network does not work well here?
        #self._actLoss = 0.5 * self._actDiff ** 2 + (1e-4 * lasagne.regularization.regularize_network_params( self._l_outActA, lasagne.regularization.l2))
        #self._actLoss = T.mean(self._actLoss)

        # actionUpdates = lasagne.updates.rmsprop(actLoss +
        #    (1e-4 * lasagne.regularization.regularize_network_params(
        #        self._l_outActA, lasagne.regularization.l2)), actionParams,
        #            self._learning_rate * 0.01 * (-actLoss), self._rho, self._rms_epsilon)

        # Maximize wrt q function

        # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO
        actionUpdates = lasagne.updates.rmsprop(
            T.mean(self._q_func) +
            (1e-4 * lasagne.regularization.regularize_network_params(
                self._l_outActA, lasagne.regularization.l2)),
            self._actionParams, self._learning_rate * 0.1, self._rho,
            self._rms_epsilon)

        self._train = theano.function([], [self._loss, self._q_func],
                                      updates=self._updates_,
                                      givens=self._givens_)
        # self._trainActor = theano.function([], [actLoss, self._q_valsActA], updates=actionUpdates, givens=actGivens)
        self._trainActor = theano.function([], [self._q_func],
                                           updates=actionUpdates,
                                           givens=self._actGivens)
        self._q_val = theano.function([],
                                      self._q_valsA,
                                      givens={State: self._states_shared})
        self._q_action = theano.function([],
                                         self._q_valsActA,
                                         givens={State: self._states_shared})
        inputs_ = [
            State,
            Reward,
            # ResultState
        ]
        self._bellman_error = theano.function(inputs=inputs_,
                                              outputs=self._diff,
                                              allow_input_downcast=True)
Example #46
 def addParam(self, name, value):
     value = theano.shared(value)
     setattr(self, name, value)
     self.params.append(value)
     self.paramNames.append(name)
Example #47
def sharedsfGpu(x, name=None):
    return T.cast(theano.shared(x, name=name), dtype=floatX)
Example #48
    def ini_net(self, outputShape, testData, modelSaver):
        self.net = NeuralNet(
            layers=[
                ('input', layers.InputLayer),
                ('conv1', layers.Conv2DLayer),
                ('pool1', layers.MaxPool2DLayer),
                ('dropout1', layers.DropoutLayer),
                ('conv2', layers.Conv2DLayer),
                ('pool2', layers.MaxPool2DLayer),
                ('dropout2', layers.DropoutLayer),
                ('conv3', layers.Conv2DLayer),
                ('pool3', layers.MaxPool2DLayer),
                ('dropout3', layers.DropoutLayer),
                ('hidden4', layers.DenseLayer),
                ('dropout4', layers.DropoutLayer),
                ('hidden5', layers.DenseLayer),
                ('output', layers.DenseLayer),
            ],
            input_shape=(None, Settings.NN_CHANNELS,
                         Settings.NN_INPUT_SHAPE[0], Settings.NN_INPUT_SHAPE[1]
                         ),  # variable batch size; channels, rows, cols come from Settings
            conv1_num_filters=32,
            conv1_filter_size=(3, 3),
            pool1_pool_size=(2, 2),
            dropout1_p=0.1,
            conv2_num_filters=64,
            conv2_filter_size=(2, 2),
            pool2_pool_size=(2, 2),
            dropout2_p=0.2,
            conv3_num_filters=128,
            conv3_filter_size=(2, 2),
            pool3_pool_size=(2, 2),
            dropout3_p=0.3,
            hidden4_num_units=500,
            dropout4_p=0.5,
            hidden5_num_units=500,
            output_num_units=outputShape,
            output_nonlinearity=lasagne.nonlinearities.softmax,

            # optimization method:
            update=nesterov_momentum,
            update_learning_rate=theano.shared(
                utils.to_float32(Settings.NN_START_LEARNING_RATE)),
            update_momentum=theano.shared(
                utils.to_float32(Settings.NN_START_MOMENTUM)),
            batch_iterator_train=AugmentingLazyBatchIterator(
                Settings.NN_BATCH_SIZE,
                testData,
                "train",
                False,
                newSegmentation=False,
                loadingSize=(120, 120)),
            batch_iterator_test=LazyBatchIterator(
                Settings.NN_BATCH_SIZE,
                testData,
                "valid",
                False,
                newSegmentation=False,
                loadingInputShape=Settings.NN_INPUT_SHAPE),
            train_split=TrainSplit(
                eval_size=0.0),  # we cross validate on our own
            regression=False,  # classification problem
            on_epoch_finished=[
                AdjustVariable('update_learning_rate',
                               start=Settings.NN_START_LEARNING_RATE,
                               stop=0.0001),
                AdjustVariable('update_momentum',
                               start=Settings.NN_START_MOMENTUM,
                               stop=0.999),
                TrainingHistory("?", str(self), [1], modelSaver),
                EarlyStopping(150),
                modelSaver,
            ],
            max_epochs=Settings.NN_EPOCHS,
            verbose=1,
        )
Example #49
import theano_multi

import numpy as np
import theano
import theano.tensor as T

import time

BATCH_SIZE = 512
DIM = 4096

x = T.matrix('x')
y = T.matrix('y')

W = theano.shared(
    np.random.normal(scale=0.01, size=(DIM, DIM)).astype(theano.config.floatX))

y_hat = x
for i in xrange(50):
    y_hat = T.dot(y_hat, W)

cost = T.mean((y_hat - y)**2)

params = [W]

grads = T.grad(cost, params)

# grads = theano_multi.multi(grads, params=params, other_contexts=['dev2', 'dev1'])

updates = [(p, p - 0.1 * g) for p, g in zip(params, grads)]
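
# A minimal sketch of compiling and stepping the graph above; the random
# input data below is an assumption for illustration.
train_fn = theano.function([x, y], cost, updates=updates)

x_val = np.random.normal(size=(BATCH_SIZE, DIM)).astype(theano.config.floatX)
y_val = np.random.normal(size=(BATCH_SIZE, DIM)).astype(theano.config.floatX)

t0 = time.time()
print train_fn(x_val, y_val)             # one SGD step; returns the current MSE
print 'step took %.3f s' % (time.time() - t0)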
Example #50
def sharedf(x, target=None, name=None, borrow=False):
    if target is None:
        return theano.shared(np.asarray(x, dtype=floatX), name=name, borrow=borrow)
    else:
        return theano.shared(np.asarray(x, dtype=floatX), target=target, name=name, borrow=borrow)
Example #51
import numpy

from theano import function, shared
from theano import tensor as TT
import theano

sharedX = (lambda X, name: shared(numpy.asarray(X, dtype=theano.config.floatX),
                                  name=name))


def kinetic_energy(vel):
    """Returns the kinetic energy associated with the given velocity
    and mass of 1.

    Parameters
    ----------
    vel: theano matrix
        Symbolic matrix whose rows are velocity vectors.

    Returns
    -------
    return: theano vector
        Vector whose i-th entry is the kinetic energy associated with vel[i].

    """
    return 0.5 * (vel**2).sum(axis=1)


def hamiltonian(pos, vel, energy_fn):
    """
    Returns the Hamiltonian (sum of potential and kinetic energy) for the given
Example #52
def sharedScalar(x, name=None):
    return theano.shared(x, name=name)
Example #53
def main(reps,
         pretrained_w_path,
         do_module1,
         init_seed=0,
         load_t=0,
         num_epochs=200,
         batchsize=96,
         fine_tune=0,
         patience=500,
         lr_init=1e-3,
         optim='adagrad',
         toy=0,
         num_classes=23):
    res_root = '/home/hoa/Desktop/projects/resources'
    X_path = osp.join(res_root, 'datasets/msrcv2/Xaug_b01c.npy')
    Y_path = osp.join(res_root, 'datasets/msrcv2/Y.npy')
    MEAN_IMG_PATH = osp.join(res_root, 'models/ilsvrc_2012_mean.npy')
    snapshot = 50  # save model after every `snapshot` epochs

    drop_p = 0.5  # drop out prob.
    lambda2 = 0.0005 / 2  # l2-regularizer constant
    # step=patience/4 # decay learning after every `step` epochs
    lr_patience = 60  # for learning rate schedule, if optim=='momentum'
    if toy:  # unit testing
        num_epochs = 10
        data_multi = 3
        reps = 2
        #drop_p=0
        #lambda2=0

    # Create name tag for the experiment
    if fine_tune:
        full_or_tune = 'tune'  # description tag for storing associated files
    else:
        full_or_tune = 'full'
    time_stamp = time.strftime("%y%m%d%H%M%S", time.localtime())
    snapshot_root = '../snapshot_models/'
    snapshot_name = str(num_classes) + 'alex' + time_stamp + full_or_tune

    # LOADING DATA
    print 'LOADING DATA ...'
    X = np.load(X_path)
    Y = np.load(Y_path)
    if X.shape[1] != 3:
        X = b01c_to_bc01(X)
    N = len(Y)

    print 'Raw X,Y shape', X.shape, Y.shape
    if len(X) != len(Y):
        print 'Inconsistent number of input images and labels. X is possibly augmented.'

    MEAN_IMG = np.load(MEAN_IMG_PATH)
    MEAN_IMG_227 = skimage.transform.resize(np.swapaxes(
        np.swapaxes(MEAN_IMG, 0, 1), 1, 2), (227, 227),
                                            mode='nearest',
                                            preserve_range=True)
    MEAN_IMG = np.swapaxes(np.swapaxes(MEAN_IMG_227, 1, 2), 0, 1).reshape(
        (1, 3, 227, 227))

    all_metrics = []  # store metrics in each run
    time_profiles = {
        'train_module1': [],
        'train_module1_eff': [],
        'train_module2': [],
        'test': []
    }  # record training and testing time

    # PREPARE THEANO EXPRESSION FOR BOTH MODULES
    print 'COMPILING THEANO EXPRESSION ...'
    input_var = T.tensor4('inputs')
    target_var = T.imatrix('targets')
    network = build_model(num_classes=num_classes, input_var=input_var)

    # Create a loss expression for training
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    weights = lasagne.layers.get_all_params(network, regularizable=True)
    l2reg = theano.shared(floatX(lambda2)) * T.sum(
        [T.sum(w**2) for w in weights])
    loss = loss.mean() + l2reg

    lr = theano.shared(np.array(lr_init, dtype=theano.config.floatX))
    lr_decay = np.array(1. / 3, dtype=theano.config.floatX)

    # Create update expressions for training
    params = lasagne.layers.get_all_params(network, trainable=True)
    # last-layer case is actually very simple:
    # `params` above is a list of all (W,b)-pairs
    # Therefore last layer's (W,b) is params[-2:]
    if fine_tune == 7:  # tuning params from fc7 to fc8
        params = params[-2:]
    # elif fine_tune == 6: # tuning params from fc6 to fc8
    #     params = params[-4:]
    # TODO adjust for per-layer training with local_lr

    if optim == 'momentum':
        updates = lasagne.updates.nesterov_momentum(loss,
                                                    params,
                                                    learning_rate=lr,
                                                    momentum=0.9)
    elif optim == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss,
                                          params,
                                          learning_rate=lr,
                                          rho=0.9,
                                          epsilon=1e-06)
    elif optim == 'adam':
        updates = lasagne.updates.adam(loss,
                                       params,
                                       learning_rate=lr,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)
    elif optim == 'adagrad':
        updates = lasagne.updates.adagrad(loss,
                                          params,
                                          learning_rate=lr,
                                          epsilon=1e-06)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.binary_crossentropy(test_prediction,
                                                       target_var)
    test_loss = test_loss.mean() + l2reg
    # zero-one loss with threshold t = 0.5 for reference
    # zero_one_loss = T.abs_((test_prediction > theano.shared(floatX(0.5))) - target_var).sum(axis=1)
    #zero_one_loss /= target_var.shape[1].astype(theano.config.floatX)
    #zero_one_loss = zero_one_loss.mean()

    # Compile a function performing a backward pass (training step)  on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    bwd_fn = theano.function(
        [input_var, target_var],
        loss,
        updates=updates,
    )
    # Compile a second function performing a forward pass,
    # returns validation loss, 0/1 Error, score i.e. Xout:
    fwd_fn = theano.function([input_var, target_var], test_loss)

    # Create a theano function for computing score
    score = lasagne.layers.get_output(network, deterministic=True)
    score_fn = theano.function([input_var], score)

    def compute_score(X, Y, batchsize=batchsize, shuffle=False):
        out = np.zeros(Y.shape)
        batch_id = 0
        for batch in iterate_minibatches(X, Y, batchsize, shuffle=False):
            inputs, _ = batch
            # Flip random half of the batch
            flip_idx = np.random.choice(len(inputs),
                                        size=len(inputs) / 2,
                                        replace=False)
            if len(flip_idx) > 1:
                inputs[flip_idx] = inputs[flip_idx, :, :, ::-1]
            # Subtract mean image
            inputs = (inputs - MEAN_IMG).astype(theano.config.floatX)
            # MEAN_IMG is broadcast NumPy-style; take note if you want a Theano expression instead
            if len(inputs) == batchsize:
                out[batch_id * batchsize:(batch_id + 1) *
                    batchsize] = score_fn(inputs)
                batch_id += 1
            else:
                out[batch_id * batchsize:] = score_fn(inputs)

        return out

    try:
        #  MAIN LOOP FOR EACH RUN
        for seed in np.arange(reps) + init_seed:
            # reset learning rate
            lr.set_value(lr_init)

            print '\nRUN', seed, '...'
            # Split train/val/test set
            indicies = np.arange(len(Y))
            Y_train_val, Y_test, idx_train_val, idx_test = train_test_split(
                Y, indicies, random_state=seed, train_size=float(2) / 3)
            Y_train, Y_val, idx_train, idx_val = train_test_split(
                Y_train_val, idx_train_val, random_state=seed)

            print "Train/val/test set size:", len(idx_train), len(
                idx_val), len(idx_test)

            idx_aug_train = data_aug(idx_train, mode='aug', isMat='idx', N=N)
            Xaug_train = X[idx_aug_train]
            Yaug_train = data_aug(Y_train, mode='aug', isMat='Y', N=N)

            idx_aug_val = data_aug(idx_val, mode='aug', isMat='idx', N=N)
            Xaug_val = X[idx_aug_val]
            Yaug_val = data_aug(Y_val, mode='aug', isMat='Y', N=N)

            # Module 2 training set is composed of module 1 training and validation set
            idx_aug_train_val = data_aug(idx_train_val,
                                         mode='aug',
                                         isMat='idx',
                                         N=N)
            Xaug_train_val = X[idx_aug_train_val]
            Yaug_train_val = data_aug(Y_train_val, mode='aug', isMat='Y', N=N)

            # Test set
            X_test = X[idx_test]
            # Y_test is already returned in the first train_test_split

            print "Augmented train/val/test set size:", len(Xaug_train), len(
                Yaug_val), len(X_test)
            print "Augmented (X,Y) dtype:", Xaug_train.dtype, Yaug_val.dtype
            print "Processed Mean image:", MEAN_IMG.dtype, MEAN_IMG.shape

            if toy:  # try to overfit a tiny subset of the data
                Xaug_train = Xaug_train[:batchsize * data_multi +
                                        batchsize / 2]
                Yaug_train = Yaug_train[:batchsize * data_multi +
                                        batchsize / 2]
                Xaug_val = Xaug_val[:batchsize + batchsize / 2]
                Yaug_val = Yaug_val[:batchsize + batchsize / 2]

            # Init by pre-trained weights, if any
            if len(pretrained_w_path) > 0:
                layer_list = lasagne.layers.get_all_layers(
                    network)  # 22 layers
                if pretrained_w_path.endswith('pkl'):
                    # load reference_net
                    # use case: weights initialized from pre-trained reference nets
                    f = open(pretrained_w_path, 'r')
                    w_list = pickle.load(f)  # list of 11 (W,b)-pairs
                    f.close()

                    lasagne.layers.set_all_param_values(
                        layer_list[-3], w_list[:-2])
                    # exclude (W, b) of fc8
                    # NOTE: it is pure coincidence that layer_list and w_list use the same
                    # index here. The last elements of layer_list are
                    # [..., fc6, drop6, fc7, drop7, fc8], while w_list ends with
                    # [..., W, b, W, b, W, b]; e.g. w_list[-4] and w_list[-3] are the params
                    # associated with fc7, i.e. the params that connect drop6 to fc7.

                elif pretrained_w_path.endswith('npz'):
                    # load self-trained net
                    # use case: continue training from a snapshot model
                    with np.load(
                            pretrained_w_path
                    ) as f:  # NOTE: only load snapshot of the same `seed`
                        w_list = [
                            f['arr_%d' % i]
                            for i in range(len(f.files))
                        ]  # load from bkviz, one-time use
                    lasagne.layers.set_all_param_values(network, w_list)

                elif pretrained_w_path.endswith(
                        '/'):  # init from 1 of the 30 snapshots
                    from os import listdir
                    import re
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                    for file_name in files:
                        regex_seed = 'full%d_' % seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(
                                    osp.join(pretrained_w_path, snapshot_name)
                                    + '.npz') as f:
                                w_list = [
                                    f['arr_%d' % i]
                                    for i in range(len(f.files))
                                ]
                            lasagne.layers.set_all_param_values(
                                network, w_list)

            # START MODULE 1
            module1_time = 0
            if do_module1:
                print 'MODULE 1'
                training_history = {}
                training_history['iter_training_loss'] = []
                training_history['iter_validation_loss'] = []
                training_history['training_loss'] = []
                training_history['validation_loss'] = []
                training_history['learning_rate'] = []

                # http://deeplearning.net/tutorial/gettingstarted.html#early-stopping
                # early-stopping parameters
                n_train_batches = Xaug_train.shape[0] / batchsize
                if Xaug_train.shape[0] % batchsize != 0:
                    n_train_batches += 1
                # `patience`: look at this many examples regardless
                patience_increase = 2  # wait this much longer when a new best is found
                lr_patience_increase = 1.01
                improvement_threshold = 0.995  # a relative improvement of this much is
                # considered significant (a proper significance test might be better)
                validation_frequency = min(n_train_batches, patience / 2)
                # go through this many minibatches before checking the network
                # on the validation set; in this case we check every epoch
                best_params = None
                epoch_validation_loss = 0  # indicates that valid_loss has not been computed yet
                best_validation_loss = np.inf
                best_iter = -1
                lr_iter = -1
                test_score = 0.
                start_time = time.time()
                done_looping = False
                epoch = 0

                # Finally, launch the training loop.
                print("Starting training...")
                # We iterate over epochs:
                print(
                    "\nEpoch\tTrain Loss\tValid Loss\tBest-ValLoss-and-Iter\tTime\tL.Rate"
                )
                sys.setrecursionlimit(10000)

                try:  # Early-stopping implementation
                    while (not done_looping) and (epoch < num_epochs):
                        # In each epoch, we do a full pass over the training data:
                        train_err = 0
                        train_batches = 0
                        start_time = time.time()
                        for batch in iterate_minibatches(Xaug_train,
                                                         Yaug_train,
                                                         batchsize,
                                                         shuffle=True):
                            inputs, targets = batch
                            # Horizontal flip half of the images
                            bs = inputs.shape[0]
                            indices = np.random.choice(bs,
                                                       bs / 2,
                                                       replace=False)
                            inputs[indices] = inputs[indices, :, :, ::-1]

                            # Subtract mean image
                            inputs = (inputs - MEAN_IMG).astype(
                                theano.config.floatX)
                            # MEAN_IMG is broadcast NumPy-style; take note if you want a Theano expression instead

                            train_err_batch = bwd_fn(inputs, targets)
                            train_err += train_err_batch
                            train_batches += 1

                            iter_now = epoch * n_train_batches + train_batches
                            training_history['iter_training_loss'].append(
                                train_err_batch)
                            training_history['iter_validation_loss'].append(
                                epoch_validation_loss)

                            if (iter_now + 1) % validation_frequency == 0:
                                # a full pass over the validation data:
                                val_err = 0
                                #zero_one_err = 0
                                val_batches = 0
                                for batch in iterate_minibatches(
                                        Xaug_val,
                                        Yaug_val,
                                        batchsize,
                                        shuffle=False):
                                    inputs, targets = batch
                                    # Subtract mean image
                                    inputs = (inputs - MEAN_IMG).astype(
                                        theano.config.floatX)
                                    # MEAN_IMG is broadcast NumPy-style; take note if you want a Theano expression instead

                                    val_err_batch = fwd_fn(inputs, targets)
                                    val_err += val_err_batch
                                    val_batches += 1
                                epoch_validation_loss = val_err / val_batches
                                if epoch_validation_loss < best_validation_loss:
                                    if epoch_validation_loss < best_validation_loss * improvement_threshold:
                                        patience = max(
                                            patience,
                                            iter_now * patience_increase)
                                        # lr_patience *= lr_patience_increase

                                    best_params = lasagne.layers.get_all_param_values(
                                        network)
                                    best_validation_loss = epoch_validation_loss
                                    best_iter = iter_now
                                    lr_iter = best_iter

                                else:  # decay learning rate if optim=='momentum'
                                    if optim == 'momentum' and (
                                            iter_now - lr_iter) > lr_patience:
                                        lr.set_value(lr.get_value() * lr_decay)
                                        lr_iter = iter_now

                            if patience <= iter_now:
                                done_looping = True
                                break

                        # Record training history
                        training_history['training_loss'].append(train_err /
                                                                 train_batches)
                        training_history['validation_loss'].append(
                            epoch_validation_loss)
                        training_history['learning_rate'].append(
                            lr.get_value())

                        epoch_time = time.time() - start_time
                        module1_time += epoch_time
                        # Then we print the results for this epoch:
                        print("{}\t{:.6f}\t{:.6f}\t{:.6f}\t{}\t{:.3f}\t{}".
                              format(epoch + 1,
                                     training_history['training_loss'][-1],
                                     training_history['validation_loss'][-1],
                                     best_validation_loss, best_iter + 1,
                                     epoch_time,
                                     training_history['learning_rate'][-1]))

                        if (
                                epoch + 1
                        ) % snapshot == 0:  # TODO try to save weights at best_iter
                            snapshot_path_string = snapshot_root + snapshot_name + str(
                                seed) + '_' + str(iter_now + 1)
                            try:  # use case: terminate experiment before reaching `reps`
                                np.savez(snapshot_path_string + '.npz',
                                         *best_params)
                                np.savez(snapshot_path_string + '_history.npz',
                                         training_history)
                                plot_loss(training_history,
                                          snapshot_path_string + '_loss.png')
                                # plot_conv_weights(lasagne.layers.get_all_layers(network)[1],
                                #     snapshot_path_string+'_conv1weights_')
                            except (KeyboardInterrupt, TypeError):
                                print 'Did not save', snapshot_name + str(
                                    seed) + '_' + str(iter_now + 1)
                                pass

                        epoch += 1

                except (KeyboardInterrupt, MemoryError):  # stop training on interrupt or out-of-memory
                    pass
                print 'Training finished or KeyboardInterrupt (Training is never finished, only abandoned)'

                module1_time_eff = module1_time / iter_now * best_iter
                print('Total and Effective training time are {:.0f} and {:.0f}'
                      .format(module1_time, module1_time_eff))
                time_profiles['train_module1'].append(module1_time)
                time_profiles['train_module1_eff'].append(module1_time_eff)

                # Save model after num_epochs or KeyboardInterrupt
                if (epoch + 1) % snapshot != 0:  # to avoid duplicate save
                    snapshot_path_string = snapshot_root + snapshot_name + str(
                        seed) + '_' + str(iter_now + 1)
                    if not toy:
                        try:  # use case: terminate experiment before reaching `reps`
                            print 'Saving model...'
                            np.savez(snapshot_path_string + '.npz',
                                     *best_params)
                            np.savez(snapshot_path_string + '_history.npz',
                                     training_history)
                            plot_loss(training_history,
                                      snapshot_path_string + '_loss.png')
                            # plot_conv_weights(lasagne.layers.get_all_layers(network)[1],
                            #     snapshot_path_string+'_conv1weights_')
                        except (KeyboardInterrupt, TypeError):
                            print 'Did not save', snapshot_name + str(
                                seed) + '_' + str(iter_now + 1)
                            pass
                # And load them again later on like this:
                #with np.load('../snapshot_models/23alex16042023213910.npz') as f:
                #    param_values = [f['arr_%d' % i] for i in range(len(f.files))] # or
                #    training_history = f['arr_0'].items()
                # lasagne.layers.set_all_param_values(network, param_values)

            # END OF MODULE 1

            # START MODULE 2
            print '\nMODULE 2'
            if not do_module1:
                if pretrained_w_path.endswith('pkl'):
                    snapshot_name = str(num_classes) + 'alexOTS'  # short for "off-the-shelf init"

                elif pretrained_w_path.endswith('npz'):  # Resume from a SINGLE snapshot
                    # extract name pattern, e.g. '23alex16042023213910full10'
                    # from string '../snapshot_models/23alex16042023213910full10_100.npz'
                    import re
                    regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+"
                    match = re.search(regex, pretrained_w_path)
                    snapshot_name = match.group(0)

                elif pretrained_w_path.endswith('/'):  # RESUMED FROM TRAINED MODULE 1 (ONE-TIME USE)
                    from os import listdir
                    import re
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                    for file_name in files:
                        regex_seed = 'full%d_' % seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            print snapshot_name
                            with np.load(
                                    osp.join(pretrained_w_path, snapshot_name)
                                    + '.npz') as f:
                                w_list = [
                                    f['arr_%d' % i]
                                    for i in range(len(f.files))
                                ]
                            lasagne.layers.set_all_param_values(
                                network, w_list)

            else:  # MAIN BRANCH - assume do_module1 is True AND have run `snapshot` epochs
                if (epoch + 1) > snapshot:
                    with np.load(snapshot_path_string + '.npz') as f:  # reload the best params for module 1
                        w_list = [f['arr_%d' % i] for i in range(len(f.files))]
                    lasagne.layers.set_all_param_values(network, w_list)

            score_train = compute_score(Xaug_train_val, Yaug_train_val)
            start_time = time.time()

            if load_t:  # Server failed at the wrong time. We only have t backed-up
                if pretrained_w_path.endswith('/'):
                    from os import listdir
                    import re
                    files = [
                        f for f in listdir(pretrained_w_path)
                        if osp.isfile(osp.join(pretrained_w_path, f))
                    ]
                    for file_name in files:
                        regex_seed = 'full%d_' % seed
                        match_seed = re.search(regex_seed, file_name)
                        if match_seed:
                            regex = r"\d+[a-zA-Z]+\d+[a-zA-Z]+\d+\_\d+"
                            match = re.search(regex, file_name)
                            snapshot_name = match.group(0)
                            t_train = np.load(
                                osp.join('t', '{0}.npy'.format(snapshot_name)))

            else:  # MAIN BRANCH
                thresholds = Threshold(score_train, Yaug_train_val)
                thresholds.find_t_for()  # determine t_train for each score_train; this can take a while
                t_train = np.asarray(thresholds.t)  # `thresholds` holds the t_train vector in its .t attribute
                print 't_train is in ', t_train.min(), '..', t_train.max()
                print('t_train produced in {:.3f}s'.format(time.time() - start_time))
                np.save('t/' + snapshot_name + str(seed) + '.npy', t_train)

            # Predictive model for t
            regr = linear_model.RidgeCV(cv=5)
            # RidgeCV = linear regression with L2 regularization; alpha chosen by cross-validation
            regr.fit(score_train, t_train)

            time_profiles['train_module2'].append(time.time() - start_time)
            # END OF MODULE 2

            # TESTING PHASE
            start_time = time.time()
            score_test = compute_score(X_test, Y_test)
            t_test = regr.predict(score_test)
            print 'original t_test is in ', min(t_test), '..', max(t_test)
            # crude correction to keep t_test within [0, 1]: replace out-of-range
            # predictions with the most extreme in-range value
            t_test[t_test > 1] = max(t_test[t_test < 1])
            t_test[t_test < 0] = min(t_test[t_test > 0])
            print 'corrected t_test is in ', min(t_test), '..', max(t_test)

            # Predict label
            metrics = predict_label(score_test,
                                    Y_test,
                                    t_test,
                                    seed,
                                    num_classes,
                                    verbose=1)
            time_profiles['test'].append(time.time() - start_time)

            all_metrics.append(metrics)
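# --- Hedged sketch, not from the original script: the idea behind Module 2 ---
# For each training sample, pick the decision threshold t_i that best separates
# its positive and negative class scores, then fit a ridge regressor so that a
# threshold can be predicted from the score vector at test time. This mirrors
# the Threshold(...).find_t_for() + RidgeCV steps above; `per_sample_threshold`
# and `fit_threshold_regressor` are hypothetical stand-ins, not the project's
# actual implementation.
import numpy as np
from sklearn import linear_model

def per_sample_threshold(scores, labels, grid=np.linspace(0.0, 1.0, 101)):
    """Return the threshold in `grid` maximizing the sample-wise F1 score."""
    best_t, best_f1 = 0.5, -1.0
    for t in grid:
        pred = (scores >= t).astype(int)
        tp = np.sum((labels == 1) & (pred == 1))
        fp = np.sum((labels == 0) & (pred == 1))
        fn = np.sum((labels == 1) & (pred == 0))
        f1 = 2.0 * tp / (2.0 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0.0
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t

def fit_threshold_regressor(score_train, Y_train):
    """score_train: (n_samples, n_classes) scores; Y_train: binary label matrix."""
    t_train = np.array([per_sample_threshold(s, y)
                        for s, y in zip(score_train, Y_train)])
    regr = linear_model.RidgeCV(cv=5).fit(score_train, t_train)
    return regr, t_train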

l4a = layers.DenseLayer(j3, n_outputs=4096, weights_std=0.001, init_bias_value=0.01, dropout=0.5, nonlinearity=layers.identity)
l4b = layers.FeatureMaxPoolingLayer(l4a, pool_size=2, feature_dim=1, implementation='reshape')
l4c = layers.DenseLayer(l4b, n_outputs=4096, weights_std=0.001, init_bias_value=0.01, dropout=0.5, nonlinearity=layers.identity)
l4 = layers.FeatureMaxPoolingLayer(l4c, pool_size=2, feature_dim=1, implementation='reshape')

# l5 = layers.DenseLayer(l4, n_outputs=37, weights_std=0.01, init_bias_value=0.0, dropout=0.5, nonlinearity=custom.clip_01) #  nonlinearity=layers.identity)
l5 = layers.DenseLayer(l4, n_outputs=37, weights_std=0.01, init_bias_value=0.1, dropout=0.5, nonlinearity=layers.identity)

# l6 = layers.OutputLayer(l5, error_measure='mse')
l6 = custom.OptimisedDivGalaxyOutputLayer(l5) # this incorporates the constraints on the output (probabilities sum to one, weighting, etc.)



xs_shared = [theano.shared(np.zeros((1,1,1,1), dtype=theano.config.floatX)) for _ in xrange(num_input_representations)]

idx = T.lscalar('idx')

givens = {
    l0.input_var: xs_shared[0][idx*BATCH_SIZE:(idx+1)*BATCH_SIZE],
    l0_45.input_var: xs_shared[1][idx*BATCH_SIZE:(idx+1)*BATCH_SIZE],
}

compute_output = theano.function([idx], l6.predictions(dropout_active=False), givens=givens)


print "Load model parameters"
layers.set_param_values(l6, analysis['param_values'])

print "Create generators"
Example #55
def init_params_c2w2s(n_chars=N_CHAR):
    '''
    Initialize all params for hierarchical GRU
    '''
    params = OrderedDict()

    np.random.seed(0)

    prefix = 'c2w_'

    # lookup table
    params[prefix+'Wc'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(n_chars,CHAR_DIM)).astype('float32'), name=prefix+'Wc')

    # f-GRU
    params[prefix+'W_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_r')
    params[prefix+'W_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_z')
    params[prefix+'W_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_f_h')
    params[prefix+'b_f_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_r')
    params[prefix+'b_f_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_z')
    params[prefix+'b_f_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_f_h')
    params[prefix+'U_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_r')
    params[prefix+'U_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_z')
    params[prefix+'U_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_f_h')

    # b-GRU
    params[prefix+'W_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_r')
    params[prefix+'W_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_z')
    params[prefix+'W_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name=prefix+'W_b_h')
    params[prefix+'b_b_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_r')
    params[prefix+'b_b_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_z')
    params[prefix+'b_b_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name=prefix+'b_b_h')
    params[prefix+'U_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_r')
    params[prefix+'U_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_z')
    params[prefix+'U_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name=prefix+'U_b_h')

    # dense
    params[prefix+'W_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name=prefix+'W_df')
    params[prefix+'W_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name=prefix+'W_db')
    #params[prefix+'b_df'] = theano.shared(np.zeros((WDIM)).astype('float32'), name=prefix+'b_df')
    #params[prefix+'b_db'] = theano.shared(np.zeros((WDIM)).astype('float32'), name=prefix+'b_db')

    prefix = 'w2s_'

    # f-GRU
    params[prefix+'W_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_r')
    params[prefix+'W_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_z')
    params[prefix+'W_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_f_h')
    params[prefix+'b_f_r'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_r')
    params[prefix+'b_f_z'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_z')
    params[prefix+'b_f_h'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_f_h')
    params[prefix+'U_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_r')
    params[prefix+'U_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_z')
    params[prefix+'U_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_f_h')

    # b-GRU
    params[prefix+'W_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_r')
    params[prefix+'W_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_z')
    params[prefix+'W_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(WDIM,W2S_HDIM)).astype('float32'), name=prefix+'W_b_h')
    params[prefix+'b_b_r'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_r')
    params[prefix+'b_b_z'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_z')
    params[prefix+'b_b_h'] = theano.shared(np.zeros((W2S_HDIM)).astype('float32'), name=prefix+'b_b_h')
    params[prefix+'U_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_r')
    params[prefix+'U_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_z')
    params[prefix+'U_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,W2S_HDIM)).astype('float32'), name=prefix+'U_b_h')

    # dense
    params[prefix+'W_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,SDIM)).astype('float32'), name=prefix+'W_df')
    params[prefix+'W_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(W2S_HDIM,SDIM)).astype('float32'), name=prefix+'W_db')
    #params[prefix+'b_df'] = theano.shared(np.zeros((SDIM)).astype('float32'), name=prefix+'b_df')
    #params[prefix+'b_db'] = theano.shared(np.zeros((SDIM)).astype('float32'), name=prefix+'b_db')

    return params
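# Hedged illustration (not part of the original example): how the c2w_ forward
# GRU parameters initialized above are typically combined in one recurrence
# step; x_t is a character embedding, h_tm1 the previous hidden state.
import theano.tensor as T

def gru_step(x_t, h_tm1, params, prefix='c2w_'):
    # reset and update gates
    r = T.nnet.sigmoid(T.dot(x_t, params[prefix + 'W_f_r']) +
                       T.dot(h_tm1, params[prefix + 'U_f_r']) + params[prefix + 'b_f_r'])
    z = T.nnet.sigmoid(T.dot(x_t, params[prefix + 'W_f_z']) +
                       T.dot(h_tm1, params[prefix + 'U_f_z']) + params[prefix + 'b_f_z'])
    # candidate hidden state uses the reset-gated previous state
    h_tilde = T.tanh(T.dot(x_t, params[prefix + 'W_f_h']) +
                     T.dot(r * h_tm1, params[prefix + 'U_f_h']) + params[prefix + 'b_f_h'])
    return (1. - z) * h_tm1 + z * h_tilde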
FREQ_DICT['wrestle'] = 7*112.5
FREQ_DICT['resonate'] = 5*112.5
FREQ_DICT['seated'] = 3*112.5
FREQ_DICT['habitually'] = 1*112.5

ORDERED_FREQ = sorted(list(FREQ_DICT), key=lambda x:FREQ_DICT[x], reverse=True)

def time_freq(freq):
    """Build a (word x rehearsal) matrix of evenly spaced rehearsal times,
    one row per word, spread across SEC_IN_TIME according to its frequency."""
    rehearsals = np.zeros((np.int(np.max(freq) * 113), len(freq)))
    for i in np.arange(len(freq)):
        # freq[i]*112.5 rehearsals, spaced evenly over the simulated time span
        temp = np.arange(np.int(freq[i] * 112.5))
        temp = temp * np.int(SEC_IN_TIME / (freq[i] * 112.5))
        rehearsals[:len(temp), i] = temp
    return rehearsals.T

time = theano.shared(time_freq(FREQ), 'time')

LEMMA_CHUNKS = [(actr.makechunk("", typename="word", form=word))
                for word in ORDERED_FREQ]
lex_decision.set_decmem({x: np.array([]) for x in LEMMA_CHUNKS})

lex_decision.goals = {}
lex_decision.set_goal("g")

lex_decision.productionstring(name="attend word", string="""
    =g>
    isa     goal
    state   'attend'
    =visual_location>
    isa    _visuallocation
    ?visual>
    def __init__(self,
                 input=None,
                 n_visible=784,
                 n_hidden=500,
                 W=None,
                 hbias=None,
                 vbias=None,
                 numpy_rng=None,
                 theano_rng=None):
        """ RBM initialization function

        Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa),
        as well as for performing Contrastive Divergence updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing
        to a shared hidden units bias vector in case RBM is part of a
        different network

        :param vbias: None for standalone RBMs or a symbolic variable
        pointing to a shared visible units bias
        """
        self.n_visible = n_visible
        self.n_hidden = n_hidden

        if numpy_rng is None:
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None:
            initial_W = numpy.asarray(numpy_rng.uniform(
                low=-4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                high=4 * numpy.sqrt(6. / (n_hidden + n_visible)),
                size=(n_visible, n_hidden)),
                                      dtype=theano.config.floatX)
            W = theano.shared(value=initial_W, name='W', borrow=True)

        if hbias is None:
            hbias = theano.shared(value=numpy.zeros(
                n_hidden, dtype=theano.config.floatX),
                                  name='hbias',
                                  borrow=True)

        if vbias is None:
            vbias = theano.shared(value=numpy.zeros(
                n_visible, dtype=theano.config.floatX),
                                  name='vbias',
                                  borrow=True)

        self.input = input
        if not input:
            self.input = T.matrix('input')

        self.W = W
        self.hbias = hbias
        self.vbias = vbias
        self.theano_rng = theano_rng
        self.params = [self.W, self.hbias, self.vbias]
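
    # Hedged sketch (standard RBM helpers the docstring above alludes to, in the
    # usual DeepLearning.net style; not copied from this particular example).
    def propup(self, vis):
        """Infer hidden unit activations given visible units."""
        pre_sigmoid_activation = T.dot(vis, self.W) + self.hbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]

    def sample_h_given_v(self, v0_sample):
        """Sample binary hidden states given visible units."""
        pre_sigmoid_h1, h1_mean = self.propup(v0_sample)
        h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1, p=h1_mean,
                                             dtype=theano.config.floatX)
        return [pre_sigmoid_h1, h1_mean, h1_sample]

    def propdown(self, hid):
        """Infer visible unit activations given hidden units."""
        pre_sigmoid_activation = T.dot(hid, self.W.T) + self.vbias
        return [pre_sigmoid_activation, T.nnet.sigmoid(pre_sigmoid_activation)]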
Example #58
def init_params(n_chars=N_CHAR):
    '''
    Initialize all params
    '''
    params = OrderedDict()

    np.random.seed(0)

    # lookup table
    params['Wc'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(n_chars,CHAR_DIM)).astype('float32'), name='Wc')

    # f-GRU
    params['W_c2w_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_r')
    params['W_c2w_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_z')
    params['W_c2w_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_f_h')
    params['b_c2w_f_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_r')
    params['b_c2w_f_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_z')
    params['b_c2w_f_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_f_h')
    params['U_c2w_f_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_r')
    params['U_c2w_f_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_z')
    params['U_c2w_f_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_f_h')

    # b-GRU
    params['W_c2w_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_r')
    params['W_c2w_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_z')
    params['W_c2w_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(CHAR_DIM,C2W_HDIM)).astype('float32'), name='W_c2w_b_h')
    params['b_c2w_b_r'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_r')
    params['b_c2w_b_z'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_z')
    params['b_c2w_b_h'] = theano.shared(np.zeros((C2W_HDIM)).astype('float32'), name='b_c2w_b_h')
    params['U_c2w_b_r'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_r')
    params['U_c2w_b_z'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_z')
    params['U_c2w_b_h'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,C2W_HDIM)).astype('float32'), name='U_c2w_b_h')

    # dense
    params['W_c2w_df'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name='W_c2w_df')
    params['W_c2w_db'] = theano.shared(np.random.normal(loc=0., scale=SCALE, size=(C2W_HDIM,WDIM)).astype('float32'), name='W_c2w_db')
    params['b_c2w_df'] = theano.shared(np.zeros((WDIM)).astype('float32'), name='b_c2w_df')
    params['b_c2w_db'] = theano.shared(np.zeros((WDIM)).astype('float32'), name='b_c2w_db')

    return params
feats = 7                               # number of input variables

# generate a dataset: D = (input_values, target_values)
D = (rng.randn(N, feats), 1 + rng.randn(N))
training_steps = 100

# Declare Theano symbolic variables
x = T.dmatrix("x")
y = T.dvector("y")

# initialize the weight vector w randomly
#
# this and the following bias variable b
# are shared so they keep their values
# between training iterations (updates)
w = theano.shared(rng.randn(feats), name="w")

# initialize the bias term
b = theano.shared(0., name="b")

print("Initial model:")
print(w.get_value())
print(b.get_value())

# Construct Theano expression graph
y_pred = T.dot(x,w) - b
prediction = y_pred
cost = T.mean(T.sqr(y_pred - y))
gw, gb = T.grad(cost, [w, b])             # Compute the gradient of the cost
                                          # w.r.t weight vector w and
                                          # bias term b
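
# Hedged continuation sketch (the example is truncated here): compile the
# training and prediction functions and run plain gradient descent, in the
# style of the standard Theano tutorial this snippet follows.
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, cost],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)))
predict = theano.function(inputs=[x], outputs=prediction)

for i in range(training_steps):
    pred, err = train(D[0], D[1])

print("Final model:")
print(w.get_value())
print(b.get_value())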
Example #60
    def __init__(self,
                 input,
                 n_in,
                 n_hidden,
                 n_out,
                 activation=T.tanh,
                 output_type='real'):

        self.input = input
        self.activation = activation
        self.output_type = output_type

        self.batch_size = T.iscalar()

        # theta is a vector of all trainable parameters
        # it represents the value of W, W_in, W_out, h0, bh, by
        theta_shape = n_hidden ** 2 + n_in * n_hidden + n_hidden * n_out + \
                      n_hidden + n_hidden + n_out
        self.theta = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # Parameters are reshaped views of theta
        param_idx = 0  # pointer to somewhere along parameter vector

        # recurrent weights as a shared variable
        self.W = self.theta[param_idx:(param_idx + n_hidden**2)].reshape(
            (n_hidden, n_hidden))
        self.W.name = 'W'
        W_init = np.asarray(np.random.uniform(size=(n_hidden, n_hidden),
                                              low=-0.01,
                                              high=0.01),
                            dtype=theano.config.floatX)
        param_idx += n_hidden**2

        # input to hidden layer weights
        self.W_in = self.theta[param_idx:(param_idx + n_in * \
                                          n_hidden)].reshape((n_in, n_hidden))
        self.W_in.name = 'W_in'
        W_in_init = np.asarray(np.random.uniform(size=(n_in, n_hidden),
                                                 low=-0.01,
                                                 high=0.01),
                               dtype=theano.config.floatX)
        param_idx += n_in * n_hidden

        # hidden to output layer weights
        self.W_out = self.theta[param_idx:(param_idx + n_hidden * \
                                           n_out)].reshape((n_hidden, n_out))
        self.W_out.name = 'W_out'

        W_out_init = np.asarray(np.random.uniform(size=(n_hidden, n_out),
                                                  low=-0.01,
                                                  high=0.01),
                                dtype=theano.config.floatX)
        param_idx += n_hidden * n_out

        self.h0 = self.theta[param_idx:(param_idx + n_hidden)]
        self.h0.name = 'h0'
        h0_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.bh = self.theta[param_idx:(param_idx + n_hidden)]
        self.bh.name = 'bh'
        bh_init = np.zeros((n_hidden, ), dtype=theano.config.floatX)
        param_idx += n_hidden

        self.by = self.theta[param_idx:(param_idx + n_out)]
        self.by.name = 'by'
        by_init = np.zeros((n_out, ), dtype=theano.config.floatX)
        param_idx += n_out

        assert (param_idx == theta_shape)

        # for convenience
        self.params = [
            self.W, self.W_in, self.W_out, self.h0, self.bh, self.by
        ]

        # shortcut to norms (for monitoring)
        self.l2_norms = {}
        for param in self.params:
            self.l2_norms[param] = T.sqrt(T.sum(param**2))

        # initialize parameters
        # DEBUG_MODE gives division by zero error when we leave parameters
        # as zeros
        self.theta.set_value(
            np.concatenate([
                x.ravel() for x in (W_init, W_in_init, W_out_init, h0_init,
                                    bh_init, by_init)
            ]))

        self.theta_update = theano.shared(
            value=np.zeros(theta_shape, dtype=theano.config.floatX))

        # recurrent function (using tanh activation function) and arbitrary output
        # activation function
        def step(x_t, h_tm1):
            h_t = self.activation(T.dot(x_t, self.W_in) + \
                                  T.dot(h_tm1, self.W) + self.bh)
            y_t = T.dot(h_t, self.W_out) + self.by
            return h_t, y_t

        # the hidden state `h` for the entire sequence, and the output for the
        # entire sequence `y` (first dimension is always time)
        # Note the implementation of weight-sharing h0 across variable-size
        # batches using T.ones multiplying h0
        # Alternatively, T.alloc approach is more robust
        [self.h,
         self.y_pred], _ = theano.scan(step,
                                       sequences=self.input,
                                       outputs_info=[
                                           T.alloc(self.h0,
                                                   self.input.shape[1],
                                                   n_hidden), None
                                       ])
        # outputs_info=[T.ones(shape=(self.input.shape[1],
        # self.h0.shape[0])) * self.h0, None])

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = 0
        self.L1 += abs(self.W.sum())
        self.L1 += abs(self.W_in.sum())
        self.L1 += abs(self.W_out.sum())

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = 0
        self.L2_sqr += (self.W**2).sum()
        self.L2_sqr += (self.W_in**2).sum()
        self.L2_sqr += (self.W_out**2).sum()

        if self.output_type == 'real':
            self.loss = lambda y: self.mse(y)
        elif self.output_type == 'binary':
            # push through sigmoid
            self.p_y_given_x = T.nnet.sigmoid(self.y_pred[-1])  # apply sigmoid
            self.y_out = T.round(self.p_y_given_x)  # round to {0,1}
            self.loss = lambda y: self.nll_binary(y)
        elif self.output_type == 'softmax':
            # push through softmax, computing vector of class-membership
            # probabilities in symbolic form
            #
            # T.nnet.softmax will not operate on T.tensor3 types, only matrices
            # We take our n_steps x n_seq x n_classes output from the net
            # and reshape it into a (n_steps * n_seq) x n_classes matrix
            # apply softmax, then reshape back
            y_p = self.y_pred
            y_p_m = T.reshape(y_p, (y_p.shape[0] * y_p.shape[1], -1))
            y_p_s = T.nnet.softmax(y_p_m)
            self.p_y_given_x = T.reshape(y_p_s, y_p.shape)

            # compute prediction as class whose probability is maximal
            self.y_out = T.argmax(self.p_y_given_x, axis=-1)
            self.loss = lambda y: self.nll_multiclass(y)

        else:
            raise NotImplementedError
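
    # Hedged sketch (not shown in this excerpt): the loss methods that the
    # lambdas above refer to, written in the form usual for this kind of RNN.
    def mse(self, y):
        """Mean squared error for real-valued outputs."""
        return T.mean((self.y_pred - y) ** 2)

    def nll_binary(self, y):
        """Negative log-likelihood via binary cross-entropy."""
        return T.mean(T.nnet.binary_crossentropy(self.p_y_given_x, y))

    def nll_multiclass(self, y):
        """Negative log-likelihood of the correct class, averaged over all
        time steps and sequences; y is expected to have shape (n_steps, n_seq)."""
        p_y = self.p_y_given_x
        p_y_m = T.reshape(p_y, (p_y.shape[0] * p_y.shape[1], -1))
        y_f = y.flatten(ndim=1)
        return -T.mean(T.log(p_y_m)[T.arange(p_y_m.shape[0]), y_f])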