Example #1
 def __init__(self,dataset,dictionary_size=50000,embedding_size=50,skip_window=5,learning_rate=0.1,negative_sampling=25):
     self.ds = dictionary_size
     self.es = embedding_size
     self.sw = skip_window
     self.lr = learning_rate
     self.ns = negative_sampling
     self._tokenize(dataset)
     
     #nn architecture
     self.input = T.matrix()
     self.w1 = theano.shared((np.random.rand(self.ds,self.es).astype(theano.config.floatX)-0.5),borrow=True)
     self.activeidx = T.ivector()
     self.activew1 = T.take(self.w1, self.activeidx, axis=0)
     self.l1out = T.dot(self.input,self.activew1)
     self.w2 = theano.shared((np.random.rand(self.es,self.ds).astype(theano.config.floatX)-0.5),borrow=True)
     self.sampidx = T.ivector()
     self.sampw2 = T.take(self.w2, self.sampidx, axis=1)
     self.l2out = T.nnet.softmax(T.dot(self.l1out,self.sampw2))
     self.target = T.matrix()
    
     #nn functions
     self.z = (self.l2out - self.target).T
     self.w1update = T.set_subtensor(self.w1[self.activeidx,:], self.w1[self.activeidx,:] - T.dot(self.sampw2, self.z).flatten()*self.lr)
     self.w2update = T.set_subtensor(self.w2[:,self.sampidx], self.w2[:,self.sampidx] - T.outer(self.z, self.l1out).T*self.lr)
     self.propogate = theano.function([self.input,self.target,self.activeidx,self.sampidx],\
         updates = [(self.w1,self.w1update),(self.w2,self.w2update)],allow_input_downcast=True)
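A minimal sketch (separate from the class above; toy_w and idx are made-up names) of what the two T.take calls are doing: with axis=0 they select whole rows of w1 for the active words, and with axis=1 they select whole columns of w2 for the sampled words.

import numpy as np
import theano
import theano.tensor as T

toy_w = theano.shared(np.arange(12, dtype=theano.config.floatX).reshape(4, 3))
idx = T.ivector('idx')
rows = T.take(toy_w, idx, axis=0)   # shape (len(idx), 3), like activew1 above
cols = T.take(toy_w, idx, axis=1)   # shape (4, len(idx)), like sampw2 above

f = theano.function([idx], [rows, cols], allow_input_downcast=True)
r, c = f([0, 2])
assert r.shape == (2, 3) and c.shape == (4, 2)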
Example #2
    def init_function(self):
        self.seq_idx = T.lvector() 
        self.tar_scalar = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)
        self.tar_vector = T.take(self.Va, self.tar_scalar, axis=0)

        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc, dtype=theano.config.floatX)

        def encode(x_t, h_fore, c_fore):
            v = T.concatenate([h_fore, x_t])
            f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
            i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
            o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
            c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next

        scan_result, _ = theano.scan(fn=encode, sequences=[self.seq_matrix], outputs_info=[h, c])
        embedding = scan_result[0][-1]  # use the final hidden state h_n as the sentence embedding

 
        # dropout
        embedding_for_train = embedding * self.srng.binomial(embedding.shape, p = 0.5, n = 1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5
            
        self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param**2) for param in self.params]) - T.sum(self.Vw**2)
        self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.7 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                    dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
                inputs = [self.seq_idx, self.tar_scalar, self.solution, theano.In(h, value=self.h0), theano.In(c, value=self.c0)],
                outputs = [self.loss, self.loss_sen, self.loss_l2],
                updates = self.updates,
                on_unused_input='warn')

        self.func_test = theano.function(
                inputs = [self.seq_idx, self.tar_scalar, theano.In(h, value=self.h0), theano.In(c, value=self.c0)],
                outputs = self.pred_for_test,
                on_unused_input='warn')
Example #3
  def evaluate(self, O, Y, N):
    
    # w is the next word in the training data
    pw = O[np.arange(0, self.batch_size), Y]
    qw = self.noise_dist[Y]
    # wb is the noise word in the noise samples
    pwb = T.take(O, N) # (noise_sample_size, )
    qwb = T.take(self.noise_dist, N) # (noise_sample_size, )
    
    # P(D = 1 | c, w)
    pd1 = pw / (pw + self.noise_sample_size * qw) # (batch_size, )
    # P(D = 0 | c, wb)
    pd0 = (self.noise_sample_size * qwb) / (pwb + self.noise_sample_size * qwb) # (noise_sample_size, )

    return T.sum(T.log(pd1) + T.sum(T.log(pd0))) # scalar
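A plain-numpy sketch (illustrative values, not taken from the class above) of the noise-contrastive estimation posteriors the comments describe: with k noise samples, a data word w gets P(D=1 | c, w) = p(w) / (p(w) + k*q(w)) and a noise word wb gets P(D=0 | c, wb) = k*q(wb) / (p(wb) + k*q(wb)). In the method above the noise samples are shared across the batch, so the scalar noise term is broadcast onto every element of log(pd1) before the outer sum.

import numpy as np

k = 5                                  # noise_sample_size
pw  = np.array([0.20, 0.05])           # model scores of the observed words (batch of 2)
qw  = np.array([0.01, 0.02])           # noise-distribution probabilities of those words
pwb = np.array([0.03, 0.01, 0.04])     # model scores of the sampled noise words
qwb = np.array([0.02, 0.02, 0.05])     # noise probabilities of the noise words

pd1 = pw / (pw + k * qw)               # P(D = 1 | c, w)
pd0 = (k * qwb) / (pwb + k * qwb)      # P(D = 0 | c, wb)
objective = np.sum(np.log(pd1)) + np.sum(np.log(pd0))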
Example #4
    def symbolic_call(self,x,u):

        u = TT.clip(u, -self.max_force, self.max_force) #pylint: disable=E1111

        dt = self.dt

        z = TT.take(x,0,axis=x.ndim-1)
        zdot = TT.take(x,1,axis=x.ndim-1)    
        th = TT.take(x,2,axis=x.ndim-1)
        thdot = TT.take(x,3,axis=x.ndim-1)
        u0 = TT.take(u,0,axis=u.ndim-1)

        th1 = np.pi - th

        g = 10.
        mc = 1. # mass of cart
        mp = .1 # mass of pole
        muc = .0005 # coeff friction of cart
        mup = .000002 # coeff friction of pole
        l = 1. # length of pole

        def sign(x):
            return TT.switch(x>0, 1, -1)

        thddot = -(-g*TT.sin(th1)
                   + TT.cos(th1) * (-u0 - mp * l *thdot**2 * TT.sin(th1) + muc*sign(zdot))/(mc+mp)
                   - mup*thdot / (mp*l)) \
                  / (l*(4/3. - mp*TT.cos(th1)**2 / (mc + mp)))
        zddot = (u0 + mp*l*(thdot**2 * TT.sin(th1) - thddot * TT.cos(th1)) - muc*sign(zdot)) \
                / (mc+mp)

        newzdot = zdot + dt*zddot
        newz = z + dt*newzdot
        newthdot = thdot + dt*thddot
        newth = th + dt*newthdot

        done = (z > self.max_cart_pos) | (z < -self.max_cart_pos) | (th > self.max_pole_angle) | (th < -self.max_pole_angle) 

        ucost = 1e-5*(u**2).sum(axis=u.ndim-1)
        xcost = 1-TT.cos(th)
        # notdone = TT.neg(done) #pylint: disable=W0612,E1111
        notdone = 1-done
        costs = TT.stack((done-1)*10., notdone*xcost, notdone*ucost).T #pylint: disable=E1103


        newx = TT.stack(newz, newzdot, newth, newthdot).T #pylint: disable=E1103

        return [newx,newx,costs,done]
Example #5
    def _fetch_class_embeddings(self, indices):
        assert indices is not None

        # For the single positive example, indices is a vector of
        # just (batch_size,), when using negative sampling, it is a
        # two-dimensional tensor of size (batch_size, num_neg_samples).
        #
        # target_embeddings is a tensor of size
        # (batch_size, num_candidates, embedding_size)
        target_embeddings = T.take(
            self.entity_representations, indices, axis=0)

        if indices.ndim == 1:
            # This reshapes to (batch_size,
            #                   num_candidates=1,
            #                   embedding_size).
            target_embeddings = target_embeddings.dimshuffle(0, 'x', 1)

        assert target_embeddings.ndim == 3

        # At this stage, target_embeddings is a tensor of size
        # (batch_size, num_candidates, embedding_size) for all cases.
        target_embeddings = target_embeddings.dimshuffle(0, 'x', 1, 2)

        # Now, target_embeddings is of size
        # (batch_size, window_size=1, num_candidates, embedding_size).
        assert target_embeddings.ndim == 4

        return target_embeddings
Example #6
 def absolute(self, class1):
     # We have to figure out how to transform class1 into a Theano variable that represents the class score
     # print(self.model.layers[36])
     # print(self.model_layer_up)
     # output = self.model.layers[36].output
     Q = T.take(self.model_layer_up, class1, axis=1)
     return self.for_quantity(Q)
Example #7
    def _fetch_class_embeddings(self, indices):
        assert indices is not None

        # For the single positive example, indices is a vector of
        # just (batch_size,), when using negative sampling, it is a
        # two-dimensional tensor of size (batch_size, num_neg_samples).
        #
        # target_embeddings is a tensor of size
        # (batch_size, num_candidates, embedding_size)
        target_embeddings = T.take(self.entity_representations,
                                   indices,
                                   axis=0)

        if indices.ndim == 1:
            # This reshapes to (batch_size,
            #                   num_candidates=1,
            #                   embedding_size).
            target_embeddings = target_embeddings.dimshuffle(0, 'x', 1)

        assert target_embeddings.ndim == 3

        # At this stage, target_embeddings is a tensor of size
        # (batch_size, num_candidates, embedding_size) for all cases.
        target_embeddings = target_embeddings.dimshuffle(0, 'x', 1, 2)

        # Now, target_embeddings is of size
        # (batch_size, window_size=1, num_candidates, embedding_size).
        assert target_embeddings.ndim == 4

        return target_embeddings
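A minimal sketch (toy shapes, illustrative names) of the dimshuffle pattern used in both versions of this method: 'x' inserts a new broadcastable axis, which is how the (batch_size, embedding_size) result of T.take becomes (batch_size, 1, embedding_size) and finally a 4-d tensor.

import numpy as np
import theano
import theano.tensor as T

entity_representations = theano.shared(
    np.random.randn(10, 4).astype(theano.config.floatX))
indices = T.lvector('indices')                          # (batch_size,)

emb = T.take(entity_representations, indices, axis=0)   # (batch_size, embedding_size)
emb = emb.dimshuffle(0, 'x', 1)                         # (batch_size, 1, embedding_size)
emb = emb.dimshuffle(0, 'x', 1, 2)                      # (batch_size, 1, 1, embedding_size)

f = theano.function([indices], emb)
assert f(np.array([1, 3, 5], dtype=np.int64)).shape == (3, 1, 1, 4)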
Example #8
    def _nll_mixed(self, hid, X, ftypes = None, mask = None, params = None):
        """
        Calculate Negative Log Likelihood of Mixed Data [Binary and Real-Valued]
        Model:  hid is a function of model parameters 
        Shapes: hid.shape[1] =  [#binary, #real, #real]   
        
        Added support for 2D & 3D observations 
        * Add support for allowing a fixed covariance (not learned)
        * Add support for modeling multinomial (categorical) RVs
        """
        if ftypes is None:
            raise ValueError('Expecting feature_types to be specified as a list')
        if not np.all(np.unique(ftypes)==np.unique(['binary','continuous'])):
            raise ValueError('Check data types - only binary and real supported')
        binary_idx = np.where(ftypes=='binary')[0]    
        real_idx   = np.where(ftypes=='continuous')[0]
        lbin, lreal= len(binary_idx), len(real_idx)
        mask_bin, mask_real = None, None
        if mask is not None:
            mask_bin = T.take(mask, binary_idx, axis=-1)
            mask_real= T.take(mask, real_idx, axis=-1)
        X_bin        = T.take(X,binary_idx,axis=-1)
        binary_hid   = T.take(hid, T.arange(lbin), axis=-1) 
        nll_bin      = self._nll_binary(binary_hid, X_bin, params = params, mask=mask_bin) 

        X_real            = T.take(X, real_idx, axis=-1)
        mu_idx            = T.arange(lbin,lbin+lreal)
        real_hid_mu       = T.take(hid, mu_idx,axis=-1)
        logcov_idx        = T.arange(lbin+lreal,lbin+2*lreal)
        real_hid_logcov   = T.take(hid,logcov_idx,axis=-1)
        nll_real          = self._nll_gaussian(real_hid_mu, real_hid_logcov, X_real, params = params, mask=mask_real)
        return T.concatenate([nll_bin, nll_real], axis=-1)
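A plain-numpy sketch (toy data, separate from the class above) of the column layout the docstring describes: the first lbin columns of hid parameterize the binary features, the next lreal columns are the Gaussian means, and the last lreal columns are the log-covariances.

import numpy as np

ftypes = np.array(['binary', 'continuous', 'binary', 'continuous'])
binary_idx = np.where(ftypes == 'binary')[0]
real_idx   = np.where(ftypes == 'continuous')[0]
lbin, lreal = len(binary_idx), len(real_idx)

hid = np.random.randn(5, lbin + 2 * lreal)                # [#binary, #real, #real]
binary_hid      = np.take(hid, np.arange(lbin), axis=-1)
real_hid_mu     = np.take(hid, np.arange(lbin, lbin + lreal), axis=-1)
real_hid_logcov = np.take(hid, np.arange(lbin + lreal, lbin + 2 * lreal), axis=-1)
assert binary_hid.shape[-1] == lbin and real_hid_mu.shape[-1] == lreal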
Example #9
    def dex_cost(self, I, dex_lam=0.00):
        """
        Simple exemplar-svm-like function to optimize.

        This loss is based on unnormalized grounded density
        estimation via Negative Sampling -- Noise-Contrastive Estimation.
        """
        #assert(I.shape[0] == self.X_in.shape[0])
        Wt = T.take(self.W, I, axis=0)
        bt = T.take(self.b, I)
        k = I.size - 1
        F = T.dot(self.X_in, Wt.T) + bt
        #F = T.dot(self.X_in, self.X_in.T)
        mask = T.ones_like(F) - T.identity_like(F)
        dex_loss = T.sum((mask * F) + T.log(1.0 + k*T.exp(-F))) / (k + 1)
        reg_loss = dex_lam * T.sum(F**2.0) / (k + 1)
        C = dex_loss + reg_loss
        self.dW = T.grad(C, Wt)
        self.db = T.grad(C, bt)
        return C
Example #10
def extend_axis_rev(array, axis):
    if axis < 0:
        axis = axis % array.ndim
    assert axis >= 0 and axis < array.ndim

    n = array.shape[axis]
    last = tt.take(array, [-1], axis=axis)

    sum_vals = -last * np.sqrt(n)
    norm = sum_vals / (np.sqrt(n) + n)
    slice_before = (slice(None, None), ) * axis
    return array[slice_before + (slice(None, -1), )] + norm
Example #11
 def _construct_post_pea_costs(self):
     """
     Construct the pseudo-ensemble agreement cost on the categorical
     distributions output by the label generator.
     """
     # get the two sets of predictions for the input batch, with each set
     # of predictions given by independent passes through the noisy nets
     b_size = self.Yp2.shape[0] / 2
     idx = T.arange(0, stop=b_size)
     x1 = T.take(self.Yp2, idx, axis=0)
     x2 = T.take(self.Yp2, idx+b_size, axis=0)
     # construct a mask that zeros-out unsupervised rows
     row_idx = T.arange(self.Yd.shape[0])
     row_mask = T.neq(self.Yd, 0).reshape((self.Yd.shape[0], 1))
     # compute PEA reg costs for supervised and unsupervised points
     pea_costs = (smooth_kl_divergence(x1, x2, lam_smooth=5e-3) + \
             smooth_kl_divergence(x2, x1, lam_smooth=5e-3)) / 2.0
     pea_cost_su = T.sum(row_mask * pea_costs) / (T.sum(row_mask) + 1e-4)
     pea_cost_un = T.sum((1.0 - row_mask) * pea_costs) / \
             (T.sum(1.0 - row_mask) + 1e-4)
     pea_costs = [pea_cost_su, pea_cost_un]
     return pea_costs
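A plain-numpy sketch (toy values) of the row split performed above: the batch holds two independent noisy passes stacked along axis 0, and the two T.take calls with idx and idx + b_size recover the first and second pass respectively.

import numpy as np

b_size = 3
Yp2 = np.arange(2 * b_size * 4).reshape(2 * b_size, 4)   # two noisy passes, stacked row-wise
idx = np.arange(b_size)
x1 = np.take(Yp2, idx, axis=0)                           # predictions from the first pass
x2 = np.take(Yp2, idx + b_size, axis=0)                  # predictions from the second pass
assert (x1 == Yp2[:b_size]).all() and (x2 == Yp2[b_size:]).all()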
Example #12
    def symbolic_call(self,x,u):
        dt = self.dt

        a = TT.take(x,0,axis=x.ndim-1)
        adot = TT.take(x,1,axis=x.ndim-1)
        g = 10.
        m = 1.
        l = 1.

        u = TT.clip(u, -self.max_torque, self.max_torque) #pylint: disable=E1111

        newadot = adot + (-3*g/(2*l) * TT.sin(a + np.pi) + 3./(m*l**2)*TT.take(u,0,axis=u.ndim-1)) * dt
        newa = a + newadot*dt
        newadot = TT.clip(newadot, -self.max_speed, self.max_speed) #pylint: disable=E1111

        newx = TT.stack(newa, newadot).T #pylint: disable=E1103


        x0 = TT.take(x,0,axis=x.ndim-1)
        x1 = TT.take(x,1,axis=x.ndim-1)
        costs = TT.stack(angle_normalize(x0)**2 + .1*x1**2, .001*(u**2).sum(axis=u.ndim-1)).T #pylint: disable=E1103

        return [newx, newx, costs]
Example #13
def tensor_gather_helper(gather_indices, gather_from, batch_size, range_size,
                         gather_shape):
    """
    :param gather_from: [batch_size, range_size, ...]

    :param gather_indices: [batch_size, beam_size]
    """

    range_ = (T.arange(batch_size) * range_size)[:, None]  # [batch_size, 1]
    gather_indices_ = (gather_indices +
                       range_).flatten()  # [batch_size * range_size]

    output = T.take(gather_from.reshape(gather_shape), gather_indices_, axis=0)

    final_shape = gather_from.shape[:1 + len(gather_shape)]
    output = output.reshape(final_shape)

    return output
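A plain-numpy sketch (toy shapes) of the gather trick above: each row of gather_indices is offset by batch_index * range_size so that a single flat take along axis 0 of the reshaped tensor picks the right rows for every batch element.

import numpy as np

batch_size, range_size, dim = 2, 3, 4
gather_from = np.arange(batch_size * range_size * dim).reshape(batch_size, range_size, dim)
gather_indices = np.array([[2, 0, 1],
                           [1, 1, 0]])                     # [batch_size, beam_size]

range_ = (np.arange(batch_size) * range_size)[:, None]     # [[0], [3]]
gather_indices_ = (gather_indices + range_).flatten()
output = np.take(gather_from.reshape(-1, dim), gather_indices_, axis=0)
output = output.reshape(gather_from.shape)

assert (output[0, 0] == gather_from[0, 2]).all()
assert (output[1, 2] == gather_from[1, 0]).all()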
Example #14
 def cost_rates(self, x,u):
     v = TT.take(x,5,axis=1)
     return [-self.lin_vel_coeff*v, self.ctrl_cost_coeff*(u**2).sum(axis=1)]
Example #15
def false_vector(x):
    """return tensor with last dimension removed, with all entries = False"""
    return TT.take(x,0,axis=x.ndim-1) > np.inf
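A hedged usage sketch (it assumes false_vector and its module-level numpy import are in scope): nothing compares greater than np.inf, so the result is an all-False boolean tensor whose shape is x's shape with the last dimension dropped.

import numpy as np
import theano
import theano.tensor as TT

x = TT.matrix('x')
f = theano.function([x], false_vector(x))
out = f(np.ones((3, 4), dtype=theano.config.floatX))
assert out.shape == (3,) and not out.any()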
Example #16
def take(x, indices, axis=None):
    return T.take(x, indices, axis=axis)
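A hedged usage sketch (it assumes the take wrapper above is in scope): the wrapper simply forwards to theano.tensor.take, so it follows numpy.take semantics, including axis=None indexing into the flattened tensor.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
f_rows = theano.function([x], take(x, [0, 2], axis=0))
f_flat = theano.function([x], take(x, [1, 5]))   # axis=None -> index into the flattened tensor

data = np.arange(12., dtype=theano.config.floatX).reshape(3, 4)
assert np.allclose(f_rows(data), np.take(data, [0, 2], axis=0))
assert np.allclose(f_flat(data), np.take(data, [1, 5]))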
Example #17
  def __theano_init__(self):
    X = T.lmatrix('X') # (batch_size, n_gram)
    Y = T.lvector('Y') # (batch_size, )
    N = T.lmatrix('N') # (batch_size, noise_sample_size)
    CC = T.tile(self.C, (1, self.n_gram)) # (hidden_dim1, word_dim * n_gram)
    CCb = T.tile(self.Cb, (1, self.batch_size)) # (hidden_dim1, batch_size)
    MMb = T.tile(self.Mb, (1, self.batch_size)) # (hidden_dim2, batch_size)
    EEb = T.tile(self.Eb, (1, self.batch_size)) # (vocab_size, batch_size)
    
    Du = self.D.take(X.T, axis = 1).T # (batch_size, n_gram, word_dim)
    h1 = T.nnet.relu(CC.dot(T.flatten(Du, outdim=2).T) + CCb) # (hidden_dim1, batch_size)
    h2 = T.nnet.relu(self.M.dot(h1) + MMb) # (hidden_dim2, batch_size)
    O = T.exp(self.E.dot(h2) + EEb).T # (batch_size, vocab_size)

    """
    # x is integer vector representing the n-gram -- each element is an index of a word
    def fprop_step(x, D, CC, M, E, b, n_gram, word_dim):
      Du = D.take(x, axis=1)
      h1 = T.nnet.relu(CC.dot(T.reshape(Du, (n_gram * word_dim, 1))))
      h2 = T.nnet.relu(M.dot(h1))
      return T.exp(E.dot(h2) + b).T[0] # r for raw distribution, a.k.a unnormalized 

    (O, _) = theano.scan(fn = fprop_step,
      sequences = X,
      outputs_info = None,
      non_sequences = [self.D, CC, self.M, self.E, self.b, self.n_gram, self.word_dim],
      strict=True) # (batch_size, vocab_size)
    """

    predictions = T.argmax(O, axis=1)
    xent = T.sum(T.nnet.categorical_crossentropy(O, Y))

    YY = Y + self.offset # offset indexes used to construct pw and qw
    NN = N + T.tile(self.offset, (self.noise_sample_size, 1)).T # offset indexes used to construct pwb and qwb

    pw = T.take(O, YY) # (batch_size, )
    qw = T.take(self.noise_dist, Y) # (batch_size, )
    pwb = T.take(O, NN) # (batch_size, noise_sample_size)
    qwb = T.take(self.noise_dist, N) # (batch_size, noise_sample_size)
    
    pd1 = pw / (pw + self.noise_sample_size * qw) # (batch_size, )
    pd0 = (self.noise_sample_size * qwb) / (pwb + self.noise_sample_size * qwb) # (batch_size, noise_sample_size)

    loss = T.sum(T.log(pd1) + T.sum(T.log(pd0), axis=1)) # scalar
    dD = T.grad(loss, self.D)
    dC = T.grad(loss, self.C)
    dM = T.grad(loss, self.M)
    dE = T.grad(loss, self.E)
    dCb = T.grad(loss, self.Cb)
    dMb = T.grad(loss, self.Mb)
    dEb = T.grad(loss, self.Eb)
    
    lr = T.scalar('lr', dtype=theano.config.floatX)

    self.pred = theano.function(inputs = [X], outputs = predictions)
    self.xent = theano.function(inputs = [X, Y], outputs = xent)
    self.loss = theano.function(inputs = [X, Y, N], outputs = loss)
    self.backprop = theano.function(inputs = [X, Y, N], outputs = [dD, dC, dM, dE, dCb, dMb, dEb])
    self.sgd = theano.function(inputs = [X, Y, N, lr], outputs = [], 
        updates = [
            (self.D, self.D + lr * dD),
            (self.C, self.C + lr * dC),
            (self.M, self.M + lr * dM),
            (self.E, self.E + lr * dE),
            (self.Cb, self.Cb + lr * dCb), 
            (self.Mb, self.Mb + lr * dMb), 
            (self.Eb, self.Eb + lr * dEb), 
            ])
    self.weights = theano.function(inputs = [], outputs = [self.D, self.C, self.M, self.E, self.Cb, self.Mb, self.Eb])
Example #18
    def init_function(self):

        self.seq_loc = T.lvector()
        self.seq_idx = T.lvector()
        self.target = T.lvector()
        self.target_content_index = T.lscalar()
        self.seq_len = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)

        self.all_tar_vector = T.take(self.Vw, self.target, axis=0)
        self.tar_vector = T.mean(self.all_tar_vector, axis=0)
        self.target_vector_dim = self.tar_vector.dimshuffle('x', 0)
        self.seq_matrix = T.concatenate([self.seq_matrix[0:self.target_content_index], self.target_vector_dim,
                                         self.seq_matrix[self.target_content_index + 1:]], axis=0)
        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc,
                                                                               dtype=theano.config.floatX)

        def rnn(X, aspect):
            def encode_forward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            def encode_backward(x_t, h_fore, c_fore):
                v = T.concatenate([h_fore, x_t])
                f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
                i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
                o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
                c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
                h_next = o_t * T.tanh(c_next)
                return h_next, c_next

            loc_for = T.zeros_like(self.seq_loc) + self.target_content_index
            al_for = self.a_for_left * T.exp(
                -self.b_for_left * T.abs_(
                    self.seq_loc[0:self.target_content_index] - loc_for[0:self.target_content_index]))
            am_for = self.a_for_middle * [1]
            a_for = T.concatenate([al_for, am_for])
            locate_for = T.zeros_like(self.seq_matrix[0:self.target_content_index + 1],
                                      dtype=T.config.floatX) + T.reshape(a_for, [-1, 1])
            loc_back = T.zeros_like(self.seq_loc) + self.target_content_index
            ar_back = self.a_back_right * T.exp(
                -self.b_back_right * T.abs_(
                    self.seq_loc[self.target_content_index + 1:] - loc_back[self.target_content_index + 1:]))
            ar_back = ar_back[::-1]
            a_back = T.concatenate([am_for, ar_back])
            locate_back = T.zeros_like(self.seq_matrix[self.target_content_index:], dtype=T.config.floatX) + T.reshape(
                a_back, [-1, 1])

            scan_result_forward, _forward = theano.scan(fn=encode_forward,
                                                        sequences=locate_for * X[0:self.target_content_index + 1],
                                                        outputs_info=[h, c])
            scan_result_backward, _backward = theano.scan(fn=encode_backward,
                                                          sequences=locate_back * X[self.target_content_index:][::-1],
                                                          outputs_info=[h, c])
            embedding_l = scan_result_forward[0]
            embedding_r = scan_result_backward[0]
            h_target_for = embedding_l[-1]
            h_target_back = embedding_r[-1]

            attention_h_target_l = embedding_l
            cont_l = T.concatenate([h_target_for, h_target_back])
            yuyi_l = T.transpose(cont_l)
            alpha_h_l = T.dot(T.dot(attention_h_target_l, self.alpha_h_W_L), yuyi_l)
            alpha_tmp_l = T.nnet.softmax(alpha_h_l)
            r_l = T.dot(alpha_tmp_l, embedding_l)
            h_star_L = T.tanh(T.dot(r_l, self.Wp_L))

            attention_h_target_r = embedding_r
            cont_r = T.concatenate([h_target_for, h_target_back])
            yuyi_r = T.transpose(cont_r)

            alpha_h_r = T.dot(T.dot(attention_h_target_r, self.alpha_h_W_R), yuyi_r)
            alpha_tmp_r = T.nnet.softmax(alpha_h_r)
            r_r = T.dot(alpha_tmp_r, embedding_r)
            h_star_R = T.tanh(T.dot(r_r, self.Wp_R))
            embedding = T.concatenate([h_star_L, h_star_R],
                                      axis=1)
            return embedding

        embedding = rnn(self.seq_matrix, self.tar_vector)
        embedding_for_train = embedding * self.srng.binomial(embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5

        self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param ** 2) for param in self.params]) - T.sum(self.Vw ** 2)
        self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                                         dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
            inputs=[self.seq_idx, self.target, self.solution,
                    self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')

        self.func_test = theano.function(
            inputs=[self.seq_idx, self.target, self.target_content_index, self.seq_loc, self.seq_len,
                    theano.In(h, value=self.h0),
                    theano.In(c, value=self.c0)],
            outputs=self.pred_for_test,
            on_unused_input='warn')
Example #19
    def build_model(self):
        views_curr = T.tensor4('views')
        action_hists_curr = T.tensor4('action_hists')
        actions = T.icol('actions')
        views_next = T.tensor4('next_views')
        action_hists_next = T.tensor4('next_action_hists')
        rewards = T.col('rewards')
        terminals = T.icol('terminals')

        # initialize network(s) for computing q-values
        net_online_in_view, net_online_in_action_hist, self.net_online_out, self.all_layers = \
         self.build_network(self.network_builder, self.view_size, self.action_hist_size)
        net_online_in_curr = {net_online_in_view: views_curr, net_online_in_action_hist: action_hists_curr} \
         if self.action_hist_size.w > 0 else {net_online_in_view: views_curr}
        q_vals_online_curr_train = lasagne.layers.get_output(
            self.net_online_out, net_online_in_curr, deterministic=False)
        q_vals_online_curr_test = lasagne.layers.get_output(
            self.net_online_out, net_online_in_curr, deterministic=True)
        # for predictions we always use the q-values estimated by the online network on the current state
        q_vals_pred_train = q_vals_online_curr_train
        q_vals_pred_test = q_vals_online_curr_test
        if self.clone_interval > 0:
            net_target_in_view, net_target_in_action_hist, self.net_target_out, _ = \
             self.build_network(self.network_builder, self.view_size, self.action_hist_size)
            self._clone()
            net_target_in_next = {net_target_in_view: views_next, net_target_in_action_hist: action_hists_next} \
             if self.action_hist_size.w > 0 else {net_target_in_view: views_next}
            # predict q-values for next state with target network
            q_vals_target_next = lasagne.layers.get_output(
                self.net_target_out, net_target_in_next)
            if self.double_q:
                # Double Q-Learning:
                # use online network to choose best action on next state (q_vals_target_argmax)...
                net_online_in_next = {net_online_in_view: views_next, net_online_in_action_hist: action_hists_next} \
                 if self.action_hist_size.w > 0 else {net_online_in_view: views_next}
                q_vals_online_next = lasagne.layers.get_output(
                    self.net_online_out, net_online_in_next)
                q_vals_target_argmax = T.argmax(q_vals_online_next,
                                                axis=1,
                                                keepdims=False)
                # ...but use target network to estimate q-values for these actions
                q_vals_target = T.diagonal(
                    T.take(q_vals_target_next, q_vals_target_argmax,
                           axis=1)).reshape((-1, 1))
            else:
                q_vals_target = T.max(q_vals_target_next,
                                      axis=1,
                                      keepdims=True)
        else:
            net_target_in_next = {net_online_in_view: views_next, net_online_in_action_hist: action_hists_next} \
             if self.action_hist_size.w > 0 else {net_online_in_view: views_next}
            q_vals_online_next = lasagne.layers.get_output(
                self.net_online_out, net_target_in_next)
            q_vals_target = T.max(q_vals_online_next, axis=1, keepdims=True)
        # define loss computation
        actionmask = T.eq(
            T.arange(len(self.actions)).reshape((1, -1)),
            actions.reshape((-1, 1))).astype(theano.config.floatX)
        terminals_float = terminals.astype(theano.config.floatX)
        target = rewards + \
           (T.ones_like(terminals_float) - terminals_float) * \
           self.discount * q_vals_target
        output = (q_vals_pred_train * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output
        if self.clip_delta > 0:
            # see https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = diff**2

        # regularization
        if self.all_layers is not None and self.regularization > 0:
            l2reg = 0
            for lll in self.all_layers:
                l2reg += regularize_layer_params(lll, l2) * self.regularization
            loss = T.mean(loss) + l2reg  # batch accumulator sum or mean
        else:
            loss = T.mean(loss)

        # define network update for training
        params = lasagne.layers.helper.get_all_params(self.net_online_out,
                                                      trainable=True)
        updates = self.optimizer(loss, params)
        train_givens = self.shared_batch.givens(views_curr, action_hists_curr,
                                                actions, views_next,
                                                action_hists_next, rewards,
                                                terminals)
        self.train_fn = theano.function([], [loss],
                                        updates=updates,
                                        givens=train_givens)

        # define output prediction
        predict_givens = self.shared_state.givens(views_curr,
                                                  action_hists_curr)
        self.predict_fn = theano.function([],
                                          q_vals_pred_test[0],
                                          givens=predict_givens)
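A plain-numpy sketch (toy values) of the double Q-learning gather above: taking columns at the per-row argmax positions and then reading the diagonal selects, for each batch row i, q_vals_target_next[i, q_vals_target_argmax[i]].

import numpy as np

q_vals_target_next = np.array([[1., 5., 2.],
                               [3., 0., 4.]])
q_vals_target_argmax = np.array([1, 2])          # online network's best action per row
q_vals_target = np.diagonal(
    np.take(q_vals_target_next, q_vals_target_argmax, axis=1)).reshape(-1, 1)
assert (q_vals_target == np.array([[5.], [4.]])).all()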
Example #20
    def init_function(self):
        self.seq_L_idx = T.lvector()
        self.seq_R_idx = T.lvector()
        self.tar_scalar = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix_L = T.take(self.Vw, self.seq_L_idx, axis=0)
        self.seq_matrix_R = T.take(self.Vw, self.seq_R_idx, axis=0)
        self.tar_vector = T.take(self.Va, self.tar_scalar, axis=0)

        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(
            self.bc, dtype=theano.config.floatX)

        def encode(x_t, h_fore, c_fore, tar_vec):
            v = T.concatenate([h_fore, x_t, tar_vec])
            f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
            i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
            o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
            c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next

        def encode_R(x_t, h_fore, c_fore, tar_vec):
            v = T.concatenate([h_fore, x_t, tar_vec])
            f_t = T.nnet.sigmoid(T.dot(self.Wf_R, v) + self.bf_R)
            i_t = T.nnet.sigmoid(T.dot(self.Wi_R, v) + self.bi_R)
            o_t = T.nnet.sigmoid(T.dot(self.Wo_R, v) + self.bo_R)
            c_next = f_t * c_fore + i_t * T.tanh(
                T.dot(self.Wc_R, v) + self.bc_R)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next

        scan_result_L, _ = theano.scan(fn=encode,
                                       sequences=[self.seq_matrix_L],
                                       outputs_info=[h, c],
                                       non_sequences=[self.tar_vector])
        embedding_L = scan_result_L[0]  # matrix of hidden states [h_1, ..., h_n]

        scan_result_R, _ = theano.scan(fn=encode_R,
                                       sequences=[self.seq_matrix_R],
                                       outputs_info=[h, c],
                                       non_sequences=[self.tar_vector])
        embedding_R = scan_result_R[0]  # matrix of hidden states [h_1, ..., h_n]

        hstar_L = embedding_L[-1]
        hstar_R = embedding_R[-1]
        embedding = T.tanh(hstar_L + hstar_R)
        # attention
        # matrix_aspect = T.zeros_like(embedding, dtype=theano.config.floatX)[:,:self.dim_aspect] + self.tar_vector
        # hhhh = T.concatenate([T.dot(embedding, self.Wh), T.dot(matrix_aspect, self.Wv)], axis=1)
        # M_tmp = T.tanh(hhhh)
        # alpha_tmp = T.nnet.softmax(T.dot(M_tmp, self.w))
        # r = T.dot(alpha_tmp, embedding)
        # h_star = T.tanh(T.dot(r, self.Wp) + T.dot(embedding[-1], self.Wx))
        # embedding = h_star # embedding in there is a vector, represent h_n_star

        # dropout
        embedding_for_train = embedding * self.srng.binomial(
            embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5

        self.pred_for_train = T.nnet.softmax(
            T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(
            T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param**2)
                       for param in self.params]) - T.sum(self.Vw**2)
        self.loss_sen = -T.tensordot(
            self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                    dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
            inputs=[
                self.seq_L_idx, self.seq_R_idx, self.tar_scalar, self.solution,
                theano.In(h, value=self.h0),
                theano.In(c, value=self.c0)
            ],
            outputs=[self.loss, self.loss_sen, self.loss_l2],
            updates=self.updates,
            on_unused_input='warn')

        self.func_test = theano.function(inputs=[
            self.seq_L_idx, self.seq_R_idx, self.tar_scalar,
            theano.In(h, value=self.h0),
            theano.In(c, value=self.c0)
        ],
                                         outputs=self.pred_for_test,
                                         on_unused_input='warn')
Example #21
def ctc_objective(y_pred, y, y_pred_mask=None, y_mask=None, batch=True):
	''' CTC objective.

	Parameters
	----------
	y_pred : [nb_samples, in_seq_len, nb_classes+1]
		softmax probabilities
	y : [nb_samples, out_seq_len]
		output sequences
	y_mask : [nb_samples, out_seq_len]
		mask decides which labels in y are included (0 for ignore, 1 for keep)
	y_pred_mask : [nb_samples, in_seq_len]
		mask decides which samples in input sequence are used
	batch : True/False
		if batching is not used, nb_samples=1
		Note: the implementation without batch support is more reliable

	Returns
	-------
	grad_cost : the cost you calculate gradient on
	actual_cost : the cost for monitoring model performance (*NOTE: do not calculate
		gradient on this cost)

	Note
	----
	According to @Richard Kurle:
		test error of 38% with 1 bidirectional LSTM layer or with a stack of 3,
		but I could not reproduce the results reported in Graves' paper.

		If you get blanks only, you probably just have bad hyperparameters or you
		did not wait enough epochs. At the beginning of training,
		only the cost decreases but you don't yet see any characters popping up.

		You will need gradient clipping to prevent exploding gradients as well.
	'''
	y_pred_mask = y_pred_mask if y_pred_mask is not None else T.ones((y_pred.shape[0], y_pred.shape[1]), dtype=floatX)
	y_mask = y_mask if y_mask is not None else T.ones(y.shape, dtype=floatX)
	if batch:
		# ====== reshape input ====== #
		y_pred = y_pred.dimshuffle(1, 0, 2)
		y_pred_mask = y_pred_mask.dimshuffle(1, 0)
		y = y.dimshuffle(1, 0)
		y_mask = y_mask.dimshuffle(1, 0)

		# ====== calculate cost ====== #
		grad_cost = _pseudo_cost(y, y_pred, y_mask, y_pred_mask, False)
		grad_cost = grad_cost.mean()
		monitor_cost = _cost(y, y_pred, y_mask, y_pred_mask, True)
		monitor_cost = monitor_cost.mean()

		return grad_cost, monitor_cost
	else:
		y = T.cast(y, dtype='int32')

		# batch_size=1 => just take [0] to reduce 1 dimension
		y_pred = y_pred[0]
		y_pred_mask = y_pred_mask[0]
		y = y[0]
		y_mask = y_mask[0]

		# after take, ndim goes from 2 up to 3, so it needs to be reduced back to 2
		y_pred = T.take(y_pred, T.nonzero(y_pred_mask, return_matrix=True), axis=0)[0]
		y = T.take(y, T.nonzero(y_mask, return_matrix=True), axis=0).ravel()

		return _cost_no_batch(y_pred, y)
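A self-contained sketch (toy shapes) of the masking step used in the non-batch branch above: T.nonzero(..., return_matrix=True) returns the indices of the kept timesteps as a (1, n) matrix, so the take lifts the result from 2 to 3 dimensions and the trailing [0] brings it back down.

import numpy as np
import theano
import theano.tensor as T

y_pred = T.matrix('y_pred')             # (in_seq_len, nb_classes+1), batch axis already dropped
y_pred_mask = T.vector('y_pred_mask')   # 1 keeps a timestep, 0 drops it

kept = T.take(y_pred, T.nonzero(y_pred_mask, return_matrix=True), axis=0)[0]
f = theano.function([y_pred, y_pred_mask], kept)

probs = np.arange(12., dtype=theano.config.floatX).reshape(4, 3)
mask = np.array([1., 1., 0., 1.], dtype=theano.config.floatX)
assert f(probs, mask).shape == (3, 3)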
Example #22
 def apply(self, input_):
     if self._dot:
         return tensor.dot(input_, self._matrix)
     else:
         return tensor.take(input_, self._permutation, axis=1)
Example #23
    def init_function(self):
        self.seq_idx = T.lvector() 
        self.tar_scalar = T.lscalar()
        self.solution = T.matrix()
        self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)
        self.tar_vector = T.take(self.Vd, self.tar_scalar, axis=0)

        h, c = T.zeros_like(self.bf, dtype=theano.config.floatX), T.zeros_like(self.bc, dtype=theano.config.floatX)

        def encode(x_t, h_fore, c_fore, tar_vec):
            v = T.concatenate([h_fore, x_t])
            f_t = T.nnet.sigmoid(T.dot(self.Wf, v) + self.bf)
            i_t = T.nnet.sigmoid(T.dot(self.Wi, v) + self.bi)
            o_t = T.nnet.sigmoid(T.dot(self.Wo, v) + self.bo)
            c_next = f_t * c_fore + i_t * T.tanh(T.dot(self.Wc, v) + self.bc)
            h_next = o_t * T.tanh(c_next)
            return h_next, c_next

        scan_result, _ = theano.scan(fn=encode, sequences=[self.seq_matrix], outputs_info=[h, c], non_sequences=[self.tar_vector])
        embedding = scan_result[0]  # embedding here is a matrix containing [h_1, ..., h_n]

        # attention
        
        tmp1 = tensor.sum(self.tar_vector**2, axis=0).reshape((1, -1))
        tmp2 = tensor.sum(embedding**2, axis=1).reshape((1,-1))
        tmp3 = tensor.dot(tmp1, tmp2)

        e = tensor.dot(embedding, self.tar_vector) / tensor.sqrt(tmp3)
        alpha_tmp = T.nnet.softmax(e)
        '''
        matrix_doc = T.zeros_like(embedding, dtype=theano.config.floatX)[:,:self.dim_doc] + self.tar_vector
        hhhh = T.concatenate([T.dot(embedding, self.Wh), T.dot(matrix_doc, self.Wv)], axis=1)
        M_tmp = T.tanh(hhhh)
        alpha_tmp = T.nnet.softmax(T.dot(M_tmp, self.w))
        '''
        r = T.dot(alpha_tmp, embedding)
        h_star = T.tanh(T.dot(r, self.Wp) + T.dot(embedding[-1], self.Wx))
        embedding = h_star  # embedding here is a vector representing h_n_star
 
        # dropout
        embedding_for_train = embedding * self.srng.binomial(embedding.shape, p = 0.5, n = 1, dtype=embedding.dtype)
        embedding_for_test = embedding * 0.5
            
        self.pred_for_train = T.nnet.softmax(T.dot(embedding_for_train, self.Ws) + self.bs)
        self.pred_for_test = T.nnet.softmax(T.dot(embedding_for_test, self.Ws) + self.bs)

        self.l2 = sum([T.sum(param**2) for param in self.params]) - T.sum(self.Vw**2)
        self.loss_sen = -T.tensordot(self.solution, T.log(self.pred_for_train), axes=2)
        self.loss_l2 = 0.7 * self.l2 * self.regular
        self.loss = self.loss_sen + self.loss_l2

        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                    dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        self.func_train = theano.function(
                inputs = [self.seq_idx, self.tar_scalar, self.solution, theano.In(h, value=self.h0), theano.In(c, value=self.c0)],
                outputs = [self.loss, self.loss_sen, self.loss_l2],
                updates = self.updates,
                on_unused_input='warn')

        self.func_test = theano.function(
                inputs = [self.seq_idx, self.tar_scalar, theano.In(h, value=self.h0), theano.In(c, value=self.c0)],
                outputs = self.pred_for_test,
                on_unused_input='warn')
Example #24
 def dumb_update(self, I, learn_rate):
     """Theano doesn't like lookup tables."""
     T.inc_subtensor(T.take(self.W, I, axis=0), -learn_rate*self.dW, inplace=True)
     T.inc_subtensor(T.take(self.b, I), -learn_rate*self.db, inplace=True)
     return
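Because Theano expressions are symbolic, the inc_subtensor results built above only take effect once they are compiled into a function's updates (Example #26 below does exactly that). A hedged, self-contained sketch of that pattern with illustrative names:

import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.zeros((5, 3), dtype=theano.config.floatX))
I = T.ivector('I')
dW = T.matrix('dW')
learn_rate = 0.1

new_W = T.inc_subtensor(W[I], -learn_rate * dW)     # only the selected rows change
row_update = theano.function([I, dW], [],
                             updates=[(W, new_W)],
                             allow_input_downcast=True)
row_update([0, 3], np.ones((2, 3)))                 # rows 0 and 3 of W are decremented by 0.1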
Example #25
    def init_function(self):
        logging.info('init function...')

        self.data = T.lmatrix()
        self.label = T.vector()

        def encode(t, h_prev, c_prev):
            x_t = self.V_all[self.v_num[t[0]] + t[2]]
            i_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_i) + T.dot(h_prev, self.U_i) + self.b_i)
            f_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_f) + T.dot(h_prev, self.U_f) + self.b_f)
            c_c = T.tanh(
                T.dot(x_t, self.W_c) + T.dot(h_prev, self.U_c) + self.b_c)
            c = i_t * c_c + f_t * c_prev
            o_t = T.nnet.sigmoid(
                T.dot(x_t, self.W_o) + T.dot(h_prev, self.U_o) + self.b_o)
            h = o_t * T.tanh(c)
            return h, c


        [hf_history, _], _ = theano.scan(encode, sequences=self.data, \
                outputs_info=[dict(initial=T.zeros(self.dim_hidden)),\
                dict(initial=T.zeros(self.dim_hidden))])

        pred = T.nnet.softmax(T.dot(hf_history, self.W_hy) + self.b_hy)
        unit = theano.shared(np.array([[1.0/self.grained]*self.grained], \
                dtype=theano.config.floatX))
        pred_last = T.concatenate([unit, pred[:-1]], 0)

        def loss_kl(last, now, t):
            index = T.switch(T.eq(T.transpose(self.data)[0], t), 1, 0)
            dis = -T.sum(last * T.log(now), 1) - T.sum(T.log(last) * now, 1)
            return T.mean(
                T.maximum(0, dis - self.margin) * index) * self.innear

        self.loss_common = loss_kl(pred_last, pred, 3)

        v_sent = T.take(self.sentiment_vector, self.data[:, 0], 0)
        pred_last_sentiment = T.nnet.softmax(pred_last + v_sent)
        self.loss_sentiment = loss_kl(pred_last_sentiment, pred, 2)

        w_neg = T.take(self.W_neg,
                       T.minimum(self.num['negation'] - 1, self.data[:, 2]), 0)
        pred_last_negation = T.nnet.softmax(
            T.batched_tensordot(w_neg, pred_last, [[1], [1]]))
        self.loss_negation = loss_kl(pred_last_negation, pred, 0)

        w_int = T.take(self.W_int,
                       T.minimum(self.num['intensifier'] - 1, self.data[:, 2]),
                       0)
        pred_last_intensifier = T.nnet.softmax(
            T.batched_tensordot(w_int, pred_last, [[1], [1]]))
        self.loss_intensifier = loss_kl(pred_last_intensifier, pred, 1)

        self.loss_innear = self.loss_common + self.loss_sentiment \
                + self.loss_negation + self.loss_intensifier

        hf = hf_history[-1]
        embedding = hf

        self.use_noise = theano.shared(
            np.asarray(0., dtype=theano.config.floatX))

        if self.dropout == 1:
            embedding_for_train = embedding * self.srng.binomial(embedding.shape, \
                    p = 0.5, n = 1, dtype=embedding.dtype)
            embedding_for_test = embedding * 0.5
        else:
            embedding_for_train = embedding
            embedding_for_test = embedding

        self.pred_for_train = T.nnet.softmax(
            T.dot(embedding_for_train, self.W_hy) + self.b_hy)[0]
        self.pred_for_test = T.nnet.softmax(
            T.dot(embedding_for_test, self.W_hy) + self.b_hy)[0]

        self.l2 = sum([T.sum(param**2) for param in self.params]) \
                - sum([T.sum(param**2) for param in self.V])
        self.loss_supervised = -T.sum(self.label * T.log(self.pred_for_train))
        self.loss_l2 = 0.5 * self.l2 * self.regular
        self.loss = self.loss_supervised + self.loss_l2 + self.loss_innear

        logging.info('getting grads...')
        grads = T.grad(self.loss, self.params)
        self.updates = collections.OrderedDict()
        self.grad = {}
        for param, grad in zip(self.params, grads):
            g = theano.shared(np.asarray(np.zeros_like(param.get_value()), \
                    dtype=theano.config.floatX))
            self.grad[param] = g
            self.updates[g] = g + grad

        logging.info("compiling func of train...")
        self.func_train = theano.function(
                inputs = [self.label, self.data],
                outputs = [self.loss, self.loss_supervised, self.loss_l2, self.loss_innear, \
                        self.loss_common, self.loss_sentiment, \
                        self.loss_negation, self.loss_intensifier],
                updates = self.updates,
                on_unused_input='warn')

        logging.info("compiling func of test...")
        self.func_test = theano.function(
            inputs=[self.label, self.data],
            outputs=[self.loss_supervised, self.pred_for_test],
            on_unused_input='warn')
Example #26
def train_dex(
    NET,
    sgd_params,
    datasets):
    """
    Do DEX training.
    """
    initial_learning_rate = sgd_params['start_rate']
    learning_rate_decay = sgd_params['decay_rate']
    n_epochs = sgd_params['epochs']
    batch_size = sgd_params['batch_size']
    wt_norm_bound = sgd_params['wt_norm_bound']
    result_tag = sgd_params['result_tag']
    txt_file_name = "results_dex_{0}.txt".format(result_tag)
    img_file_name = "weights_dex_{0}.png".format(result_tag)

    # Get the training data and create arrays of start/end indices for
    # easy minibatch slicing
    Xtr = datasets[0][0]
    idx_range = np.arange(Xtr.get_value(borrow=True).shape[0])
    Ytr_shared = theano.shared(value=idx_range)
    Ytr = T.cast(Ytr_shared, 'int32')
    tr_samples = Xtr.get_value(borrow=True).shape[0]
    tr_batches = int(np.ceil(float(tr_samples) / batch_size))
    tr_bidx = [[i*batch_size, min(tr_samples, (i+1)*batch_size)] for i in range(tr_batches)]
    tr_bidx = T.cast(tr_bidx, 'int32')

    print "Dataset info:"
    print "  training samples: {0:d}".format(tr_samples)
    print "  samples/minibatch: {0:d}, minibatches/epoch: {1:d}".format( \
        batch_size, tr_batches)

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lvector()  # index to a [mini]batch
    epoch = T.scalar()   # epoch counter
    I = T.lvector()      # keys for the training examples
    x = NET.input        # symbolic matrix for inputs to NET
    learning_rate = theano.shared(np.asarray(initial_learning_rate,
        dtype=theano.config.floatX))
    dex_weight = T.scalar(name='dex_weight', dtype=theano.config.floatX)

    # Collect the parameters to-be-optimized
    opt_params = NET.proto_params

    # Build the expressions for the cost functions
    DL = NET.dex_layers[0]
    RL_costs = [RL.rica_cost() for RL in NET.rica_layers]

    NET_rica_cost = sum([rlc[0] for rlc in RL_costs])
    NET_rica_reg_cost = sum([rlc[2] for rlc in RL_costs])
    NET_dex_cost = DL.dex_cost(index)
    NET_reg_cost = NET.act_reg_cost
    NET_cost = NET_dex_cost + NET_reg_cost #NET_rica_cost + NET_dex_cost + NET_reg_cost
    NET_metrics = [NET_cost, NET_cost, NET_dex_cost, NET_reg_cost, \
            NET_reg_cost]

    ############################################################################
    # Prepare momentum and gradient variables, and construct the updates that  #
    # Theano will perform on the network parameters.                           #
    ############################################################################
    NET_grads = []
    for param in opt_params:
        NET_grads.append(T.grad(NET_cost, param, disconnected_inputs='warn'))

    NET_moms = []
    for param in opt_params:
        NET_moms.append(theano.shared(np.zeros( \
                param.get_value(borrow=True).shape, dtype=theano.config.floatX)))

    # Compute momentum for the current epoch
    mom = ifelse(epoch < 500,
        0.5*(1. - epoch/500.) + 0.99*(epoch/500.),
        0.99)

    # Use a "smoothed" learning rate, to ease into optimization
    gentle_rate = ifelse(epoch < 10,
        ((epoch / 10.) * learning_rate),
        learning_rate)

    # Update the step direction using a momentus update
    NET_updates = OrderedDict()
    for i in range(len(opt_params)):
        NET_updates[NET_moms[i]] = mom * NET_moms[i] + (1. - mom) * NET_grads[i]

    # ... and take a step along that direction
    for i in range(len(opt_params)):
        param = opt_params[i]
        grad_i = NET_grads[i]
        print("grad_{0:d}.owner.op: {1:s}".format(i, str(grad_i.owner.op)))
        NET_param = param - (gentle_rate * NET_updates[NET_moms[i]])
        # Clip the updated param to bound its norm (where applicable)
        if (NET.clip_params.has_key(param) and \
                (NET.clip_params[param] == 1)):
            NET_norms = T.sum(NET_param**2, axis=1, keepdims=1)
            NET_scale = T.clip(T.sqrt(wt_norm_bound / NET_norms), 0., 1.)
            NET_updates[param] = NET_param * NET_scale
        else:
            NET_updates[param] = NET_param

    # updates for dex layer
    NET_updates[DL.W] = T.inc_subtensor(T.take(DL.W, index, axis=0), \
            -gentle_rate*DL.dW)
    NET_updates[DL.b] = T.inc_subtensor(T.take(DL.b, index), \
            -gentle_rate*DL.db)

    # Compile theano functions for training.  These return the training cost
    # and update the model parameters.

    train_NET = theano.function(inputs=[epoch, index, dex_weight], \
        outputs=NET_metrics, \
        updates=NET_updates, \
        givens={ x: T.take(Xtr, index, axis=0) }, \
        on_unused_input='warn')

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    set_learning_rate = theano.function(inputs=[], outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    epoch_counter = 0
    start_time = time.clock()

    results_file = open(txt_file_name, 'wb')
    results_file.write("ensemble description: ")
    results_file.write("  **TODO: Write code for this.**\n")
    results_file.flush()

    b_index = npr.randint(0, high=tr_samples, size=(batch_size,))
    train_metrics = train_NET(0, b_index, 0.0)
    while epoch_counter < n_epochs:
        ######################################################
        # Process some number of minibatches for this epoch. #
        ######################################################
        e_time = time.clock()
        epoch_counter = epoch_counter + 1
        train_metrics = [0.0 for val in train_metrics]
        for minibatch_index in xrange(tr_batches):
            # Compute update for some joint supervised/unsupervised minibatch
            b_index = npr.randint(0, high=tr_samples, size=(batch_size,))
            dwight = 1.0 #0.0 if (epoch_counter <= 5) else 0.1
            batch_metrics = train_NET(epoch_counter, b_index, dwight)
            train_metrics = [a+b for (a, b) in zip(train_metrics, batch_metrics)]
        train_metrics = [(val / tr_batches) for val in train_metrics]

        # Update the learning rate
        new_learning_rate = set_learning_rate()

        # Report and save progress.
        print("epoch {0:d}: total={1:.4f}, rica={2:.4f}, dex={3:.4f}, reg={4:.4f}, rica_reg:{5:.4f}".format( \
                epoch_counter, train_metrics[0], train_metrics[1], train_metrics[2], \
                train_metrics[3], train_metrics[4]))
        print("--time: {0:.4f}".format((time.clock() - e_time)))
        # Save first layer weights to an image locally
        utils.visualize(NET, 0, 0, img_file_name)
Example #27
def take(a,indices,axis):
    if is_theanic(a) or is_theanic(indices):
        return tensor.take(a,indices,axis)
    else:
        return numpy.take(a,indices,axis)
Example #28
 def comparative(self, class1, class2):
     Q1 = T.take(self.model_layer_up, class1, axis=1)
     Q2 = T.take(self.model_layer_up, class2, axis=1)
     return self.for_quantity(Q1 - Q2)