def build_model(self, train_set, test_set, validation_set):
        """
        Building the model should be done prior to training. It will implement the training, testing and validation
        functions.
        This method should be called from any subsequent inheriting model.
        :param train_set: tuple (inputs, targets) used for training.
        :param test_set: tuple (inputs, targets) used for testing.
        :param validation_set: tuple (inputs, targets) used for validation, or None.
        """
        print "### BUILDING MODEL ###"

        self.train_args = {}
        self.train_args['inputs'] = OrderedDict({})
        self.train_args['outputs'] = OrderedDict({})

        self.test_args = {}
        self.test_args['inputs'] = OrderedDict({})
        self.test_args['outputs'] = OrderedDict({})

        self.validate_args = {}
        self.validate_args['inputs'] = OrderedDict({})
        self.validate_args['outputs'] = OrderedDict({})

        self.sym_index = T.iscalar('index')
        self.sym_batchsize = T.iscalar('batchsize')
        self.sym_lr = T.scalar('learningrate')
        self.batch_slice = slice(self.sym_index * self.sym_batchsize, (self.sym_index + 1) * self.sym_batchsize)

        self.sh_train_x = theano.shared(np.asarray(train_set[0], dtype=theano.config.floatX), borrow=True)
        self.sh_train_t = theano.shared(np.asarray(train_set[1], dtype=theano.config.floatX), borrow=True)
        self.sh_test_x = theano.shared(np.asarray(test_set[0], dtype=theano.config.floatX), borrow=True)
        self.sh_test_t = theano.shared(np.asarray(test_set[1], dtype=theano.config.floatX), borrow=True)
        if validation_set is not None:
            self.sh_valid_x = theano.shared(np.asarray(validation_set[0], dtype=theano.config.floatX), borrow=True)
            self.sh_valid_t = theano.shared(np.asarray(validation_set[1], dtype=theano.config.floatX), borrow=True)
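The sym_index / sym_batchsize / batch_slice idiom above is normally paired with `givens` when the train/test/validate functions are compiled. A minimal, self-contained sketch of that idiom (the toy data and the mean output are made up for illustration, not part of the model above):

import numpy as np
import theano
import theano.tensor as T

# Toy data standing in for train_set[0]; purely illustrative.
sh_x = theano.shared(np.random.randn(100, 3).astype(theano.config.floatX), borrow=True)

sym_index = T.iscalar('index')
sym_batchsize = T.iscalar('batchsize')
batch_slice = slice(sym_index * sym_batchsize, (sym_index + 1) * sym_batchsize)

x = T.matrix('x')
# The compiled function receives only the minibatch index and size;
# the actual data is substituted in through `givens`.
batch_mean = theano.function([sym_index, sym_batchsize],
                             x.mean(),
                             givens={x: sh_x[batch_slice]})
print(batch_mean(0, 10))  # mean of the first minibatch of 10 rows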
Example #2
    def getMinibatchTrainer(self, costFunction, variableToData, rms=True):
        # define params        
        lr = T.fscalar('lr')    
        start = T.iscalar('start')
        end = T.iscalar('end')

        # Get the cost and its parameters.
        params = costFunction[0]
        cost = costFunction[1]

        # Get the updates.
        updates = self.getUpdates(cost, params, lr, rms)
        # Store all state variables.
        stateManager = StateManager([u[0] for u in updates])

        # Slice the data
        givens = dict()
        for item in variableToData:
            givens[item.variable] = item.slice(start,end)
        
                
        # Define the training function.
        train_model = theano.function(inputs=[theano.Param(start, borrow = True), 
                                              theano.Param(end, borrow=True), 
                                              theano.Param(lr, borrow=True)],
                                        outputs=theano.Out(cost, borrow=True),
                                        updates=updates,
                                        givens=givens)

        
        return train_model, stateManager
Example #3
    def f_train(self, t_x, t_corrupt = 0.2, t_rate = 0.1):
        """ return training function of the following signiture:
        input:
            lower and upper indices on training data
            alternative training data
        return:
            likelihood based cost
            square distance between training data and prediction
        
        """
        x = T.matrix('x')     # pipe data through this symbol
        q = self.t_corrupt(x, t_corrupt)
        h = self.t_encode(q)
        z = self.t_decode(h)

        L = - T.sum(x * T.log(z) + (1 - x) * T.log(1 - z), axis=1)
        cost = T.mean(L)    # to be returned

        dist = T.mean(T.sqrt(T.sum((x - z) ** 2, axis = 1)))    # to be returned

        grad = T.grad(cost, self.parm)

        diff = [(p, p - t_rate * g) for p, g in zip(self.parm, grad)]

        t_fr = T.iscalar()
        t_to = T.iscalar()
        return theano.function(
            [t_fr, t_to],
            [cost, dist],
            updates = diff,
            givens = {x : t_x[t_fr:t_to]},
            name = "DA_trainer")
Example #4
def SimFnIdx(fnsim, embeddings, leftop, rightop):
    """
    This function returns a Theano function to measure the similarity score
    for a given triplet of entity indexes.

    :param fnsim: similarity function (on Theano variables).
    :param embeddings: an Embeddings instance.
    :param leftop: class for the 'left' operator.
    :param rightop: class for the 'right' operator.
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxo = T.iscalar('idxo')
    idxr = T.iscalar('idxr')
    idxl = T.iscalar('idxl')
    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))
    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))
    simi = fnsim(leftop(lhs, rell), rightop(rhs, relr))
    """
    Theano function inputs.
    :input idxl: index value of the 'left' member.
    :input idxr: index value of the 'right' member.
    :input idxo: index value of the relation member.

    Theano function output.
    :output simi: score value.
    """
    return theano.function([idxl, idxr, idxo], [simi],
            on_unused_input='ignore')
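The indexing idiom above, selecting one embedding column with an iscalar and reshaping it to a 1xD row before applying fnsim, can be exercised on its own. A small hypothetical sketch, with a negative L1 distance standing in for fnsim and made-up sizes:

import numpy as np
import theano
import theano.tensor as T

D, N = 5, 10  # embedding dimension, number of entities (illustrative only)
E = theano.shared(np.random.randn(D, N).astype(theano.config.floatX), name='E')

idxl = T.iscalar('idxl')
idxr = T.iscalar('idxr')
lhs = E[:, idxl].reshape((1, D))                 # 1 x D row for the 'left' entity
rhs = E[:, idxr].reshape((1, D))                 # 1 x D row for the 'right' entity
simi = -T.sum(T.abs_(lhs - rhs), axis=1)         # toy similarity: negative L1 distance

score = theano.function([idxl, idxr], simi)
print(score(0, 3))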
Example #5
def RankRightFnIdx_filtered(fnsim, embeddings, leftop, rightop, subtensorspec=None):
    """
    This function returns a Theano function to measure the similarity score of
    all 'right' entities given couples of relation and 'left' entities (as
    index values).
    """
    embedding, relationl, relationr = parse_embeddings(embeddings)
    # Inputs
    idxl, idxo = T.iscalar('idxl'), T.iscalar('idxo')
    rightparts = T.ivector('rightparts')
    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))              # lhs: 1xD vector containing the embedding of idxl
    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        rhs = (embedding.E[:, :subtensorspec]).T
    else:
        rhs = embedding.E.T                                             # rhs: NxD embedding matrix

    rhs = rhs[rightparts, :]                                            # select the right parts not appearing
                                                                        # in the train/valid/test sets

    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))             # rell: 1xD vector containing the embedding of idxo (relationl)
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))             # relr: 1xD vector containing the embedding of idxo (relationr)

    tmp = leftop(lhs, rell)                                             # a = rell(lhs)
                                                                        # b = relr(rhs)

    simi = fnsim(tmp.reshape((1, tmp.shape[1])), rightop(rhs, relr))    # simi = fnsim(a, b)
    return theano.function([idxl, idxo, rightparts], [simi], on_unused_input='ignore')
Example #6
def RankRightFnIdx_Schema(fnsim, embeddings, prior, leftop, rightop, subtensorspec=None):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxl, idxo = T.iscalar('idxl'), T.iscalar('idxo')
    g = T.matrix('g')

    # Graph
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))              # lhs: 1xD vector containing the embedding of idxl

    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        rhs = (embedding.E[:, :subtensorspec]).T
    else:
        rhs = embedding.E.T                                             # rhs: NxD embedding matrix

    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))             # rell: 1xD vector containing the embedding of idxo (relationl)
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))             # relr: 1xD vector containing the embedding of idxo (relationr)

    tmp = leftop(lhs, rell)                                             # a = rell(lhs)
                                                                        # b = relr(rhs)

    # Negative Energy
    simi = fnsim(tmp.reshape((1, tmp.shape[1])), rightop(rhs, relr))    # simi = fnsim(a, b)

    pen_simi = g[0, :].T * prior.P[idxo, 0].T + g[1, :].T * prior.P[idxo, 1].T
    simi = simi - pen_simi

    return theano.function([idxl, idxo, g], [simi], on_unused_input='ignore')
Example #7
def build_model(shared_params, options, other_params):
    """
    Build the complete neural network model and return the symbolic variables
    """
    # symbolic variables
    x = tensor.matrix(name="x", dtype=floatX)
    y1 = tensor.iscalar(name="y1")
    y2 = tensor.iscalar(name="y2")

    # lstm cell
    (ht, ct) = lstm_cell(x, shared_params, options, other_params)  # gets the ht, ct
    # softmax 1 i.e. frame type prediction
    activation = tensor.dot(shared_params['softmax1_W'], ht).transpose() + shared_params['softmax1_b']
    frame_pred = tensor.nnet.softmax(activation) # .transpose()

    # softmax 2 i.e. gesture class prediction
    #

    # predicted probability for frame type
    f_pred_prob = theano.function([x], frame_pred, name="f_pred_prob")
    # predicted frame type
    f_pred = theano.function([x], frame_pred.argmax(), name="f_pred")

    # cost
    cost = ifelse(tensor.eq(y1, 1), -tensor.log(frame_pred[0, 0] + options['log_offset'])
                  * other_params['begin_cost_factor'],
                  ifelse(tensor.eq(y1, 2), -tensor.log(frame_pred[0, 1] + options['log_offset'])
                         * other_params['end_cost_factor'],
                         ifelse(tensor.eq(y1, 3), -tensor.log(frame_pred[0, 2] + options['log_offset']),
                                tensor.abs_(tensor.log(y1)))), name='ifelse_cost')

    # function for output of the current lstm cell and softmax prediction
    f_model_cell_output = theano.function([x], (ht, ct, frame_pred), name="f_model_cell_output")
    # return the model symbolic variables and theano functions
    return x, y1, y2, f_pred_prob, f_pred, cost, f_model_cell_output
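The nested cost above uses Theano's lazy `ifelse` (presumably imported from theano.ifelse); a tiny self-contained sketch of that construct with made-up branch costs, where only the selected branch is evaluated at run time:

import theano
import theano.tensor as tensor
from theano.ifelse import ifelse

y1 = tensor.iscalar('y1')
p = tensor.dscalar('p')  # a stand-in for one predicted probability
cost = ifelse(tensor.eq(y1, 1), -tensor.log(p) * 2.0,
              ifelse(tensor.eq(y1, 2), -tensor.log(p),
                     tensor.abs_(tensor.log(y1))), name='ifelse_cost')
f = theano.function([y1, p], cost)
print(f(1, 0.5))  # picks the first branch: -log(0.5) * 2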
Example #8
def RankLeftFnIdx_Schema(fnsim, embeddings, prior, leftop, rightop, subtensorspec=None):
    embedding, relationl, relationr = parse_embeddings(embeddings)

    # Inputs
    idxr, idxo = T.iscalar('idxr'), T.iscalar('idxo')
    g = T.matrix('g')

    # Graph
    if subtensorspec is not None:
        # We compute the score only for a subset of entities
        lhs = (embedding.E[:, :subtensorspec]).T
    else:
        lhs = embedding.E.T

    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))

    tmp = rightop(rhs, relr)

    simi = fnsim(leftop(lhs, rell), tmp.reshape((1, tmp.shape[1])))

    pen_simi = g[0, :].T * prior.P[idxo, 0].T + g[1, :].T * prior.P[idxo, 1].T
    simi = simi - pen_simi

    return theano.function([idxr, idxo, g], [simi], on_unused_input='ignore')
Example #9
 def getTrainer(self,lossType="NLL"):
   '''
   return a function to do MBSGD on (trainX,trainY)
   '''
   trainY = T.ivector('y')
   alpha = T.dscalar('a')
   lowIdx = T.iscalar()
   highIdx = T.iscalar()
   trainX = T.matrix()
   if lossType=="aNLL":
     loss = self.aNLL(trainY)
   elif lossType=='MSE':
     loss = self.MSE(trainY)
   else:
     loss = self.NLL(trainY)
   dW = T.grad(cost = loss, wrt = self.W)
   db = T.grad(cost = loss, wrt = self.b)
   updates = [(self.W,self.W - alpha * dW), (self.b,self.b - alpha * db)]
   trainer = theano.function(
       inputs = [trainX,trainY,alpha],
       outputs = loss,
       updates=updates,
       givens = {
         self.input : trainX,
       },
       allow_input_downcast=True
     )
   return trainer
Example #10
    def __init__(self, in_size, out_size, dim_y, dim_pos, hidden_size_encoder, hidden_size_decoder, cell = "gru", optimizer = "rmsprop", p = 0.5, num_sents = 1):

        self.X = T.matrix("X")
        self.Y_y = T.matrix("Y_y")
        self.Y_pos = T.matrix("Y_pos")
        self.in_size = in_size
        self.out_size = out_size
        self.dim_y = dim_y
        self.dim_pos = dim_pos
        self.hidden_size_encoder = hidden_size_encoder
        self.hidden_size_decoder = hidden_size_decoder
        self.cell = cell
        self.drop_rate = p
        self.num_sents = num_sents
        self.is_train = T.iscalar('is_train') # for dropout
        self.batch_size = T.iscalar('batch_size') # for mini-batch training
        self.mask = T.matrix("mask")
        self.mask_y = T.matrix("mask_y")
        self.optimizer = optimizer
        print "seq2seq out size ", self.out_size
        
        if self.out_size == self.dim_y + self.dim_pos:
            print "size right !"
        self.define_layers()
        self.define_train_test_funcs()
Example #11
def multMatVect(v, A, m1, B, m2):
    # TODO : need description for parameter and return
    """
    Multiply the first half of v by A with a modulo of m1 and the second half
    by B with a modulo of m2.

    Notes
    -----
    The parameters of dot_modulo are passed implicitly because passing them
    explicitly takes more time than running the function's C-code.

    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix('A')
        s_sym = tensor.ivector('s')
        m_sym = tensor.iscalar('m')
        A2_sym = tensor.lmatrix('A2')
        s2_sym = tensor.ivector('s2')
        m2_sym = tensor.iscalar('m2')
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function(
            [A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o, profile=False)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]

    return r
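The storage-poking trick above is not specific to DotModulo: any compiled Theano function exposes input_storage, output_storage and fn. A small generic sketch comparing the normal call with the low-overhead one:

import numpy as np
import theano
from theano import tensor

a = tensor.iscalar('a')
b = tensor.iscalar('b')
f = theano.function([a, b], a + b)

print(f(2, 3))                                # normal call, with input validation

f.input_storage[0].storage[0] = np.int32(2)   # write inputs directly ...
f.input_storage[1].storage[0] = np.int32(3)
f.fn()                                        # ... and run the compiled thunk
print(f.output_storage[0].storage[0])         # same result, less overhead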
Example #12
    def test_compute_lnZ(self):
        v = T.matrix('v')
        z = T.iscalar('z')

        V = cartesian([(0, 1)] * self.input_size, dtype=config.floatX)
        #H = cartesian([(0, 1)] * self.hidden_size, dtype=config.floatX)

        # We simulate having an infinite number of hidden units by adding a lot of hidden units with parameters set to 0.
        nb_hidden_units_to_add = 10000
        model = iRBM(input_size=self.model.input_size,
                     hidden_size=self.model.hidden_size + nb_hidden_units_to_add,
                     beta=self.model.beta.get_value())

        model.W.set_value(np.r_[self.model.W.get_value(), np.zeros((nb_hidden_units_to_add, model.input_size), dtype=theano.config.floatX)])
        model.b.set_value(np.r_[self.model.b.get_value(), np.zeros((nb_hidden_units_to_add,), dtype=theano.config.floatX)])
        model.c.set_value(self.model.c.get_value())

        v = T.matrix('v')
        z = T.iscalar('z')
        F_vz = theano.function([v, z], model.F(v, z))

        energies = []
        for z in range(1, model.hidden_size+1):
            energies.append(F_vz(V, z))

        lnZ = logsumexp(-np.array(energies)).eval()

        lnZ_using_free_energy = theano.function([v], logsumexp(-self.model.free_energy(v)))
        assert_almost_equal(lnZ_using_free_energy(V), lnZ, decimal=5)  # decimal=5 needed for float32
Example #13
 def compile_functions(self, opt, **args):
     print '... compiling training functions'
     
     gen_cost, gen_show_cost, dis_cost, cost_pfake, cost_ptrue = self.get_cost()
        
     self.opt = opt
     gen_updates = self.opt.get_updates(gen_cost, self.gen_params)
     dis_updates = self.opt.get_updates(dis_cost, self.dis_params)
        
     self.get_noise = theano.function([],
                                      self.theano_rng.uniform(size=(self.batch_size, self.num_z), 
                                             low=-1, high=1)
                                      ) 
     start_index = T.iscalar('start_index')
     end_index = T.iscalar('end_index')
     
     if self.uint8_data:
         given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32')
     else:
         given_train_x = self.shared_train[start_index:end_index]
         
     self.train_gen_model = theano.function(
         [self.z],
         gen_show_cost,
         updates=gen_updates,
         )
     
     self.train_dis_model = theano.function(
         [start_index, end_index, self.z],
         [cost_pfake, cost_ptrue],
         updates=dis_updates,
         givens={self.x: given_train_x}
         )
Example #14
    def compile(self, model):
        assert isinstance(model, Model)
        self.model = model

        dataset = self.dataset
        X, Y = dataset.preproc(dataset.X, dataset.Y)
        self.X = theano.shared(X, "X")
        self.Y = theano.shared(Y, "Y")

        self.logger.info("compiling do_loglikelihood")
        n_samples = T.iscalar("n_samples")
        batch_idx = T.iscalar("batch_idx")
        batch_size = T.iscalar("batch_size")

        first = batch_idx * batch_size
        last = first + batch_size
        X_batch, Y_batch = dataset.late_preproc(self.X[first:last], self.Y[first:last])

        log_PX, _, _, _, KL, Hp, Hq = model.log_likelihood(X_batch, n_samples=n_samples)
        batch_L = T.sum(log_PX)
        batch_L2 = T.sum(log_PX ** 2)
        batch_KL = [T.sum(kl) for kl in KL]
        batch_Hp = [T.sum(hp) for hp in Hp]
        batch_Hq = [T.sum(hq) for hq in Hq]

        self.do_loglikelihood = theano.function(
            inputs=[batch_idx, batch_size, n_samples],
            outputs=[batch_L, batch_L2] + batch_KL + batch_Hp + batch_Hq,
            name="do_likelihood",
        )
Example #15
def compile_bn(data_set, model, make_updates):
    """
    Put the data into theano shared variables, build the computational graph
    using the model and the update rule, and compile it.
    
    Parameters
    -----------
    data_set : list of numpy.ndarray
        feature_vec : ndarray
            (n_pixels, D, n_tensors)
        gt_vec : ndarray
            (n_pixels, D)
        test_feature_vec, test_gt_vec
    model : e.g. models.Rcn1layer
    make_updates : e.g. optimizers.SGD
    
    """

    s_input, s_target, s_test_input, s_test_target = share_data_sets(*data_set)
        
    nn, obj, train_mse, model_updates, model_param_l = model.make_graph_train()
    test_mse, test_out = model.make_graph_test()
    
    updates, opt_param_list = make_updates(loss=obj, param_list=nn.param_l)

    i_batch = T.iscalar("i_batch")
    index_list = T.ivector("index_list")
    batch_size = T.iscalar("batch_size")
    
    od = OrderedDict()
    for k, e in updates.items() + model_updates.items():
        od[k] = e

    f_train = theano.function(
        inputs=[i_batch, index_list, batch_size]+opt_param_list+model_param_l,
        updates=od,
        givens=[(nn.x_t3, s_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])],
        on_unused_input='warn')
                
    f_training_error = theano.function(
        inputs=[i_batch, index_list, batch_size]+model_param_l,
        outputs=[train_mse],
        givens=[(nn.x_t3, s_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])],
        on_unused_input='warn')
                
    f_test_error = theano.function(
        inputs=[i_batch, index_list, batch_size],
        outputs=[test_mse],
        givens=[(nn.x_t3, s_test_input[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]]),
                (nn.t_mat, s_test_target[index_list[i_batch*batch_size: i_batch*batch_size + batch_size]])])
                
    f_output = theano.function(
        inputs=[nn.x_t3],
        outputs=[test_out])

    result = [f_train, f_training_error, f_test_error, f_output, s_input,
              s_target, s_test_input, s_test_target, nn.param_l]
    return result
Example #16
def multMatVect(v, A, m1, B, m2):
    """
    multiply the first half of v by A with a modulo of m1
    and the second half by B with a modulo of m2

    Note: The parameters of dot_modulo are passed implicitly because passing
    them explicitly takes more time than running the function's C-code.
    """
    if multMatVect.dot_modulo is None:
        A_sym = tensor.lmatrix("A")
        s_sym = tensor.ivector("s")
        m_sym = tensor.iscalar("m")
        A2_sym = tensor.lmatrix("A2")
        s2_sym = tensor.ivector("s2")
        m2_sym = tensor.iscalar("m2")
        o = DotModulo()(A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym)
        multMatVect.dot_modulo = function([A_sym, s_sym, m_sym, A2_sym, s2_sym, m2_sym], o)

    # This way of calling the Theano fct is done to bypass Theano overhead.
    f = multMatVect.dot_modulo
    f.input_storage[0].storage[0] = A
    f.input_storage[1].storage[0] = v[:3]
    f.input_storage[2].storage[0] = m1
    f.input_storage[3].storage[0] = B
    f.input_storage[4].storage[0] = v[3:]
    f.input_storage[5].storage[0] = m2
    f.fn()
    r = f.output_storage[0].storage[0]

    return r
Example #17
    def compile(self, log_pxz, log_qpz, cost, a_pxz):
        batch_idx = T.iscalar()
        learning_rate = T.fscalar()

        updates, norm_grad = self.hp.optimizer(cost, self.params.values(), lr=learning_rate)

        self.outidx = {'cost':0, 'cost_p':1, 'cost_q':2, 'norm_grad':3}
        outputs = [cost, log_pxz, log_qpz]

        self.train = theano.function(inputs=[batch_idx, learning_rate], 
                                     givens={self.X:self.data['tr_X'][batch_idx * self.hp.batch_size : 
                                                                      (batch_idx+1) * self.hp.batch_size]},
                                     outputs=outputs + [norm_grad], updates=updates)
        
        self.validate = theano.function(inputs=[batch_idx], 
                                        givens={self.X:self.data['tr_X'][batch_idx * self.hp.test_batch_size : 
                                                                      (batch_idx+1) * self.hp.test_batch_size]},
                                        outputs=outputs)
        
        self.test = theano.function(inputs=[batch_idx], 
                                    givens={self.X:self.data['te_X'][batch_idx * self.hp.test_batch_size : 
                                                                      (batch_idx+1) * self.hp.test_batch_size]},
                                    outputs=outputs)
        
        n_samples = T.iscalar()

        if self.resample_z:
            self.data['ge_Z'] = srnd.normal((self.max_gen_samples, self.n_z), dtype=theano.config.floatX)
        else:
            self.data['ge_Z'] = shared(np.random.randn(self.max_gen_samples, self.n_z))

        self.decode = theano.function(inputs=[n_samples], 
                                      givens={self.Z:self.data['ge_Z'][:n_samples]}, 
                                      outputs=a_pxz)
Example #18
def init_nnet(W, n_classes, vec_dim):
    """Initialize neural network.

    Args:
      W (theano.shared): embedding matrix
      n_classes: number of classes to be predicted
      vec_dim: dimensionality of the embeddings

    """
    w_idx = TT.iscalar(name="w_idx")
    y_gold = TT.iscalar(name="y_gold")
    embs = W[w_idx]
    Theta = theano.shared(value=ORTHOGONAL.sample((n_classes, vec_dim)),
                          name="Theta")
    beta = theano.shared(value=HE_UNIFORM.sample((1, n_classes)), name="beta")
    y_probs = TT.nnet.softmax(TT.dot(Theta, embs.T).flatten() + beta).flatten()
    params = [Theta]
    cost = -TT.mean(TT.log(y_probs[y_gold]))
    updates = sgd_updates_adadelta(params, cost)
    train = theano.function([w_idx, y_gold], cost, updates=updates)
    y_pred = TT.argmax(y_probs)
    y_score = y_probs[y_pred]
    predict = theano.function([w_idx], (y_pred, y_score))
    acc = TT.eq(y_gold, y_pred)
    validate = theano.function([w_idx, y_gold], acc)
    return (train, validate, predict, params)
Example #19
def test_multMatVect():
    A1 = tensor.lmatrix('A1')
    s1 = tensor.ivector('s1')
    m1 = tensor.iscalar('m1')
    A2 = tensor.lmatrix('A2')
    s2 = tensor.ivector('s2')
    m2 = tensor.iscalar('m2')

    g0 = rng_mrg.DotModulo()(A1, s1, m1, A2, s2, m2)
    f0 = theano.function([A1, s1, m1, A2, s2, m2], g0)

    i32max = numpy.iinfo(numpy.int32).max

    A1 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s1 = numpy.random.randint(0, i32max, 3).astype('int32')
    m1 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")
    A2 = numpy.random.randint(0, i32max, (3, 3)).astype('int64')
    s2 = numpy.random.randint(0, i32max, 3).astype('int32')
    m2 = numpy.asarray(numpy.random.randint(i32max), dtype="int32")

    f0.input_storage[0].storage[0] = A1
    f0.input_storage[1].storage[0] = s1
    f0.input_storage[2].storage[0] = m1
    f0.input_storage[3].storage[0] = A2
    f0.input_storage[4].storage[0] = s2
    f0.input_storage[5].storage[0] = m2

    r_a1 = rng_mrg.matVecModM(A1, s1, m1)
    r_a2 = rng_mrg.matVecModM(A2, s2, m2)
    f0.fn()
    r_b = f0.output_storage[0].value

    assert numpy.allclose(r_a1, r_b[:3])
    assert numpy.allclose(r_a2, r_b[3:])
Example #20
def test_clip_grad_int():

    # test that integers don't crash clip gradient
    x = tensor.iscalar()
    y = tensor.iscalar()
    z = tensor.iscalar()
    c = tensor.clip(x, y, z)
    tensor.grad(c, [x, y, z])
Example #21
    def __init__(self, rng, state):
        Model.__init__(self)    
        self.state = state
         
        # Compatibility towards older models
        self.__dict__.update(state)
        self.rng = rng 

        # Load dictionary 
        raw_dict = cPickle.load(open(self.dictionary, 'r'))
        
        # Mappings between each token in the corpus and its id
        self.str_to_idx = dict([(tok, tok_id) for tok, tok_id, _ in raw_dict])
        self.idx_to_str = dict([(tok_id, tok) for tok, tok_id, freq in raw_dict])

        # if '<s>' not in self.str_to_idx \
        #   or '</s>' not in self.str_to_idx:
        #        raise Exception("Error, malformed dictionary!")
         
        # Number of words in the dictionary 
        self.idim = len(self.str_to_idx)
        self.state['idim'] = self.idim
 
        logger.debug("Initializing language model")
        self.language_model = LanguageModel(self.state, self.rng, self)
        
        # Init params
        self.params = self.language_model.params 
        
        self.x_data = T.imatrix('x_data')
        self.x_cost_mask = T.matrix('cost_mask')
        self.x_max_length = T.iscalar('x_max_length')
        
        # The training is done with a trick. We append a special </q> at the beginning of the session
        # so that we can also predict the first query in the session, starting from the session beginning token (</q>).
        self.aug_x_data = T.concatenate([T.alloc(np.int32(self.eos_sym), 1, self.x_data.shape[1]), self.x_data])
        self.training_x = self.aug_x_data[:self.x_max_length]
        self.training_y = self.aug_x_data[1:self.x_max_length+1]
        self.training_x_cost_mask = self.x_cost_mask[:self.x_max_length].flatten()
         
        target_probs, self.eval_h = self.language_model.build_lm(self.training_x, 
                                                    y=self.training_y, 
                                                    mode=LanguageModel.EVALUATION)

        # Prediction cost
        self.prediction_cost = T.sum(-T.log(target_probs) * self.training_x_cost_mask) 
        
        # Sampling variables
        self.n_samples = T.iscalar("n_samples")
        self.n_steps = T.iscalar("n_steps")
        (self.sample, self.sample_log_prob), self.sampling_updates \
                = self.language_model.build_sampler(self.n_samples, self.n_steps) 

        # Beam-search variables
        self.beam_source = T.lvector("beam_source")
        self.beam_h = T.matrix("beam_h")
        self.beam_step_num = T.lscalar("beam_step_num")
Example #22
def test():
    a = T.iscalar()
    b = T.iscalar()
    c = T.iscalar()
    f1 = theano.function([a , b] , lt(a , b))
    f2 = theano.function([a] , lt2(a) , on_unused_input='ignore')
    print f1(1 , 2)
    print f2(-2)
    print f2(3)
Example #23
def SimilarityFunction(fnsim, embeddings, leftop, rightop):
    idxrel = T.iscalar("idxrel")
    idxright = T.iscalar("idxright")
    idxleft = T.iscalar("idxleft")
    lhs = (embeddings.E[:, idxleft]).reshape((1, embeddings.D))
    rhs = (embeddings.E[:, idxright]).reshape((1, embeddings.D))
    rel = (embeddings.E[:, idxrel]).reshape((1, embeddings.D))
    simi = fnsim(leftop(lhs, rel), rightop(rhs, rel))
    return theano.function([idxleft, idxright, idxrel], [simi])
Example #24
def EnergyFn(fnsim, embeddings, leftop, rightop):
    embedding, relationl, relationr = parse_embeddings(embeddings)
    idxl, idxo, idxr = T.iscalar('idxl'), T.iscalar('idxo'), T.iscalar('idxr')
    lhs = (embedding.E[:, idxl]).reshape((1, embedding.D))
    rhs = (embedding.E[:, idxr]).reshape((1, embedding.D))
    rell = (relationl.E[:, idxo]).reshape((1, relationl.D))
    relr = (relationr.E[:, idxo]).reshape((1, relationr.D))
    energy = - fnsim(leftop(lhs, rell), rightop(rhs, relr))
    return theano.function([idxl, idxr, idxo], [energy], on_unused_input='ignore')
Example #25
 def compile_functions(self, opt, **args):
     print '... compiling training functions'
     
     # propagate for training with batch normalization, with updated std and mean for each batch
     self.layer_outputs = self.network_fprop()
     cost, show_cost = self.get_cost()
     self.opt = opt
     updates = self.opt.get_updates(cost, self.params)
     
     # propagate again for validation with fixed mean and std for batch normalization
     self.layer_outputs = self.network_fprop(isTest=True, noiseless=True)
     self.final_output = self.layer_outputs[self.network_structure[-1]['name']]
     errors = self.get_errors()
     
     start_index = T.iscalar('start_index')
     end_index = T.iscalar('end_index')
     
     train_given = {}
     print 'number of training inputs = ', self.ninputs
     for i in xrange(self.ninputs):
         if self.uint8_data:
             train_given[self.xs[i]] = T.cast(self.shared_train[i][start_index:end_index], dtype='float32')
         else:
             train_given[self.xs[i]] = self.shared_train[i][start_index:end_index]
             
         if self.batch_mean_subtraction:
             assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
             assert len(self.train_mean) == self.ninputs, 'train_mean needs to have the same number of entries as the number of inputs'
             train_given[self.xs[i]] -= self.train_mean[i]
         
     train_given[self.y] = self.shared_train_labels[start_index:end_index]
     
     self.train_model = theano.function( inputs=[start_index, end_index], 
                                         outputs=[show_cost, errors], updates = updates,
                                         givens = train_given
                                        )
     
     if hasattr(self, 'shared_valid'):
         valid_given = {}
         for i in xrange(self.ninputs):
             if self.uint8_data:
                 valid_given[self.xs[i]] = T.cast(self.shared_valid[i][start_index:end_index], dtype='float32')
             else:
                 valid_given[self.xs[i]] = self.shared_valid[i][start_index:end_index]
             
             if self.batch_mean_subtraction:
                 assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
                 assert len(self.train_mean) == self.ninputs, 'train_mean needs to have the same number of entries as the number of inputs'
                 valid_given[self.xs[i]] -= self.train_mean[i]
             
         valid_given[self.y] = self.shared_valid_labels[start_index:end_index]
         
         self.validate_model = theano.function( inputs=[start_index,end_index], 
                                                outputs=errors,
                                                 givens = valid_given
                                               )
Example #26
def get_train(U_Ot, U_R, lenW, n_facts):
    def phi_x1(x_t, L):
        return T.concatenate([L[x_t].reshape((-1,)), zeros((2*lenW,)), zeros((3,))], axis=0)
    def phi_x2(x_t, L):
        return T.concatenate([zeros((lenW,)), L[x_t].reshape((-1,)), zeros((lenW,)), zeros((3,))], axis=0)
    def phi_y(x_t, L):
        return T.concatenate([zeros((2*lenW,)), L[x_t].reshape((-1,)), zeros((3,))], axis=0)
    def phi_t(x_t, y_t, yp_t, L):
        return T.concatenate([zeros(3*lenW,), T.stack(T.switch(T.lt(x_t,y_t), 1, 0), T.switch(T.lt(x_t,yp_t), 1, 0), T.switch(T.lt(y_t,yp_t), 1, 0))], axis=0)
    def s_Ot(xs, y_t, yp_t, L):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_Ot.T),
                           T.dot(U_Ot, (phi_y(y_t, L) - phi_y(yp_t, L) + phi_t(x_t, y_t, yp_t, L)))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()
    def sR(xs, y_t, L, V):
        result, updates = theano.scan(
            lambda x_t, t: T.dot(T.dot(T.switch(T.eq(t, 0), phi_x1(x_t, L).reshape((1,-1)), phi_x2(x_t, L).reshape((1,-1))), U_R.T),
                                 T.dot(U_R, phi_y(y_t, V))),
            sequences=[xs, T.arange(T.shape(xs)[0])])
        return result.sum()

    x_t = T.iscalar('x_t')
    m = [x_t] + [T.iscalar('m_o%d' % i) for i in xrange(n_facts)]
    f = [T.iscalar('f%d_t' % i) for i in xrange(n_facts)]
    r_t = T.iscalar('r_t')
    gamma = T.scalar('gamma')
    L = T.fmatrix('L') # list of messages
    V = T.fmatrix('V') # vocab
    r_args = T.stack(*m)

    cost_arr = [0] * 2 * (len(m)-1)
    updates_arr = [0] * 2 * (len(m)-1)
    for i in xrange(len(m)-1):
        cost_arr[2*i], updates_arr[2*i] = theano.scan(
                lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma - s_Ot(T.stack(*m[:i+1]), f[i], t, L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])
        cost_arr[2*i+1], updates_arr[2*i+1] = theano.scan(
                lambda f_bar, t: T.switch(T.or_(T.eq(t, f[i]), T.eq(t, T.shape(L)-1)), 0, T.largest(gamma + s_Ot(T.stack(*m[:i+1]), t, f[i], L), 0)),
            sequences=[L, T.arange(T.shape(L)[0])])

    cost1, u1 = theano.scan(
        lambda r_bar, t: T.switch(T.eq(r_t, t), 0, T.largest(gamma - sR(r_args, r_t, L, V) + sR(r_args, t, L, V), 0)),
        sequences=[V, T.arange(T.shape(V)[0])])

    cost = cost1.sum()
    for c in cost_arr:
        cost += c.sum()

    g_uo, g_ur = T.grad(cost, [U_Ot, U_R])

    train = theano.function(
        inputs=[r_t, gamma, L, V] + m + f,
        outputs=[cost],
        updates=[(U_Ot, U_Ot-alpha*g_uo), (U_R, U_R-alpha*g_ur)])
    return train
Example #27
def test14():
    x = T.iscalar('x')
    y = T.iscalar('y')
    z = T.arange(x)
    z = T.shape_padaxis(z, axis=1)
    z2 = T.zeros((x,y))
    z2 = z + z2
    fn = theano.function(inputs=[x,y],outputs=[z2],allow_input_downcast=True)
    res = fn(3,4)
    print res, res[0].shape
Example #28
    def create_gradientfunctions(self, x_train):
        """This function takes as input the whole dataset and creates the entire model"""
        x = T.matrix("x")

        epoch = T.iscalar("epoch")

        batch_size = x.shape[0]

        alpha, beta = self.encoder(x)
        z = self.sampler(alpha, beta)
        reconstructed_x, logpxz = self.decoder(x,z)

        # Expectation of (logpz - logqz_x) over logqz_x is equal to KLD (see appendix B):
        # KLD = 0.5 * T.sum(1 + beta - alpha**2 - T.exp(beta), axis=1, keepdims=True)

        #KLD = 0.5 * T.sum(1 + beta - (alpha**2 + T.exp(beta)) / (2*(self.prior_noise_level**2)) , axis=1, keepdims=True)

        # KLD = cross-entropy of the sample distribution of sigmoid(z) from the beta distribution
        alpha_prior = 1.0/self.prior_noise_level
        beta_prior = 1.0/self.prior_noise_level
        # sigmoidZ = T.nnet.sigmoid(z)
        # KLD = 25*T.sum((alpha_prior-1)*sigmoidZ + (beta-1)*(1-sigmoidZ) - betaln(alpha_prior,beta), axis=1, keepdims=True)
        # KLD = 0

        KLD = -(betaln(alpha, beta) - betaln(alpha_prior, beta_prior) \
         + (alpha_prior - alpha)*T.psi(alpha_prior) + (beta_prior - beta)*T.psi(beta_prior) \
         + (alpha - alpha_prior + beta - beta_prior)*T.psi(alpha_prior+beta_prior))

        # Average over batch dimension
        logpx = T.mean(logpxz + KLD)
 
        rmse_val = rmse_score(x, reconstructed_x)

        # Compute all the gradients
        gradients = T.grad(logpx, self.params.values())

        # Adam implemented as updates
        updates = self.get_adam_updates(gradients, epoch)

        batch = T.iscalar('batch')

        givens = {
            x: x_train[batch*self.batch_size:(batch+1)*self.batch_size, :]
        }

        # Define a bunch of functions for convenience
        update = theano.function([batch, epoch], logpx, updates=updates, givens=givens)
        likelihood = theano.function([x], logpx)
        eval_rmse = theano.function([x], rmse_val)
        encode = theano.function([x], z)
        decode = theano.function([z], reconstructed_x)
        encode_alpha = theano.function([x], alpha)
        encode_beta = theano.function([x], beta)

        return update, likelihood, encode, decode, encode_alpha, encode_beta, eval_rmse
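For reference, the KLD expression above matches, up to sign and the choice of which distribution plays the prior, the standard closed form for the KL divergence between two Beta distributions (psi is the digamma function, B the Beta function):

\mathrm{KL}\left(\mathrm{Beta}(\alpha_1,\beta_1)\,\|\,\mathrm{Beta}(\alpha_2,\beta_2)\right)
    = \ln B(\alpha_2,\beta_2) - \ln B(\alpha_1,\beta_1)
    + (\alpha_1-\alpha_2)\,\psi(\alpha_1) + (\beta_1-\beta_2)\,\psi(\beta_1)
    + (\alpha_2-\alpha_1+\beta_2-\beta_1)\,\psi(\alpha_1+\beta_1)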
Example #29
    def make_theano_evaluator(use_log):
        """This returns a function(!) that calculates the gradient and cost. Heh."""
        X = T.dmatrix('X')
        triplets = T.imatrix('triplets')
        alpha = T.dscalar('alpha')
        lamb = T.dscalar('lambda')
        no_dims = T.iscalar('no_dims')
        N = T.iscalar('N')
        triplets_A = triplets[:,0]
        triplets_B = triplets[:,1]
        triplets_C = triplets[:,2]

        # Compute Student-t kernel. Look familiar?
        sum_X = T.sum(X**2, axis=1)
        a = -2 * (X.dot(X.T))
        b = a + sum_X[np.newaxis,:] + sum_X[:,np.newaxis]
        K = (1 + b / alpha) ** ((alpha+1)/-2)

        # Compute value of cost function
        P = K[triplets_A,triplets_B] / (
            K[triplets_A,triplets_B] +
            K[triplets_A,triplets_C])
        if use_log:
            C = -T.sum(T.log(P)) + lamb * T.sum(X**2)
        else:
            C = -T.sum(P)        + lamb * T.sum(X**2)

        # Compute gradient, for each dimension
        const = (alpha+1) / alpha

        dim = T.iscalar('dim')
        def each_dim(dim):
            if use_log:
                A_to_B =   (1 - P) * K[triplets_A,triplets_B] * (X[triplets_A][:,dim] - X[triplets_B][:,dim])
                B_to_C =   (1 - P) * K[triplets_A,triplets_C] * (X[triplets_A][:,dim] - X[triplets_C][:,dim])
            else:
                A_to_B = P*(1 - P) * K[triplets_A,triplets_B] * (X[triplets_A][:,dim] - X[triplets_B][:,dim])
                B_to_C = P*(1 - P) * K[triplets_A,triplets_C] * (X[triplets_A][:,dim] - X[triplets_C][:,dim])
            this_dim = (-const * T.stack(A_to_B - B_to_C, -A_to_B, B_to_C)).T

            dC = T.extra_ops.bincount(triplets.ravel(),
                                      weights=this_dim.ravel(),
                                      # minlength=N
                                      )
            return -dC + 2*lamb*X[:,dim]

        # loop across all dimensions... theano loops are weird, yes...
        all_dims = (t.scan(each_dim,
                           # non_sequences=N,
                            sequences=T.arange(no_dims))
                   )[0].T

        return t.function([X,N,no_dims,triplets,lamb,alpha],
                          [C, all_dims],
                          on_unused_input='ignore')
Example #30
 def compile_functions(self, opt, **args):
     print '... compiling training functions'
     
     # propagate for training with batch normalization, with updated std and mean for each batch
     self.layer_outputs = self.network_fprop(self.layers, self.x, self.y)
     cost, show_cost = self.get_cost()
     self.opt = opt
     updates = self.opt.get_updates(cost, self.params)
     
     # propagate again for validation with fixed mean and std for batch normalization
     self.layer_outputs = self.network_fprop(self.layers, self.x, self.y, isTest=True, noiseless=True)
     self.final_output = self.layer_outputs[self.network_structure[-1]['name']]
     errors = self.get_errors()
     
     start_index = T.iscalar('start_index')
     end_index = T.iscalar('end_index')
     
     train_given = {}
     if self.uint8_data:
         given_train_x = T.cast(self.shared_train[start_index:end_index], dtype='float32')
     else:
         given_train_x = self.shared_train[start_index:end_index]
         
     if self.batch_mean_subtraction:
         assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
         given_train_x -= self.train_mean
     
     train_given[self.x] = given_train_x
     train_given[self.y] = self.shared_train_labels[start_index:end_index]
     
     self.train_model = theano.function( inputs=[start_index,end_index], 
                                         outputs=show_cost, updates = updates,
                                         givens = train_given
                                        )
     
     
     if hasattr(self, 'shared_valid'):
         valid_given = {}
         if self.uint8_data:
             given_valid_x = T.cast(self.shared_valid[start_index:end_index], dtype='float32')
         else:
             given_valid_x = self.shared_valid[start_index:end_index]
             
         if self.batch_mean_subtraction:
             assert self.train_mean is not None, 'train_mean cannot be None for batch mean subtraction'
             given_valid_x -= self.train_mean
             
         valid_given[self.x] = given_valid_x
             
         valid_given[self.y] = self.shared_valid_labels[start_index:end_index]
         
         self.validate_model = theano.function( inputs=[start_index, end_index], 
                                                outputs=errors,
                                                 givens = valid_given
                                               )
Example #31
def create_iter_functions(dataset,
                          output_layer,
                          X_tensor_type=T.matrix,
                          batch_size=BATCH_SIZE,
                          learning_rate=LEARNING_RATE,
                          momentum=MOMENTUM):
    """Create functions for training, validation and testing to iterate one
       epoch.
    """
    batch_index = T.iscalar('batch_index')
    X_batch = X_tensor_type('x')
    y_batch = T.ivector('y')
    batch_slice = slice(batch_index * batch_size,
                        (batch_index + 1) * batch_size)

    objective = lasagne.objectives.Objective(
        output_layer,
        loss_function=lasagne.objectives.categorical_crossentropy)

    loss_train = objective.get_loss(X_batch, target=y_batch)
    loss_eval = objective.get_loss(X_batch, target=y_batch, deterministic=True)

    pred = T.argmax(output_layer.get_output(X_batch, deterministic=True),
                    axis=1)
    accuracy = T.mean(T.eq(pred, y_batch), dtype=theano.config.floatX)

    all_params = lasagne.layers.get_all_params(output_layer)
    updates = lasagne.updates.nesterov_momentum(loss_train, all_params,
                                                learning_rate, momentum)

    iter_train = theano.function(
        [batch_index],
        loss_train,
        updates=updates,
        givens={
            X_batch: dataset['X_train'][batch_slice],
            y_batch: dataset['y_train'][batch_slice],
        },
    )

    iter_valid = theano.function(
        [batch_index],
        [loss_eval, accuracy],
        givens={
            X_batch: dataset['X_valid'][batch_slice],
            y_batch: dataset['y_valid'][batch_slice],
        },
    )

    iter_test = theano.function(
        [batch_index],
        [loss_eval, accuracy],
        givens={
            X_batch: dataset['X_test'][batch_slice],
            y_batch: dataset['y_test'][batch_slice],
        },
    )

    return dict(
        train=iter_train,
        valid=iter_valid,
        test=iter_test,
    )
Example #32
def test_tex_print():

    tt_normalrv_noname_expr = tt.scalar("b") * NormalRV(
        tt.scalar("\\mu"), tt.scalar("\\sigma"))
    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      b \in \mathbb{R}, \,\mu \in \mathbb{R}, \,\sigma \in \mathbb{R}
      \\
      a \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\,  \in \mathbb{R}
      \end{gathered}
      \\
      (b \odot a)
    \end{equation}
    """)
    assert tt_tprint(tt_normalrv_noname_expr) == expected.strip()

    tt_normalrv_name_expr = tt.scalar("b") * NormalRV(
        tt.scalar("\\mu"), tt.scalar("\\sigma"), size=[2, 1], name="X")
    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      b \in \mathbb{R}, \,\mu \in \mathbb{R}, \,\sigma \in \mathbb{R}
      \\
      X \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\,  \in \mathbb{R}^{2 \times 1}
      \end{gathered}
      \\
      (b \odot X)
    \end{equation}
    """)
    assert tt_tprint(tt_normalrv_name_expr) == expected.strip()

    tt_2_normalrv_noname_expr = tt.matrix("M") * NormalRV(
        tt.scalar("\\mu_2"), tt.scalar("\\sigma_2"))
    tt_2_normalrv_noname_expr *= tt.scalar("b") * NormalRV(
        tt_2_normalrv_noname_expr, tt.scalar("\\sigma")) + tt.scalar("c")
    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}}
      \\
      \mu_2 \in \mathbb{R}, \,\sigma_2 \in \mathbb{R}
      \\
      b \in \mathbb{R}, \,\sigma \in \mathbb{R}, \,c \in \mathbb{R}
      \\
      a \sim \operatorname{N}\left(\mu_2, {\sigma_2}^{2}\right)\,  \in \mathbb{R}
      \\
      d \sim \operatorname{N}\left((M \odot a), {\sigma}^{2}\right)\,  \in \mathbb{R}^{N^{d}_{0} \times N^{d}_{1}}
      \end{gathered}
      \\
      ((M \odot a) \odot ((b \odot d) + c))
    \end{equation}
    """)
    assert tt_tprint(tt_2_normalrv_noname_expr) == expected.strip()

    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      b \in \mathbb{Z}, \,c \in \mathbb{Z}, \,M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}}
      \end{gathered}
      \\
      M\left[b, \,c\right]
    \end{equation}
    """)
    # TODO: "c" should be "1".
    assert (tt_tprint(
        tt.matrix("M")[tt.iscalar("a"),
                       tt.constant(1, dtype="int")]) == expected.strip())

    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      M \in \mathbb{R}^{N^{M}_{0} \times N^{M}_{1}}
      \end{gathered}
      \\
      M\left[1\right]
    \end{equation}
    """)
    assert tt_tprint(tt.matrix("M")[1]) == expected.strip()

    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      M \in \mathbb{N}^{N^{M}_{0}}
      \end{gathered}
      \\
      M\left[2:4:0\right]
    \end{equation}
    """)
    assert tt_tprint(tt.vector("M", dtype="uint32")[0:4:2]) == expected.strip()

    norm_rv = NormalRV(tt.scalar("\\mu"), tt.scalar("\\sigma"))
    rv_obs = observed(tt.constant(1.0, dtype=norm_rv.dtype), norm_rv)

    expected = textwrap.dedent(r"""
    \begin{equation}
      \begin{gathered}
      \mu \in \mathbb{R}, \,\sigma \in \mathbb{R}
      \\
      a \sim \operatorname{N}\left(\mu, {\sigma}^{2}\right)\,  \in \mathbb{R}
      \end{gathered}
      \\
      a = 1.0
    \end{equation}
        """)
    assert tt_tprint(rv_obs) == expected.strip()
Example #33
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as tt
from six.moves import xrange

k = tt.iscalar("k")
A = tt.vector("A")


def inner_fct(prior_result, A):
    return prior_result * A


# Symbolic description of the result
result, updates = theano.scan(fn=inner_fct,
                              outputs_info=tt.ones_like(A),
                              non_sequences=A,
                              n_steps=k)

# Scan has provided us with A**1 through A**k.  Keep only the last
# value. Scan notices this and does not waste memory saving them.
final_result = result[-1]

power = theano.function(inputs=[A, k], outputs=final_result, updates=updates)

print(power(list(range(10)), 2))
#[  0.   1.   4.   9.  16.  25.  36.  49.  64.  81.]
Example #34
    def test_value_h(self):

        "tests that the value of the kl divergence decreases with each update to h_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V=X)
        init_Mu1 = e_step.init_S_hat(V=X)

        prev_setting = config.compute_test_value
        config.compute_test_value = 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(
            -5., 5., Mu1.shape))

        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        newH = e_step.infer_H_hat(V=X, H_hat=H_var, S_hat=Mu1_var)

        h_idx = newH[:, idx]

        h_i_func = function([H_var, Mu1_var, idx], h_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        trunc_kl_func = function([H_var, Mu1_var], trunc_kl)

        for i in xrange(self.N):
            prev_kl = trunc_kl_func(H, Mu1)

            H[:, i] = h_i_func(H, Mu1, i)
            #we don't update mu, the whole point of the split e step is we don't have to

            new_kl = trunc_kl_func(H, Mu1)

            increase = new_kl - prev_kl

            print 'failures after iteration ', i, ': ', (increase >
                                                         self.tol).sum()

            mx = increase.max()

            if mx > 1e-4:
                print 'increase amounts of failing examples:'
                print increase[increase > self.tol]
                print 'failing H:'
                print H[increase > self.tol, :]
                print 'failing Mu1:'
                print Mu1[increase > self.tol, :]
                print 'failing V:'
                print X[increase > self.tol, :]

                raise Exception(
                    'after mean field step in h, kl divergence should decrease, but some elements increased by as much as '
                    + str(mx) + ' after updating h_' + str(i))
Example #35
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage

        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        # n_train_size = train_set_y.get_value(borrow=True).shape[0]

        n_valid_batches = valid_set_y.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_y.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.iscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

# shift the indices in Y for each mini-batch
        offset = theano.shared(value=numpy.asarray(
            [[1, 1, 0] for i in range(batch_size)], dtype='int32'),
                               name='offset')

        train_fn = theano.function(
            inputs=[index],
            on_unused_input='ignore',
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[train_set_y[index * batch_size][0]:train_set_y[
                    (index + 1) * batch_size][0]],
                #index * batch_size:
                #(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size] -
                offset * train_set_y[index * batch_size][0]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            on_unused_input='ignore',
            givens={
                self.x:
                test_set_x[test_set_y[index * batch_size][0]:test_set_y[
                    (index + 1) * batch_size][0]],
                #index * batch_size:
                #(index + 1) * batch_size],
                self.y:
                test_set_y[index * batch_size:(index + 1) * batch_size] -
                offset * test_set_y[index * batch_size][0]
            })

        valid_score_i = theano.function(
            [index],
            self.errors,
            on_unused_input='ignore',
            givens={
                self.x:
                valid_set_x[valid_set_y[index * batch_size][0]:valid_set_y[
                    (index + 1) * batch_size][0]],
                #index * batch_size:
                #(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size] -
                offset * valid_set_y[index * batch_size][0]
            })

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches - 1)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches - 1)]

        return train_fn, valid_score, test_score
Example #36
                         initialization='he',
                         weightnorm=WEIGHT_NORM)
    out = T.nnet.relu(out)

    # Output
    # We apply the softmax later
    out = lib.ops.Linear('SampleLevel.Output',
                         DIM,
                         Q_LEVELS,
                         out,
                         weightnorm=WEIGHT_NORM)
    return out

sequences = T.imatrix('sequences')
h0        = T.tensor3('h0')
reset     = T.iscalar('reset')
mask      = T.matrix('mask')

if args.debug:
    # Solely for debugging purposes.
    # Maybe I should set the compute_test_value=warn from here.
    sequences.tag.test_value = numpy.zeros((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='int32')
    h0.tag.test_value = numpy.zeros((BATCH_SIZE, N_RNN, H0_MULT*DIM), dtype='float32')
    reset.tag.test_value = numpy.array(1, dtype='int32')
    mask.tag.test_value = numpy.ones((BATCH_SIZE, SEQ_LEN+OVERLAP), dtype='float32')
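    # A hedged note (assumption, not from the original source): the test values above
    # only take effect when Theano's test-value mode is enabled, e.g. with
    #     theano.config.compute_test_value = 'warn'
    # which checks shapes/dtypes while the graph is being built.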

input_sequences = sequences[:, :-FRAME_SIZE]
target_sequences = sequences[:, FRAME_SIZE:]

target_mask = mask[:, FRAME_SIZE:]
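
# Worked example of the shift above (illustrative assumption: FRAME_SIZE = 4):
# for a row [s0, s1, ..., s9], input_sequences holds [s0, ..., s5] and
# target_sequences holds [s4, ..., s9], so each target sample is predicted from
# the FRAME_SIZE samples that precede it.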
Example #37
0
    def __init__(self,
                 Nlayers = 1,               # number of layers
                 Ndirs = 1,                 # unidirectional or bidirectional
                 Nx = 100,                  # input size
                 Nh = 100,                  # hidden layer size
                 Ny = 100,                  # output size
                 Ah = "relu",               # hidden unit activation (e.g. relu, tanh, lstm)
                 Ay = "linear",             # output unit activation (e.g. linear, sigmoid, softmax)
                 predictPer = "frame",      # frame or sequence
                 loss = None,               # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
                 L1reg = 0.0,               # L1 regularization
                 L2reg = 0.0,               # L2 regularization
                 multiReg = 0.0,            # regularization of agreement of predictions on data of different conditions
                 momentum = 0.0,            # SGD momentum
                 seed = 15213,              # random seed for initializing the weights
                 frontEnd = None,           # a lambda function for transforming the input
                 filename = None,           # initialize from file
                 initParams = None,         # initialize from given dict
                ):

        if filename is not None:            # load parameters from file
            with smart_open(filename, "rb") as f:
                initParams = dill.load(f)
        if initParams is not None:          # load parameters from given dict
            self.paramNames = []
            self.params = []
            for k, v in initParams.iteritems():
                if type(v) is numpy.ndarray:
                    self.addParam(k, v)
                else:
                    setattr(self, k, v)
                    self.paramNames.append(k)
            # Note: locals()[k] = v doesn't work here, so the local names are rebound explicitly below.
            Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \
                = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd
        else:                           # Initialize parameters randomly
            # Names of parameters to save to file
            self.paramNames = ["Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"]
            for name in self.paramNames:
                value = locals()[name]
                setattr(self, name, value)

            # Values of parameters for building the computational graph
            self.params = []

            # Initialize random number generators
            global rng
            rng = numpy.random.RandomState(seed)

            # Construct parameter matrices
            Nlstm = 4 if Ah == 'lstm' else 1
            self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wrec", rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
            self.addParam("Wup", rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
            self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay))
            if Ah != "lstm":
                self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs)))
            else:
                self.addParam("Bhid", numpy.tile(numpy.hstack([full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3))]), (1, Ndirs)))
            self.addParam("Bout", zeros(Ny))
            self.addParam("h0", zeros((Nlayers, Ndirs, Nh)))
            if Ah == "lstm":
                self.addParam("c0", zeros((Nlayers, Ndirs, Nh)))

        # Compute total number of parameters
        self.nParams = sum(x.get_value().size for x in self.params)

        # Initialize gradient tensors when using momentum
        if momentum > 0:
            self.dparams = [theano.shared(zeros(x.get_value().shape)) for x in self.params]

        # Build computation graph
        input = T.ftensor4()    # stream * time * feature
        mask = T.imatrix()
        mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
        mask_float = [T.cast((mask % 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
                      T.cast((mask >= 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]
        # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
        # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

        def step_rnn(x_t, mask, h_tm1, W, h0):
            h_tm1 = T.switch(mask, h0, h_tm1)
            return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

        def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
            c_tm1 = T.switch(mask, c0, c_tm1)
            h_tm1 = T.switch(mask, h0, h_tm1)
            a = x_t + h_tm1.dot(W)
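            # Gate pre-activations are packed along the last axis of `a` (width 4*Nh),
            # in the order [forget | input | output | cell candidate], matching the
            # slices taken below.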
            f_t = T.nnet.sigmoid(a[:, :, :Nh])
            i_t = T.nnet.sigmoid(a[:, :, Nh : Nh * 2])
            o_t = T.nnet.sigmoid(a[:, :, Nh * 2 : Nh * 3])
            c_t = T.tanh(a[:, :, Nh * 3:]) * i_t + c_tm1 * f_t
            h_t = T.tanh(c_t) * o_t
            return [c_t, h_t]

        x = input if frontEnd is None else frontEnd(input)
        for i in range(Nlayers):
            h = (x.dimshuffle((2, 1, 0, 3)).dot(self.Win) if i == 0 else h.dot(self.Wup[i-1])) + self.Bhid[i]
                # (2, 1, 0, 3): condition * stream * time * feature => time * stream * condition * feature
            rep = lambda x: T.extra_ops.repeat(T.extra_ops.repeat(x.reshape((1, 1, -1)), h.shape[2], axis = 1), h.shape[1], axis = 0)
            if Ah != "lstm":
                h = T.concatenate([theano.scan(
                        fn = step_rnn,
                        sequences = [h[:, :, :, Nh * d : Nh * (d+1)], mask_float[d]],
                        outputs_info = [rep(self.h0[i, d])],
                        non_sequences = [self.Wrec[i, d], rep(self.h0[i, d])],
                        go_backwards = (d == 1),
                    )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 3)
            else:
                h = T.concatenate([theano.scan(
                        fn = step_lstm,
                        sequences = [h[:, :, :, Nh * 4 * d : Nh * 4 * (d+1)], mask_float[d]],
                        outputs_info = [rep(self.c0[i, d]), rep(self.h0[i, d])],
                        non_sequences = [self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d])],
                        go_backwards = (d == 1),
                    )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)], axis = 3)
        if predictPer == "sequence":
            h = h.dimshuffle((1, 0, 2, 3))  # time * stream * condition * feature => stream * time * condition * feature
            h = T.concatenate([h[mask_int[1 - d]][:, :, Nh * d : Nh * (d+1)] for d in range(Ndirs)], axis = 2)  # sequence * condition * feature
            h = h.dimshuffle((1, 0, 2))     # sequence * condition * feature => condition * sequence * feature
        else:
            h = h.dimshuffle((2, 1, 0, 3))  # time * stream * condition * feature => condition * stream * time * feature
        output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)
        output_mean = output.mean(axis = 0)
        output_var = output.var(axis = 0)

        # Compute loss function
        if loss is None:
            loss = {"linear": "mse", "sigmoid": "ce", "softmax": "ce_group"}[self.Ay]
        if loss == "ctc":
            label = T.imatrix()
            label_time = T.imatrix()
            tol = T.iscalar()
            cost = theano.scan(fn = lambda prob: ctc_cost(prob, mask, label, label_time, tol), \
                               sequences = [output])[0].mean()
        else:
            if predictPer == "sequence":
                label = T.fmatrix()
                y = output_mean
                t = label
            elif predictPer == "frame":
                label = T.ftensor3()
                indices = (mask >= 0).nonzero()
                y = output_mean[indices]
                t = label[indices]
            cost = T.mean({
                "ce":               -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis = 1),
                "ce_group":         -T.log((y * t).sum(axis = 1)),
                "mse":              T.mean((y - t) ** 2, axis = 1),
                "hinge":            T.mean(relu(1 - y * (t * 2 - 1)), axis = 1),
                "squared_hinge":    T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis = 1),
            }[loss])

        # Add regularization
        cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
        cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg
        if predictPer == "sequence":
            cost += output_var.mean() * multiReg
        else:
            indices = (mask >= 0).nonzero()
            cost += output_var[indices].mean() * multiReg

        # Compute updates for network parameters
        updates = []
        lrate = T.fscalar()
        clip = T.fscalar()
        grad = T.grad(cost, self.params)
        grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
        if momentum > 0:
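            # Nesterov-style momentum written directly in parameter space:
            #   w <- w + mu^2 * v - (1 + mu) * lr * g,   v <- mu * v - lr * g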
            for w, d, g in zip(self.params, self.dparams, grad_clipped):
                updates.append((w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
                updates.append((d, momentum * d - lrate * g))
        else:
            for w, g in zip(self.params, grad_clipped):
                updates.append((w, w - lrate * g))

        # Create functions to be called from outside
        if loss == "ctc":
            inputs = [input, mask, label, label_time, tol, lrate, clip]
        else:
            inputs = [input, mask, label, lrate, clip]
        self.train = theano.function(
                         inputs = inputs,
                         outputs = cost,
                         updates = updates,
                     )

        self.predict = theano.function(inputs = [input, mask], outputs = output)
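
        # --- Hedged usage sketch (illustrative assumptions only) ---
        # For the non-CTC case the compiled functions take
        # [input, mask, label, lrate, clip] and [input, mask] respectively, so a
        # caller could drive them roughly like this:
        #
        #     cost = net.train(input_batch, mask_batch, label_batch, lrate, clip)
        #     probs = net.predict(input_batch, mask_batch)
        #
        # where `net` names an instance of this class and the batch arrays follow the
        # condition/stream/time/feature layout expected by the graph above.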
Example #38
0
train_freq_print = 20
valid_freq_print = 500
sample_strings = ['i am alien lamp and i love the neural nets'] * 50  # ['Sous le pont Mirabeau coule la Seine.'] * 50
algo = 'adam'  # adam, sgd

#model_file_load = "/u/lambalex/models/handwriting/handwriting/71535347/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/81356894/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/10406114/saved_model.pkl"
#model_file_load = "saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/33757048/saved_model.pkl"
model_file_load = None
#model_file_load = "/u/lambalex/models/handwriting/handwriting/90207341/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting/handwriting/11151138/saved_model.pkl"
#model_file_load = "/u/lambalex/models/handwriting_pf/handwriting/52780486/saved_model.pkl"

num_steps_sample = T.iscalar('num_steps_sample')

exp_id = np.random.randint(0, 100000000, 1)[0]

dump_path = os.path.join(os.environ.get('TMP_PATH'), 'handwriting',str(exp_id))

os.umask(0o055)

os.makedirs(dump_path, 0o777)

os.makedirs(dump_path + "/src", 0o777)
os.chmod(dump_path, 0o777)

fh = open(dump_path + "/derpy_file.txt", "w")
fh.write("DERP DERP DERP DERP")
fh.close()
    def __init__(self, We_initial, params):
        # self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)

        embsize = We_initial.shape[1]
        hidden = params.hidden

        input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, params.num_labels)).astype('float32')
        self.input_init = theano.shared(input_init)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()

        Wyy0 = np.random.uniform(-0.02, 0.02, (params.num_labels + 1, params.num_labels)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        # print len(network_params)
        f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):
            p.set_value(data[idx])

        def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
            """
            :param targets_one_step: [batch_size, t]
            :param prev_label: [batch_size, t]
            :param tg_energy: [batch_size]
            :return:
            """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var})
        local_energy = local_energy.reshape((-1, length, params.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        #####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

        predy_init = self.input_init[:, :length, :]

        a_params = [self.input_init]

        predy = T.nnet.softmax(predy_init.reshape((-1, params.num_labels)))
        predy = predy.reshape((-1, length, params.num_labels))

        prediction = T.argmax(predy_init, axis=2)

        predy = predy * mask_var[:, :, None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
        [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)
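        # cost11 is the total energy of the relaxed (soft) labelling: the transition
        # energies accumulated by the scan above plus the unary energies under local_energy.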
				
		
        predy_f = predy.reshape((-1, params.num_labels))
        y_f = target_var.flatten()

        if params.annealing == 0:
            lamb = params.L3
        elif params.annealing == 1:
            lamb = params.L3 * (1 - 0.01 * t_t)

        cost = T.mean(-cost11)

        # from adam import adam
        # updates_a = adam(cost, a_params, params.eta)

        updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
        updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

        self.inf_fn = theano.function([input_var, mask_var, mask_var1, length], cost, updates=updates_a)
        self.eval_fn = theano.function([input_var, mask_var, mask_var1, length], [prediction, -cost11], on_unused_input='ignore')
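
        # --- Hedged usage sketch (illustrative only) ---
        # Repeated calls to inf_fn perform gradient-based inference on the relaxed
        # label tensor self.input_init; eval_fn then reads off the hard labels:
        #
        #     for _ in range(num_inference_steps):      # num_inference_steps is an assumption
        #         energy = model.inf_fn(x_batch, m_batch, m1_batch, seq_len)
        #     labels, energies = model.eval_fn(x_batch, m_batch, m1_batch, seq_len)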
config.lr_decay = 1.02
config.weight_decay = 1e-7
config.max_grad_norm = 10
config.num_steps = 35
config.max_epoch = 20  # number of epochs after which learning decay starts
config.drop_x = 0.25  # variational dropout rate over input word embeddings
config.drop_i = 0.75  # variational dropout rate over inputs of RHN layers(s), applied seperately in each RHN layer
config.drop_s = 0.25  # variational dropout rate over recurrent state
config.drop_o = 0.75  # variational dropout rate over outputs of RHN layer(s), applied before classification layer
config.vocab_size = 10000

print("Data loading")
train_data, valid_data, test_data, _ = ptb_raw_data(config.data_path)

print('Compiling model')
_is_training = T.iscalar('is_training')
_lr = theano.shared(cast_floatX(config.learning_rate), 'lr')
_input_data = T.imatrix('input_data')  # (batch_size, num_steps)
_noise_x = T.matrix('noise_x')  # (batch_size, num_steps)

# model
_theano_rng = RandomStreams(config.seed // 2 +
                            321)  # generates random numbers directly on GPU
flat_probs, params, rhn_updates, hidden_states = stacked.model(
    _input_data, _noise_x, _lr, _is_training, config, _theano_rng)

# loss
_targets = T.imatrix('targets')  # (batch_size, num_steps)
flat_targets = _targets.T.flatten()
xentropies = T.nnet.categorical_crossentropy(
    flat_probs, flat_targets)  # (batch_size * num_steps,)
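
# A hedged sketch of the usual next step (not from the original source): the scalar
# training cost would be the mean cross-entropy, and per-word perplexity its exponential.
#
#     cost = xentropies.mean()
#     perplexity = T.exp(cost)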
    # We apply the softmax later
    out = lib.ops.Linear('SampleLevel.Output',
                         DIM,
                         Q_LEVELS,
                         out,
                         weightnorm=WEIGHT_NORM)
    return out


sequences_8k = T.imatrix('sequences_8k')  #batch size*samplenum
sequences_up = T.imatrix('sequences_up')
condition = T.matrix('con')
con_h0 = T.tensor3('con_h0')
h0 = T.tensor3('h0')  #(batch size, N_RNN, DIM)
big_h0 = T.tensor3('big_h0')  #(batch size, N_BIG_RNN, BIG_DIM)
reset = T.iscalar('reset')
mask = T.matrix('mask')  #batch size*samplenum
batch_size = T.iscalar('batch_size')
lr = T.scalar('lr')

con_input_sequences = condition

big_input_sequences = sequences_8k  # the last BIG_FRAME_SIZE frames are not needed (tier 3)
big_input_sequences = big_input_sequences.reshape((1, batch_size, 1, -1))
big_input_sequences = T.nnet.neighbours.images2neibs(big_input_sequences,
                                                     (1, 2 * OVERLAP),
                                                     neib_step=(1, OVERLAP),
                                                     mode='valid')
big_input_sequences = big_input_sequences.reshape((batch_size, -1))

input_sequences = sequences_8k[:, 0:-(OVERLAP - FRAME_SIZE)]  #(tier2)
def create_model(s1_ae,
                 s2_ae,
                 s3_ae,
                 s1_shape,
                 s1_var,
                 s2_shape,
                 s2_var,
                 s3_shape,
                 s3_var,
                 mask_shape,
                 mask_var,
                 lstm_size=250,
                 win=T.iscalar('theta'),
                 output_classes=26,
                 fusiontype='concat',
                 w_init_fn=las.init.Orthogonal(),
                 use_peepholes=True):

    s1_bn_weights, s1_bn_biases, s1_bn_shapes, s1_bn_nonlinearities = s1_ae
    s2_weights, s2_biases, s2_shapes, s2_nonlinearities = s2_ae
    s3_weights, s3_biases, s3_shapes, s3_nonlinearities = s3_ae

    gate_parameters = Gate(W_in=w_init_fn,
                           W_hid=w_init_fn,
                           b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn,
        W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None,
        b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_s1 = InputLayer(s1_shape, s1_var, 's1_im')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    l_s2 = InputLayer(s2_shape, s2_var, 's2_im')
    l_s3 = InputLayer(s3_shape, s3_var, 's3_im')

    symbolic_batchsize_s1 = l_s1.input_var.shape[0]
    symbolic_seqlen_s1 = l_s1.input_var.shape[1]
    symbolic_batchsize_s2 = l_s2.input_var.shape[0]
    symbolic_seqlen_s2 = l_s2.input_var.shape[1]
    symbolic_batchsize_s3 = l_s3.input_var.shape[0]
    symbolic_seqlen_s3 = l_s3.input_var.shape[1]

    l_reshape1_s1 = ReshapeLayer(l_s1, (-1, s1_shape[-1]), name='reshape1_s1')
    l_encoder_s1 = create_pretrained_encoder(
        l_reshape1_s1, s1_bn_weights, s1_bn_biases, s1_bn_shapes,
        s1_bn_nonlinearities, ['fc1_s1', 'fc2_s1', 'fc3_s1', 'bottleneck_s1'])
    s1_len = las.layers.get_output_shape(l_encoder_s1)[-1]

    l_reshape2_s1 = ReshapeLayer(
        l_encoder_s1, (symbolic_batchsize_s1, symbolic_seqlen_s1, s1_len),
        name='reshape2_s1')
    l_delta_s1 = DeltaLayer(l_reshape2_s1, win, name='delta_s1')
    l_delta_s1_dropout = DropoutLayer(l_delta_s1, name='dropout_s1')

    # s2 images
    l_reshape1_s2 = ReshapeLayer(l_s2, (-1, s2_shape[-1]), name='reshape1_s2')
    l_encoder_s2 = create_pretrained_encoder(
        l_reshape1_s2, s2_weights, s2_biases, s2_shapes, s2_nonlinearities,
        ['fc1_s2', 'fc2_s2', 'fc3_s2', 'bottleneck_s2'])
    s2_len = las.layers.get_output_shape(l_encoder_s2)[-1]
    l_reshape2_s2 = ReshapeLayer(
        l_encoder_s2, (symbolic_batchsize_s2, symbolic_seqlen_s2, s2_len),
        name='reshape2_s2')
    l_delta_s2 = DeltaLayer(l_reshape2_s2, win, name='delta_s2')
    l_delta_s2_dropout = DropoutLayer(l_delta_s2, name='dropout_s2')

    # s3 images
    l_reshape1_s3 = ReshapeLayer(l_s3, (-1, s3_shape[-1]), name='reshape1_s3')
    l_encoder_s3 = create_pretrained_encoder(
        l_reshape1_s3, s3_weights, s3_biases, s3_shapes, s3_nonlinearities,
        ['fc1_s3', 'fc2_s3', 'fc3_s3', 'bottleneck_s3'])
    s3_len = las.layers.get_output_shape(l_encoder_s3)[-1]
    l_reshape2_s3 = ReshapeLayer(
        l_encoder_s3, (symbolic_batchsize_s3, symbolic_seqlen_s3, s3_len),
        name='reshape2_s3')
    l_delta_s3 = DeltaLayer(l_reshape2_s3, win, name='delta_s3')
    l_delta_s3_dropout = DropoutLayer(l_delta_s3, name='dropout_s3')

    l_lstm_s1 = LSTMLayer(
        l_delta_s1_dropout,
        lstm_size * 2,
        peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters,
        forgetgate=gate_parameters,
        cell=cell_parameters,
        outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True,
        grad_clipping=5.,
        name='lstm_s1')

    l_lstm_s2 = LSTMLayer(
        l_delta_s2_dropout,
        lstm_size * 2,
        peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters,
        forgetgate=gate_parameters,
        cell=cell_parameters,
        outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True,
        grad_clipping=5.,
        name='lstm_s2')

    l_lstm_s3 = LSTMLayer(
        l_delta_s3_dropout,
        lstm_size * 2,
        peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters,
        forgetgate=gate_parameters,
        cell=cell_parameters,
        outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True,
        grad_clipping=5.,
        name='lstm_s3')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                                          name='adasum1')
    elif fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                                  name='sum1')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer([l_lstm_s1, l_lstm_s2, l_lstm_s3],
                             axis=-1,
                             name='concat')

    l_fuse_dropout = DropoutLayer(l_fuse, name='concat_dropout')
    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse_dropout, l_mask,
                                          lstm_size * 2, cell_parameters,
                                          gate_parameters, 'lstm_agg')
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # reshape to (num_examples * seq_len, lstm_size)
    l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size * 2), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use a dense layer with one output unit per class.
    l_softmax = DenseLayer(l_reshape3,
                           num_units=output_classes,
                           nonlinearity=las.nonlinearities.softmax,
                           name='softmax')

    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_s1, output_classes),
                         name='output')

    return l_out, l_fuse
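
    # --- Hedged usage sketch (assumed names; not part of the original source) ---
    # network, l_fusion = create_model(s1_ae, s2_ae, s3_ae, ...)
    # predictions = las.layers.get_output(network)            # (batch, seq_len, output_classes)
    # flat_predictions = T.reshape(predictions, (-1, output_classes))
    # loss = T.mean(las.objectives.categorical_crossentropy(
    #     flat_predictions, target_var.flatten()))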
Example #43
0
    def test_grad_s(self):

        "tests that the gradients with respect to s_i are 0 after doing a mean field update of s_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        model.test_batch_size = X.shape[0]

        init_H = e_step.init_H_hat(V=X)
        init_Mu1 = e_step.init_S_hat(V=X)

        prev_setting = config.compute_test_value
        config.compute_test_value = 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(
            -5., 5., Mu1.shape))

        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        S = e_step.infer_S_hat(V=X, H_hat=H_var, S_hat=Mu1_var)

        s_idx = S[:, idx]

        s_i_func = function([H_var, Mu1_var, idx], s_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        # By "truncated KL" we mean dropping the terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test, and some of them are intractable).
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        grad_Mu1 = T.grad(trunc_kl.sum(), Mu1_var)

        grad_Mu1_idx = grad_Mu1[:, idx]

        grad_func = function([H_var, Mu1_var, idx], grad_Mu1_idx)

        for i in xrange(self.N):
            Mu1[:, i] = s_i_func(H, Mu1, i)

            g = grad_func(H, Mu1, i)

            assert not contains_nan(g)

            g_abs_max = np.abs(g).max()

            if g_abs_max > self.tol:
                raise Exception(
                    'after mean field step, gradient of kl divergence wrt mean field parameter should be 0, but here the max magnitude of a gradient element is '
                    + str(g_abs_max) + ' after updating s_' + str(i))
Example #44
0
def jobman(state, channel):
    # load dataset
    state['null_sym_source'] = 15000
    state['null_sym_target'] = 15000
    state['n_sym_source'] = state['null_sym_source'] + 1
    state['n_sym_target'] = state['null_sym_target'] + 1

    state['nouts'] = state['n_sym_target']
    state['nins'] = state['n_sym_source']
    rng = numpy.random.RandomState(state['seed'])
    if state['loopIters'] > 0:
        train_data, valid_data, test_data = get_data(state)
    else:
        train_data = None
        valid_data = None
        test_data = None

    ########### Training graph #####################
    ## 1. Inputs
    if state['bs'] == 1:
        x = TT.lvector('x')
        x_mask = TT.vector('x_mask')
        y = TT.lvector('y')
        y0 = y
        y_mask = TT.vector('y_mask')
    else:
        x = TT.lmatrix('x')
        x_mask = TT.matrix('x_mask')
        y = TT.lmatrix('y')
        y0 = y
        y_mask = TT.matrix('y_mask')

    # 2. Layers and Operators
    bs = state['bs']

    embdim = state['dim_mlp']

    # Source Sentence
    emb = MultiLayer(rng,
                     n_in=state['nins'],
                     n_hids=[state['rank_n_approx']],
                     activation=[state['rank_n_activ']],
                     init_fn=state['weight_init_fn'],
                     weight_noise=state['weight_noise'],
                     scale=state['weight_scale'],
                     name='emb')

    emb_words = []
    if state['rec_gating']:
        gater_words = []
    if state['rec_reseting']:
        reseter_words = []
    for si in xrange(state['encoder_stack']):
        emb_words.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_%d' % si))
        if state['rec_gating']:
            gater_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_%d' % si))
        if state['rec_reseting']:
            reseter_words.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_%d' % si))

    add_rec_step = []
    rec_proj = []
    if state['rec_gating']:
        rec_proj_gater = []
    if state['rec_reseting']:
        rec_proj_reseter = []
    for si in xrange(state['encoder_stack']):
        if si > 0:
            rec_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_reseter_%d' % si))

        add_rec_step.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_%d' % si))

    def _add_op(words_embeddings,
                words_mask=None,
                prev_val=None,
                si=0,
                state_below=None,
                gater_below=None,
                reseter_below=None,
                one_step=False,
                bs=1,
                init_state=None,
                use_noise=True):
        seqlen = words_embeddings.out.shape[0] // bs
        rval = words_embeddings
        gater = None
        reseter = None
        if state['rec_gating']:
            gater = gater_below
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            rval += rec_proj[si - 1](state_below,
                                     one_step=one_step,
                                     use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_gater[si - 1](state_below,
                                               one_step=one_step,
                                               use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_reseter[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg

        if not one_step:
            rval = add_rec_step[si](rval,
                                    nsteps=seqlen,
                                    batch_size=bs,
                                    mask=words_mask,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        else:
            rval = add_rec_step[si](rval,
                                    mask=words_mask,
                                    state_before=prev_val,
                                    gater_below=gater,
                                    reseter_below=reseter,
                                    one_step=one_step,
                                    init_state=init_state,
                                    use_noise=use_noise)
        return rval

    add_op = Operator(_add_op)

    # Target Sentence
    emb_t = MultiLayer(rng,
                       n_in=state['nouts'],
                       n_hids=[state['rank_n_approx']],
                       activation=[state['rank_n_activ']],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_t')

    emb_words_t = []
    if state['rec_gating']:
        gater_words_t = []
    if state['rec_reseting']:
        reseter_words_t = []
    for si in xrange(state['decoder_stack']):
        emb_words_t.append(
            MultiLayer(rng,
                       n_in=state['rank_n_approx'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='emb_words_t_%d' % si))
        if state['rec_gating']:
            gater_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='gater_words_t_%d' % si))
        if state['rec_reseting']:
            reseter_words_t.append(
                MultiLayer(rng,
                           n_in=state['rank_n_approx'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           learn_bias=False,
                           name='reseter_words_t_%d' % si))

    proj_everything_t = []
    if state['rec_gating']:
        gater_everything_t = []
    if state['rec_reseting']:
        reseter_everything_t = []
    for si in xrange(state['decoder_stack']):
        proj_everything_t.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[embdim],
                       activation=['lambda x:x'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       scale=state['weight_scale'],
                       name='proj_everything_t_%d' % si,
                       learn_bias=False))
        if state['rec_gating']:
            gater_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='gater_everything_t_%d' % si,
                           learn_bias=False))
        if state['rec_reseting']:
            reseter_everything_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=['lambda x:x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='reseter_everything_t_%d' % si,
                           learn_bias=False))

    add_rec_step_t = []
    rec_proj_t = []
    if state['rec_gating']:
        rec_proj_t_gater = []
    if state['rec_reseting']:
        rec_proj_t_reseter = []
    for si in xrange(state['decoder_stack']):
        if si > 0:
            rec_proj_t.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[embdim],
                           activation=['lambda x:x'],
                           init_fn=state['rec_weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['rec_weight_scale'],
                           name='rec_proj_%d' % si))
            if state['rec_gating']:
                rec_proj_t_gater.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_gater_%d' % si))
            if state['rec_reseting']:
                rec_proj_t_reseter.append(
                    MultiLayer(rng,
                               n_in=state['dim'],
                               n_hids=[state['dim']],
                               activation=['lambda x:x'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='rec_proj_t_reseter_%d' % si))

        add_rec_step_t.append(
            eval(state['rec_layer'])(rng,
                                     n_hids=state['dim'],
                                     activation=state['activ'],
                                     bias_scale=state['bias'],
                                     scale=state['rec_weight_scale'],
                                     init_fn=state['rec_weight_init_fn'],
                                     weight_noise=state['weight_noise_rec'],
                                     dropout=state['dropout_rec'],
                                     gating=state['rec_gating'],
                                     gater_activation=state['rec_gater'],
                                     reseting=state['rec_reseting'],
                                     reseter_activation=state['rec_reseter'],
                                     name='add_h_t_%d' % si))

    if state['encoder_stack'] > 1:
        encoder_proj = []
        for si in xrange(state['encoder_stack']):
            encoder_proj.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim'] * state['maxout_part']],
                           activation=['lambda x: x'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           scale=state['weight_scale'],
                           name='encoder_proj_%d' % si,
                           learn_bias=(si == 0)))

        encoder_act_layer = UnaryOp(activation=eval(state['unary_activ']),
                                    indim=indim,
                                    pieces=pieces,
                                    rng=rng)

    def _add_t_op(words_embeddings,
                  everything=None,
                  words_mask=None,
                  prev_val=None,
                  one_step=False,
                  bs=1,
                  init_state=None,
                  use_noise=True,
                  gater_below=None,
                  reseter_below=None,
                  si=0,
                  state_below=None):
        seqlen = words_embeddings.out.shape[0] // bs

        rval = words_embeddings
        gater = None
        if state['rec_gating']:
            gater = gater_below
        reseter = None
        if state['rec_reseting']:
            reseter = reseter_below
        if si > 0:
            if isinstance(state_below, list):
                state_below = state_below[-1]
            rval += rec_proj_t[si - 1](state_below,
                                       one_step=one_step,
                                       use_noise=use_noise)
            if state['rec_gating']:
                projg = rec_proj_t_gater[si - 1](state_below,
                                                 one_step=one_step,
                                                 use_noise=use_noise)
                if gater: gater += projg
                else: gater = projg
            if state['rec_reseting']:
                projg = rec_proj_t_reseter[si - 1](state_below,
                                                   one_step=one_step,
                                                   use_noise=use_noise)
                if reseter: reseter += projg
                else: reseter = projg
        if everything:
            rval = rval + proj_everything_t[si](everything)
            if state['rec_gating']:
                everyg = gater_everything_t[si](everything,
                                                one_step=one_step,
                                                use_noise=use_noise)
                if gater: gater += everyg
                else: gater = everyg
            if state['rec_reseting']:
                everyg = reseter_everything_t[si](everything,
                                                  one_step=one_step,
                                                  use_noise=use_noise)
                if reseter: reseter += everyg
                else: reseter = everyg

        if not one_step:
            rval = add_rec_step_t[si](rval,
                                      nsteps=seqlen,
                                      batch_size=bs,
                                      mask=words_mask,
                                      one_step=one_step,
                                      init_state=init_state,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        else:
            rval = add_rec_step_t[si](rval,
                                      mask=words_mask,
                                      state_before=prev_val,
                                      one_step=one_step,
                                      gater_below=gater,
                                      reseter_below=reseter,
                                      use_noise=use_noise)
        return rval

    add_t_op = Operator(_add_t_op)

    outdim = state['dim_mlp']
    if not state['deep_out']:
        outdim = state['rank_n_approx']

    if state['bias_code']:
        bias_code = []
        for si in xrange(state['decoder_stack']):
            bias_code.append(
                MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[state['dim']],
                           activation=[state['activ']],
                           bias_scale=[state['bias']],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           name='bias_code_%d' % si))

    if state['avg_word']:
        word_code_nin = state['rank_n_approx']
        word_code = MultiLayer(rng,
                               n_in=word_code_nin,
                               n_hids=[outdim],
                               activation='lambda x:x',
                               bias_scale=[state['bias_mlp'] / 3],
                               scale=state['weight_scale'],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               learn_bias=False,
                               name='word_code')

    proj_code = MultiLayer(rng,
                           n_in=state['dim'],
                           n_hids=[outdim],
                           activation='lambda x: x',
                           bias_scale=[state['bias_mlp'] / 3],
                           scale=state['weight_scale'],
                           init_fn=state['weight_init_fn'],
                           weight_noise=state['weight_noise'],
                           learn_bias=False,
                           name='proj_code')

    proj_h = []
    for si in xrange(state['decoder_stack']):
        proj_h.append(
            MultiLayer(rng,
                       n_in=state['dim'],
                       n_hids=[outdim],
                       activation='lambda x: x',
                       bias_scale=[state['bias_mlp'] / 3],
                       scale=state['weight_scale'],
                       init_fn=state['weight_init_fn'],
                       weight_noise=state['weight_noise'],
                       name='proj_h_%d' % si))

    if state['bigram']:
        proj_word = MultiLayer(rng,
                               n_in=state['rank_n_approx'],
                               n_hids=[outdim],
                               activation=['lambda x:x'],
                               bias_scale=[state['bias_mlp'] / 3],
                               init_fn=state['weight_init_fn'],
                               weight_noise=state['weight_noise'],
                               scale=state['weight_scale'],
                               learn_bias=False,
                               name='emb_words_lm')

    if state['deep_out']:
        indim = 0
        pieces = 0
        act_layer = UnaryOp(activation=eval(state['unary_activ']))
        drop_layer = DropOp(rng=rng, dropout=state['dropout'])

    if state['deep_out']:
        indim = state['dim_mlp'] / state['maxout_part']
        rank_n_approx = state['rank_n_approx']
        rank_n_activ = state['rank_n_activ']
    else:
        indim = state['rank_n_approx']
        rank_n_approx = 0
        rank_n_activ = None
    output_layer = SoftmaxLayer(rng,
                                indim,
                                state['nouts'],
                                state['weight_scale'],
                                -1,
                                rank_n_approx=rank_n_approx,
                                rank_n_activ=rank_n_activ,
                                weight_noise=state['weight_noise'],
                                init_fn=state['weight_init_fn'],
                                name='out')

    def _pop_op(everything,
                accum,
                everything_max=None,
                everything_min=None,
                word=None,
                aword=None,
                one_step=False,
                use_noise=True):

        rval = proj_h[0](accum[0], one_step=one_step, use_noise=use_noise)
        for si in xrange(1, state['decoder_stack']):
            rval += proj_h[si](accum[si],
                               one_step=one_step,
                               use_noise=use_noise)
        if state['mult_out']:
            rval = rval * everything
        else:
            rval = rval + everything

        if aword and state['avg_word']:
            wcode = aword
            if one_step:
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
            else:
                if not isinstance(wcode, TT.TensorVariable):
                    wcode = wcode.out
                shape = wcode.shape
                rshape = rval.shape
                rval = rval.reshape(
                    [rshape[0] / shape[0], shape[0], rshape[1]])
                wcode = wcode.dimshuffle('x', 0, 1)
                if state['mult_out']:
                    rval = rval * wcode
                else:
                    rval = rval + wcode
                rval = rval.reshape(rshape)
        if word and state['bigram']:
            if one_step:
                if state['mult_out']:
                    rval *= proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
                else:
                    rval += proj_word(emb_t(word, use_noise=use_noise),
                                      one_step=one_step,
                                      use_noise=use_noise)
            else:
                if isinstance(word, TT.TensorVariable):
                    shape = word.shape
                    ndim = word.ndim
                else:
                    shape = word.shape
                    ndim = word.out.ndim
                pword = proj_word(emb_t(word, use_noise=use_noise),
                                  one_step=one_step,
                                  use_noise=use_noise)
                shape_pword = pword.shape
                if ndim == 1:
                    pword = Shift()(pword.reshape([shape[0], 1, outdim]))
                else:
                    pword = Shift()(pword.reshape([shape[0], shape[1],
                                                   outdim]))
                if state['mult_out']:
                    rval *= pword.reshape(shape_pword)
                else:
                    rval += pword.reshape(shape_pword)
        if state['deep_out']:
            rval = drop_layer(act_layer(rval), use_noise=use_noise)
        return rval

    pop_op = Operator(_pop_op)

    # 3. Constructing the model
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x)),
               x_mask,
               bs=x_mask.shape[1],
               si=0,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]))
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x)),
                   x_mask,
                   bs=x_mask.shape[1],
                   si=si,
                   state_below=encoder_acts[-1],
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]))

    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = LastState(ntimes=True, n=y.shape[0])(encoder)
    else:
        everything = encoder_act_layer(everything)
        everything = everything.reshape(
            [1, everything.shape[0], everything.shape[1]])
        everything = LastState(ntimes=True, n=y.shape[0])(everything)

    if state['bias_code']:
        init_state = [bc(everything[-1]) for bc in bias_code]
    else:
        init_state = [None] * state['decoder_stack']  # bias_code does not exist in this branch

    if state['avg_word']:
        shape = x.shape
        pword = emb(x).out.reshape(
            [shape[0], shape[1], state['rank_n_approx']])
        pword = pword * x_mask.dimshuffle(0, 1, 'x')
        aword = pword.sum(0) / TT.maximum(1., x_mask.sum(0).dimshuffle(0, 'x'))
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words_t[0](emb_t(y0))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words_t[0](emb_t(y0))
    has_said = [
        add_t_op(emb_words_t[0](emb_t(y0)),
                 everything,
                 y_mask,
                 bs=y_mask.shape[1],
                 gater_below=gater_below,
                 reseter_below=reseter_below,
                 init_state=init_state[0],
                 si=0)
    ]
    for si in xrange(1, state['decoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[si](emb_t(y0))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[si](emb_t(y0))
        has_said.append(
            add_t_op(emb_words_t[si](emb_t(y0)),
                     everything,
                     y_mask,
                     bs=y_mask.shape[1],
                     state_below=has_said[-1],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     init_state=init_state[si],
                     si=si))

    if has_said[0].out.ndim < 3:
        for si in xrange(state['decoder_stack']):
            shape_hs = has_said[si].shape
            if y0.ndim == 1:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], 1, state['dim_mlp']]))
            else:
                shape = y0.shape
                has_said[si] = Shift()(has_said[si].reshape(
                    [shape[0], shape[1], state['dim_mlp']]))
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])
            has_said[si] = has_said[si].reshape(shape_hs)
    else:
        for si in xrange(state['decoder_stack']):
            has_said[si] = Shift()(has_said[si])
            has_said[si].out = TT.set_subtensor(has_said[si].out[0, :, :],
                                                init_state[si])

    model = pop_op(proj_code(everything), has_said, word=y0, aword=aword)

    nll = output_layer.train(
        state_below=model, target=y0, mask=y_mask, reg=None) / TT.cast(
            y.shape[0] * y.shape[1], 'float32')

    valid_fn = None
    noise_fn = None

    x = TT.lvector(name='x')
    n_steps = TT.iscalar('nsteps')
    temp = TT.scalar('temp')
    gater_below = None
    if state['rec_gating']:
        gater_below = gater_words[0](emb(x))
    reseter_below = None
    if state['rec_reseting']:
        reseter_below = reseter_words[0](emb(x))
    encoder_acts = [
        add_op(emb_words[0](emb(x), use_noise=False),
               si=0,
               use_noise=False,
               gater_below=gater_below,
               reseter_below=reseter_below)
    ]
    if state['encoder_stack'] > 1:
        everything = encoder_proj[0](last(encoder_acts[-1]), use_noise=False)
    for si in xrange(1, state['encoder_stack']):
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words[si](emb(x))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words[si](emb(x))
        encoder_acts.append(
            add_op(emb_words[si](emb(x), use_noise=False),
                   si=si,
                   state_below=encoder_acts[-1],
                   use_noise=False,
                   gater_below=gater_below,
                   reseter_below=reseter_below))
        if state['encoder_stack'] > 1:
            everything += encoder_proj[si](last(encoder_acts[-1]),
                                           use_noise=False)
    if state['encoder_stack'] <= 1:
        encoder = encoder_acts[-1]
        everything = last(encoder)
    else:
        everything = encoder_act_layer(everything)

    init_state = []
    for si in xrange(state['decoder_stack']):
        if state['bias_code']:
            init_state.append(
                TT.reshape(bias_code[si](everything, use_noise=False),
                           [1, state['dim']]))
        else:
            init_state.append(TT.alloc(numpy.float32(0), 1, state['dim']))

    if state['avg_word']:
        aword = emb(x, use_noise=False).out.mean(0)
        aword = word_code(aword, use_noise=False)
    else:
        aword = None

    def sample_fn(*args):
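        # One step of the sampler scan defined below: given the previously
        # sampled word, the previous decoder states and the encoder context,
        # draw the next word from the output layer, score it, and advance
        # every layer of the decoder stack by one step.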
        aidx = 0
        word_tm1 = args[aidx]
        aidx += 1
        prob_tm1 = args[aidx]
        has_said_tm1 = []
        for si in xrange(state['decoder_stack']):
            aidx += 1
            has_said_tm1.append(args[aidx])
        aidx += 1
        ctx = args[aidx]
        if state['avg_word']:
            aidx += 1
            awrd = args[aidx]
        else:
            awrd = None

        val = pop_op(proj_code(ctx),
                     has_said_tm1,
                     word=word_tm1,
                     aword=awrd,
                     one_step=True,
                     use_noise=False)
        sample = output_layer.get_sample(state_below=val, temp=temp)
        logp = output_layer.get_cost(state_below=val.out.reshape(
            [1, TT.cast(output_layer.n_in, 'int64')]),
                                     temp=temp,
                                     target=sample.reshape([1, 1]),
                                     use_noise=False)
        gater_below = None
        if state['rec_gating']:
            gater_below = gater_words_t[0](emb_t(sample))
        reseter_below = None
        if state['rec_reseting']:
            reseter_below = reseter_words_t[0](emb_t(sample))
        has_said_t = [
            add_t_op(emb_words_t[0](emb_t(sample)),
                     ctx,
                     prev_val=has_said_tm1[0],
                     gater_below=gater_below,
                     reseter_below=reseter_below,
                     one_step=True,
                     use_noise=True,
                     si=0)
        ]
        for si in xrange(1, state['decoder_stack']):
            gater_below = None
            if state['rec_gating']:
                gater_below = gater_words_t[si](emb_t(sample))
            reseter_below = None
            if state['rec_reseting']:
                reseter_below = reseter_words_t[si](emb_t(sample))
            has_said_t.append(
                add_t_op(emb_words_t[si](emb_t(sample)),
                         ctx,
                         prev_val=has_said_tm1[si],
                         gater_below=gater_below,
                         reseter_below=reseter_below,
                         one_step=True,
                         use_noise=True,
                         si=si,
                         state_below=has_said_t[-1]))
        for si in xrange(state['decoder_stack']):
            if isinstance(has_said_t[si], list):
                has_said_t[si] = has_said_t[si][-1]
        rval = [sample, TT.cast(logp, 'float32')] + has_said_t
        return rval

    sampler_params = [everything]
    if state['avg_word']:
        sampler_params.append(aword)

    states = [TT.alloc(numpy.int64(0), n_steps)]
    states.append(TT.alloc(numpy.float32(0), n_steps))
    states += init_state

    outputs, updates = scan(sample_fn,
                            states=states,
                            params=sampler_params,
                            n_steps=n_steps,
                            name='sampler_scan')
    samples = outputs[0]
    probs = outputs[1]

    sample_fn = theano.function([n_steps, temp, x],
                                [samples, probs.sum()],
                                updates=updates,
                                profile=False,
                                name='sample_fn')

    model = LM_Model(cost_layer=nll,
                     weight_noise_amount=state['weight_noise_amount'],
                     valid_fn=valid_fn,
                     sample_fn=sample_fn,
                     clean_before_noise_fn=False,
                     noise_fn=noise_fn,
                     indx_word=state['indx_word_target'],
                     indx_word_src=state['indx_word'],
                     character_level=False,
                     rng=rng)

    if state['loopIters'] > 0: algo = SGD(model, state, train_data)
    else: algo = None

    def hook_fn():
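        # Sampling hook called during training: walk through a few minibatches,
        # print the source and target word sequences up to '<eol>', and ask the
        # model to generate samples conditioned on each (unpadded) source.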
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs
        old_offset = train_data.offset
        if state['sample_reset']: train_data.reset()
        ns = 0
        for sidx in xrange(state['sample_n']):
            while True:
                batch = train_data.next()
                if batch:
                    break
            x = batch['x']
            y = batch['y']
            #xbow = batch['x_bow']
            masks = batch['x_mask']
            if x.ndim > 1:
                for idx in xrange(x.shape[1]):
                    ns += 1
                    if ns > state['sample_max']:
                        break
                    print 'Input: ',
                    for k in xrange(x[:, idx].shape[0]):
                        print model.word_indxs_src[x[:, idx][k]],
                        if model.word_indxs_src[x[:, idx][k]] == '<eol>':
                            break
                    print ''
                    print 'Target: ',
                    for k in xrange(y[:, idx].shape[0]):
                        print model.word_indxs[y[:, idx][k]],
                        if model.word_indxs[y[:, idx][k]] == '<eol>':
                            break
                    print ''
                    senlen = len(x[:, idx])
                    if len(numpy.where(masks[:, idx] == 0)[0]) > 0:
                        senlen = numpy.where(masks[:, idx] == 0)[0][0]
                    if senlen < 1:
                        continue
                    xx = x[:senlen, idx]
                    #xx = xx.reshape([xx.shape[0], 1])
                    model.get_samples(state['seqlen'] + 1, 1, xx)
            else:
                ns += 1
                model.get_samples(state['seqlen'] + 1, 1, x)
            if ns > state['sample_max']:
                break
        train_data.offset = old_offset
        return

    main = MainLoop(train_data,
                    valid_data,
                    None,
                    model,
                    algo,
                    state,
                    channel,
                    reset=state['reset'],
                    hooks=hook_fn)
    if state['reload']: main.load()
    if state['loopIters'] > 0: main.main()

    if state['sampler_test']:
        # This is a test script: we only sample
        if not hasattr(model, 'word_indxs'): model.load_dict()
        if not hasattr(model, 'word_indxs_src'):
            model.word_indxs_src = model.word_indxs

        indx_word = pkl.load(open(state['word_indx'], 'rb'))

        try:
            while True:
                try:
                    seqin = raw_input('Input Sequence: ')
                    n_samples = int(raw_input('How many samples? '))
                    alpha = float(raw_input('Inverse Temperature? '))

                    seqin = seqin.lower()
                    seqin = seqin.split()

                    seqlen = len(seqin)
                    seq = numpy.zeros(seqlen + 1, dtype='int64')
                    for idx, sx in enumerate(seqin):
                        try:
                            seq[idx] = indx_word[sx]
                        except:
                            seq[idx] = indx_word[state['oov']]
                    seq[-1] = state['null_sym_source']

                except Exception:
                    print 'Something wrong with your input! Try again!'
                    continue

                sentences = []
                all_probs = []
                for sidx in xrange(n_samples):
                    #import ipdb; ipdb.set_trace()
                    [values, probs] = model.sample_fn(seqlen * 3, alpha, seq)
                    sen = []
                    for k in xrange(values.shape[0]):
                        if model.word_indxs[values[k]] == '<eol>':
                            break
                        sen.append(model.word_indxs[values[k]])
                    sentences.append(" ".join(sen))
                    all_probs.append(-probs)
                sprobs = numpy.argsort(all_probs)
                for pidx in sprobs:
                    print pidx, "(%f):" % (-all_probs[pidx]), sentences[pidx]
                print

        except KeyboardInterrupt:
            print 'Interrupted'
            pass
Example #45
0
    def __init__(self,
                 n_in,
                 hidden_layer_size,
                 n_out,
                 L1_reg,
                 L2_reg,
                 hidden_layer_type,
                 output_type='LINEAR',
                 network_type='DNN',
                 dropout_rate=0.0):
        """ This function initialises a neural network

        :param n_in: Dimensionality of input features
        :type n_in: Integer
        :param hidden_layer_size: The layer size for each hidden layer
        :type hidden_layer_size: A list of integers
        :param n_out: Dimensionality of output features
        :type n_out: Integer
        :param hidden_layer_type: the activation type of each hidden layer, e.g., TANH, LSTM, GRU, BLSTM
        :param L1_reg: the L1 regularisation weight
        :param L2_reg: the L2 regularisation weight
        :param output_type: the activation type of the output layer; 'LINEAR' (linear regression) by default.
        :param dropout_rate: probability of dropout, a float number between 0 and 1.
        """

        logger = logging.getLogger("DNN initialization")

        self.n_in = int(n_in)
        self.n_out = int(n_out)

        self.n_layers = len(hidden_layer_size)

        self.dropout_rate = dropout_rate
        self.is_train = T.iscalar('is_train')

        assert len(hidden_layer_size) == len(hidden_layer_type)

        self.x = T.matrix('x')
        self.y = T.matrix('y')

        if network_type == "S2S":
            self.d = T.ivector('d')
            self.f = T.matrix('f')

        self.L1_reg = L1_reg
        self.L2_reg = L2_reg

        self.rnn_layers = []
        self.params = []
        self.delta_params = []

        rng = np.random.RandomState(123)

        BLSTM_variants = ['BLSTM', 'BSLSTM', 'BLSTME']
        Encoder_variants = ['RNNE', 'LSTME', 'BLSTME', 'SLSTME']
        for i in range(self.n_layers):
            if i == 0:
                input_size = n_in
            else:
                input_size = hidden_layer_size[i - 1]
                if hidden_layer_type[i - 1] in BLSTM_variants:
                    input_size = hidden_layer_size[i - 1] * 2

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.rnn_layers[i - 1].output

            ### sequence-to-sequence mapping ###
            if hidden_layer_type[i - 1] in Encoder_variants:
                dur_input = self.d
                frame_feat_input = self.f

                if network_type == "S2S":
                    seq2seq_model = DistributedSequenceEncoder(
                        rng, layer_input, dur_input)
                    layer_input = T.concatenate(
                        (seq2seq_model.encoded_output, frame_feat_input),
                        axis=1)
                    input_size = input_size + 4

                else:
                    logger.critical(
                        "This network type: %s is not supported right now! \n Please use one of the following: DNN, RNN, S2S\n"
                        % (network_type))
                    sys.exit(1)

            if hidden_layer_type[i] == 'SLSTM':
                hidden_layer = SimplifiedLstm(rng,
                                              layer_input,
                                              input_size,
                                              hidden_layer_size[i],
                                              p=self.dropout_rate,
                                              training=self.is_train)
            elif hidden_layer_type[i] == 'SGRU':
                hidden_layer = SimplifiedGRU(rng,
                                             layer_input,
                                             input_size,
                                             hidden_layer_size[i],
                                             p=self.dropout_rate,
                                             training=self.is_train)
            elif hidden_layer_type[i] == 'GRU':
                hidden_layer = GatedRecurrentUnit(rng,
                                                  layer_input,
                                                  input_size,
                                                  hidden_layer_size[i],
                                                  p=self.dropout_rate,
                                                  training=self.is_train)
            elif hidden_layer_type[i] == 'LSTM_NFG':
                hidden_layer = LstmNFG(rng,
                                       layer_input,
                                       input_size,
                                       hidden_layer_size[i],
                                       p=self.dropout_rate,
                                       training=self.is_train)
            elif hidden_layer_type[i] == 'LSTM_NOG':
                hidden_layer = LstmNOG(rng,
                                       layer_input,
                                       input_size,
                                       hidden_layer_size[i],
                                       p=self.dropout_rate,
                                       training=self.is_train)
            elif hidden_layer_type[i] == 'LSTM_NIG':
                hidden_layer = LstmNIG(rng,
                                       layer_input,
                                       input_size,
                                       hidden_layer_size[i],
                                       p=self.dropout_rate,
                                       training=self.is_train)
            elif hidden_layer_type[i] == 'LSTM_NPH':
                hidden_layer = LstmNoPeepholes(rng,
                                               layer_input,
                                               input_size,
                                               hidden_layer_size[i],
                                               p=self.dropout_rate,
                                               training=self.is_train)
            elif hidden_layer_type[i] == 'LSTM':
                hidden_layer = VanillaLstm(rng,
                                           layer_input,
                                           input_size,
                                           hidden_layer_size[i],
                                           p=self.dropout_rate,
                                           training=self.is_train)
            elif hidden_layer_type[i] == 'LSTME':
                hidden_layer = VanillaLstm(rng,
                                           layer_input,
                                           input_size,
                                           hidden_layer_size[i],
                                           p=self.dropout_rate,
                                           training=self.is_train)
            elif hidden_layer_type[i] == 'CLSTM':
                hidden_layer = ContextLstm(rng,
                                           layer_input,
                                           input_size,
                                           hidden_layer_size[i],
                                           p=self.dropout_rate,
                                           training=self.is_train)
            elif hidden_layer_type[i] == 'LSTMD':
                hidden_layer = VanillaLstmDecoder(rng,
                                                  layer_input,
                                                  input_size,
                                                  hidden_layer_size[i],
                                                  self.n_out,
                                                  p=self.dropout_rate,
                                                  training=self.is_train)
            elif hidden_layer_type[i] == 'BSLSTM':
                hidden_layer = BidirectionSLstm(rng,
                                                layer_input,
                                                input_size,
                                                hidden_layer_size[i],
                                                hidden_layer_size[i],
                                                p=self.dropout_rate,
                                                training=self.is_train)
            elif hidden_layer_type[i] == 'BLSTM':
                hidden_layer = BidirectionLstm(rng,
                                               layer_input,
                                               input_size,
                                               hidden_layer_size[i],
                                               hidden_layer_size[i],
                                               p=self.dropout_rate,
                                               training=self.is_train)
            elif hidden_layer_type[i] == 'BLSTME':
                hidden_layer = BidirectionLstm(rng,
                                               layer_input,
                                               input_size,
                                               hidden_layer_size[i],
                                               hidden_layer_size[i],
                                               p=self.dropout_rate,
                                               training=self.is_train)
            elif hidden_layer_type[i] == 'RNN':
                hidden_layer = VanillaRNN(rng,
                                          layer_input,
                                          input_size,
                                          hidden_layer_size[i],
                                          p=self.dropout_rate,
                                          training=self.is_train)
            elif hidden_layer_type[i] == 'RNNE':
                hidden_layer = VanillaRNN(rng,
                                          layer_input,
                                          input_size,
                                          hidden_layer_size[i],
                                          p=self.dropout_rate,
                                          training=self.is_train)
            elif hidden_layer_type[i] == 'RNND':
                hidden_layer = VanillaRNNDecoder(rng,
                                                 layer_input,
                                                 input_size,
                                                 hidden_layer_size[i],
                                                 self.n_out,
                                                 p=self.dropout_rate,
                                                 training=self.is_train)
            elif hidden_layer_type[i] == 'TANH':
                hidden_layer = SigmoidLayer(rng,
                                            layer_input,
                                            input_size,
                                            hidden_layer_size[i],
                                            activation=T.tanh,
                                            p=self.dropout_rate,
                                            training=self.is_train)
            elif hidden_layer_type[i] == 'SIGMOID':
                hidden_layer = SigmoidLayer(rng,
                                            layer_input,
                                            input_size,
                                            hidden_layer_size[i],
                                            activation=T.nnet.sigmoid,
                                            p=self.dropout_rate,
                                            training=self.is_train)
            else:
                logger.critical(
                    "This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n"
                    % (hidden_layer_type[i]))
                sys.exit(1)

            self.rnn_layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)

        input_size = hidden_layer_size[-1]
        if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[
                -1] == 'BLSTM':
            input_size = hidden_layer_size[-1] * 2

        if hidden_layer_type[-1] == "RNND" or hidden_layer_type[-1] == "LSTMD":
            self.final_layer = self.rnn_layers[-1]
        else:
            if output_type == 'LINEAR':
                self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output,
                                               input_size, self.n_out)
            elif output_type == 'SIGMOID':
                self.final_layer = SigmoidLayer(rng,
                                                self.rnn_layers[-1].output,
                                                input_size,
                                                self.n_out,
                                                activation=T.nnet.sigmoid)
            else:
                logger.critical(
                    "This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, SIGMOID\n"
                    % (output_type))
                sys.exit(1)

            self.params.extend(self.final_layer.params)

        self.updates = {}
        for param in self.params:
            self.updates[param] = theano.shared(
                value=np.zeros(param.get_value(borrow=True).shape,
                               dtype=theano.config.floatX),
                name='updates')

        self.finetune_cost = T.mean(
            T.sum((self.final_layer.output - self.y)**2, axis=1))
        self.errors = T.mean(
            T.sum((self.final_layer.output - self.y)**2, axis=1))
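
        # A hypothetical usage sketch (the class name DeepRecurrentNetwork and
        # the concrete sizes below are assumptions, not part of this snippet).
        # hidden_layer_size and hidden_layer_type must have the same length,
        # one entry per hidden layer:
        #
        #   model = DeepRecurrentNetwork(n_in=425,
        #                                hidden_layer_size=[512, 512, 256],
        #                                n_out=187, L1_reg=0.0, L2_reg=1e-5,
        #                                hidden_layer_type=['TANH', 'TANH', 'LSTM'],
        #                                output_type='LINEAR',
        #                                network_type='DNN', dropout_rate=0.1)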
Example #46
0
import theano
import theano.tensor as T
import numpy as np
theano.config.warn.subtensor_merge_bug = False

i = T.iscalar("i")
x = T.iscalar("x")
y = T.iscalar("y")
A = T.imatrix("A")


def inner_sum(prior_x, B):
    return prior_x + B


def inner_sum2D(x_t, y_t, u):
    return x_t + y_t + u


row_count = 3
column_count = 4

# Symbolic description of the result
result, updates = theano.scan(
    fn=inner_sum2D,
    sequences=dict(input=T.flatten(A), taps=[column_count]),
    outputs_info=dict(initial=T.flatten(A), taps=[-1, -column_count]),
    #non_sequences=
    n_steps=x * y)

# result now holds the per-step outputs of the scan defined above.
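
# A minimal companion sketch (an addition, not in the original snippet): the
# inner_sum helper above is otherwise unused, but it can drive a simple
# cumulative scan over the rows of A. Scan passes the current row as the first
# argument and the previous output as the second, so each step adds one more
# row to the running total.
cumsum_result, _ = theano.scan(fn=inner_sum,
                               sequences=A,
                               outputs_info=T.zeros_like(A[0]))
row_cumsum = theano.function(inputs=[A], outputs=cumsum_result)
print row_cumsum(np.arange(12, dtype='int32').reshape(3, 4))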
Example #47
0
    def __init__(self, We_initial, char_embedd_table_initial, params):
        self.textfile = open(params.outfile, 'w')
        We = theano.shared(We_initial)
        We_inf = theano.shared(We_initial)

        embsize = We_initial.shape[1]
        hidden = params.hidden
        self.en_hidden_size = params.hidden_inf
        self.num_labels = params.num_labels
        self.de_hidden_size = params.de_hidden_size

        self.lstm_layers_num = 1

        char_embedd_dim = params.char_embedd_dim
        char_dic_size = len(params.char_dic)
        char_embedd_table = theano.shared(char_embedd_table_initial)
        char_embedd_table_inf = theano.shared(char_embedd_table_initial)

        input_var = T.imatrix(name='inputs')
        target_var = T.imatrix(name='targets')
        target_var_in = T.imatrix(name='in_targets')
        mask_var = T.fmatrix(name='masks')
        mask_var1 = T.fmatrix(name='masks1')
        length = T.iscalar()
        length0 = T.iscalar()
        t_t = T.fscalar()
        t_t0 = T.fscalar()
        char_input_var = T.itensor3(name='char-inputs')

        use_dropout = T.fscalar()
        use_dropout0 = T.fscalar()

        Wyy0 = np.random.uniform(
            -0.02, 0.02,
            (self.num_labels + 1, self.num_labels + 1)).astype('float32')
        Wyy = theano.shared(Wyy0)

        l_in_word = lasagne.layers.InputLayer((None, None))
        l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

        if params.emb == 1:
            l_emb_word = lasagne.layers.EmbeddingLayer(
                l_in_word,
                input_size=We_initial.shape[0],
                output_size=embsize,
                W=We)
        else:
            l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

        layer_char_input = lasagne.layers.InputLayer(shape=(None, None,
                                                            Max_Char_Length),
                                                     input_var=char_input_var,
                                                     name='char-input')

        layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
        layer_char_embedding = lasagne.layers.EmbeddingLayer(
            layer_char,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table,
            name='char_embedding')

        layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding,
                                                    pattern=(0, 2, 1))

        # first get some necessary dimensions or parameters
        conv_window = 3
        num_filters = params.num_filters

        # construct convolution layer
        cnn_layer = lasagne.layers.Conv1DLayer(
            layer_char,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn')
        # infer the pool size for pooling (the pool should span all time steps of the CNN output)
        _, _, pool_size = cnn_layer.output_shape

        # construct max pool layer
        pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer,
                                                   pool_size=pool_size)
        # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
        output_cnn_layer = lasagne.layers.reshape(pool_layer,
                                                  (-1, length, [1]))

        # finally, concatenate the two incoming layers together.
        l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word],
                                           axis=2)

        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word,
                                                hidden,
                                                mask_input=l_mask_word,
                                                backwards=True)

        concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)

        l_reshape_concat = lasagne.layers.ReshapeLayer(concat,
                                                       (-1, 2 * hidden))

        l_local = lasagne.layers.DenseLayer(
            l_reshape_concat,
            num_units=self.num_labels,
            nonlinearity=lasagne.nonlinearities.linear)

        network_params = lasagne.layers.get_all_params(l_local, trainable=True)
        network_params.append(Wyy)

        print len(network_params)
        f = open(
            'ccctag_BiLSTM_CNN_CRF_num_filters_30_dropout_1_LearningRate_0.01_0.0_400_emb_1_tagversoin_2.pickle',
            'r')
        data = pickle.load(f)
        f.close()

        for idx, p in enumerate(network_params):

            p.set_value(data[idx])

        self.params = []
        self.hos = []
        self.Cos = []
        self.encoder_lstm_layers = []
        self.decoder_lstm_layers = []

        ei, di, dt = T.imatrices(3)  #place holders
        decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)
        ci = T.itensor3()

        #### the last one is for the start symbol
        self.de_lookuptable = theano.shared(name="Decoder LookUpTable",
                                            value=init_xavier_uniform(
                                                self.num_labels + 1,
                                                self.de_hidden_size),
                                            borrow=True)

        self.linear = theano.shared(
            name="Linear",
            value=init_xavier_uniform(
                self.de_hidden_size + 2 * self.en_hidden_size,
                self.num_labels),
            borrow=True)
        self.linear_bias = theano.shared(
            name="Hidden to Bias",
            value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                             dtype=theano.config.floatX),
            borrow=True)
        #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True)

        #self.hidden_bias = theano.shared(
        #        name="Hidden to Bias",
        #        value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) ,
        #        borrow=True
        #        )

        input_var_shuffle = input_var.dimshuffle(1, 0)
        mask_var_shuffle = mask_var.dimshuffle(1, 0)
        target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
        target_var_shuffle = target_var.dimshuffle(1, 0)

        self.params += [
            We_inf, self.linear, self.linear_bias, self.de_lookuptable
        ]  #concatenate
        state_below = We_inf[input_var_shuffle.flatten()].reshape(
            (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

        ###### character word embedding
        layer_char_input_inf = lasagne.layers.InputLayer(
            shape=(None, None, Max_Char_Length),
            input_var=char_input_var,
            name='char-input')
        layer_char_inf = lasagne.layers.reshape(layer_char_input_inf,
                                                (-1, [2]))
        layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(
            layer_char_inf,
            input_size=char_dic_size,
            output_size=char_embedd_dim,
            W=char_embedd_table_inf,
            name='char_embedding_inf')

        layer_char_inf = lasagne.layers.DimshuffleLayer(
            layer_char_embedding_inf, pattern=(0, 2, 1))
        #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

        cnn_layer_inf = lasagne.layers.Conv1DLayer(
            layer_char_inf,
            num_filters=num_filters,
            filter_size=conv_window,
            pad='full',
            nonlinearity=lasagne.nonlinearities.tanh,
            name='cnn_inf')

        pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf,
                                                       pool_size=pool_size)
        output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf,
                                                      (-1, length, [1]))
        char_params = lasagne.layers.get_all_params(output_cnn_layer_inf,
                                                    trainable=True)
        self.params += char_params

        ###### [batch, sent_length, num_filters]
        #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf:char_input_var})
        char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)

        char_state_below = dropout_layer(char_state_below, use_dropout, trng)

        char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
        state_below = T.concatenate([state_below, char_state_shuff], axis=2)

        state_below = dropout_layer(state_below, use_dropout, trng)

        enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size)
        enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True)
        self.encoder_lstm_layers.append(enclstm_f)  #append
        self.encoder_lstm_layers.append(enclstm_b)  #append
        self.params += enclstm_f.params + enclstm_b.params  #concatenate

        hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
        hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)

        hs = T.concatenate([hs_f, hs_b], axis=2)
        Cs = T.concatenate([Cs_f, Cs_b], axis=2)

        #hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
        #Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
        #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
        #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
        self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),
        self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                            input_var_shuffle.shape[1], self.de_hidden_size),

        Encoder = hs

        state_below = self.de_lookuptable[
            target_var_in_shuffle.flatten()].reshape(
                (target_var_in_shuffle.shape[0],
                 target_var_in_shuffle.shape[1], self.de_hidden_size))
        for i in range(self.lstm_layers_num):
            declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
            self.decoder_lstm_layers += declstm,  #append
            self.params += declstm.params  #concatenate
            ho, Co = self.hos[i], self.Cos[i]
            state_below, Cs = declstm.forward(state_below, mask_var_shuffle,
                                              ho, Co)

        decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
        linear_outputs = T.dot(decoder_lstm_outputs,
                               self.linear) + self.linear_bias[None, None, :]
        softmax_outputs, updates = theano.scan(
            fn=lambda x: T.nnet.softmax(x),
            sequences=[linear_outputs],
        )

        def _NLL(pred, y, m):
            return -m * T.log(pred[T.arange(input_var.shape[0]), y])

        def _step2(ctx_, state_, hs_, Cs_):
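            # One greedy decoding step for the inference network: take the
            # argmax of the previous softmax output, feed its embedding through
            # the decoder LSTM stack together with the encoder context ctx_,
            # and return the new label distribution (padded with a zero column
            # to match the decoder input width) plus the updated LSTM states.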

            hs, Cs = [], []
            token_idxs = T.cast(state_.argmax(axis=-1), "int32")
            msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.)
            msk_ = msk_.dimshuffle('x', 0)
            state_below0 = self.de_lookuptable[token_idxs].reshape(
                (1, input_var_shuffle.shape[1], self.de_hidden_size))
            for i, lstm in enumerate(self.decoder_lstm_layers):
                h, C = lstm.forward(state_below0, msk_, hs_[i],
                                    Cs_[i])  #mind msk
                hs += h[-1],
                Cs += C[-1],
                state_below0 = h

            hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
            state_below0 = state_below0.reshape(
                (input_var.shape[0], self.de_hidden_size))
            state_below0 = T.concatenate([ctx_, state_below0], axis=1)

            newpred = T.dot(state_below0, self.linear).reshape(
                (input_var_shuffle.shape[1],
                 self.num_labels)) + self.linear_bias[None, :]
            state_below = T.nnet.softmax(newpred)

            extra_p = T.zeros_like(hs[:, :, 0])
            state_below = T.concatenate([state_below, extra_p.T], axis=1)

            return state_below, hs, Cs

        hs0, Cs0 = T.as_tensor_variable(
            self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
        train_outputs, _ = theano.scan(fn=_step2,
                                       sequences=[Encoder],
                                       outputs_info=[decoderInputs0, hs0, Cs0],
                                       n_steps=input_var_shuffle.shape[0])

        predy = train_outputs[0].dimshuffle(1, 0, 2)
        predy = predy[:, :, :-1] * mask_var[:, :, None]
        predy0 = predy.reshape((-1, self.num_labels))

        def inner_function(targets_one_step, mask_one_step, prev_label,
                           tg_energy):
            """
                        :param targets_one_step: [batch_size, t]
                        :param prev_label: [batch_size, t]
                        :param tg_energy: [batch_size]
                        :return:
                        """
            new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
            new_ta_energy_t = tg_energy + T.sum(
                new_ta_energy * targets_one_step, axis=1)
            tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)

            return [targets_one_step, tg_energy_t]

        local_energy = lasagne.layers.get_output(
            l_local, {
                l_in_word: input_var,
                l_mask_word: mask_var,
                layer_char_input: char_input_var
            })
        local_energy = local_energy.reshape((-1, length, self.num_labels))
        local_energy = local_energy * mask_var[:, :, None]

        #####################
        # for the end symbol of a sequence
        ####################

        end_term = Wyy[:-1, -1]
        local_energy = local_energy + end_term.dimshuffle(
            'x', 'x', 0) * mask_var1[:, :, None]

        #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var})

        predy_in = T.argmax(predy0, axis=1)
        A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
        A = A.reshape((-1, length, self.num_labels))

        #predy = predy0.reshape((-1, length, 25))
        #predy = predy*mask_var[:,:,None]

        targets_shuffled = predy.dimshuffle(1, 0, 2)
        target_time0 = targets_shuffled[0]

        masks_shuffled = mask_var.dimshuffle(1, 0)

        initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

        initials = [target_time0, initial_energy0]
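        # This scan accumulates, per sequence, the pairwise transition energy
        # sum_t y_{t-1}^T Wyy[:-1, :-1] y_t over the (soft) predicted labels,
        # skipping padded positions via the mask; adding the masked emission
        # scores from local_energy yields the full sequence energy cost11.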
        [_, target_energies], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials,
            sequences=[targets_shuffled[1:], masks_shuffled[1:]])
        cost11 = target_energies[-1] + T.sum(
            T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

        # compute the ground-truth energy

        targets_shuffled0 = A.dimshuffle(1, 0, 2)
        target_time00 = targets_shuffled0[0]

        initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])

        initials0 = [target_time00, initial_energy00]
        [_, target_energies0], _ = theano.scan(
            fn=inner_function,
            outputs_info=initials0,
            sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
        cost110 = target_energies0[-1] + T.sum(
            T.sum(local_energy * A, axis=2) * mask_var, axis=1)

        #predy_f =  predy.reshape((-1, 25))
        y_f = target_var.flatten()

        if (params.annealing == 0):
            lamb = params.L3
        elif (params.annealing == 1):
            lamb = params.L3 * (1 - 0.01 * t_t)

        if (params.regutype == 0):
            ce_hinge = lasagne.objectives.categorical_crossentropy(
                predy0 + eps, y_f)
            ce_hinge = ce_hinge.reshape((-1, length))
            ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
            cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
        else:

            entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
            entropy_term = entropy_term.reshape((-1, length))
            entropy_term = T.sum(entropy_term * mask_var, axis=1)
            cost = T.mean(-cost11) - lamb * T.mean(entropy_term)
        """
		f = open('F0_simple.pickle')
                PARA = pickle.load(f)
                f.close()
                l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params))


                cost = T.mean(-cost11) + params.L2*l2_term
		"""

        ##from adam import adam
        ##updates_a = adam(cost, self.params, params.eta)
        #updates_a = lasagne.updates.sgd(cost, self.params, params.eta)
        #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9)
        from momentum import momentum
        updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

        if (params.regutype == 0):
            self.train_fn = theano.function(
                inputs=[ei, ci, dt, em, em1, length0, t_t0, di0, use_dropout0],
                outputs=[cost, ce_hinge],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    char_input_var: ci,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0,
                    use_dropout: use_dropout0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore')
        else:

            self.train_fn = theano.function(
                inputs=[ei, ci, dt, em, em1, length0, t_t0, di0, use_dropout0],
                outputs=[cost, entropy_term],
                updates=updates_a,
                on_unused_input='ignore',
                givens={
                    input_var: ei,
                    char_input_var: ci,
                    target_var: dt,
                    mask_var: em,
                    mask_var1: em1,
                    length: length0,
                    t_t: t_t0,
                    decoderInputs0: di0,
                    use_dropout: use_dropout0
                })
            #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore')

        prediction = T.argmax(predy, axis=2)
        corr = T.eq(prediction, target_var)
        corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
        num_tokens = mask_var.sum(dtype=theano.config.floatX)

        self.eval_fn = theano.function(
            inputs=[ei, ci, dt, em, em1, length0, di0, use_dropout0],
            outputs=[cost11, cost110, corr_train, num_tokens, prediction],
            on_unused_input='ignore',
            givens={
                input_var: ei,
                char_input_var: ci,
                target_var: dt,
                mask_var: em,
                mask_var1: em1,
                length: length0,
                decoderInputs0: di0,
                use_dropout: use_dropout0
            })
Example #48
0
# at work in In(y, value=1). In the case of In(w, value=2, name='w_by_name'),
# we override the symbolic variable's name attribute with a name to be used
# for this function.

# 4. Using Shared Variables

# It is also possible to make a function with an internal state. For
# example, let's say we want to make an accumulator: at the beginning,
# the state is initialized to zero. Then, on each function call, the state
# is incremented by the function's argument.

# First let's define the accumulator function. It adds its argument to the
# internal state, and returns the old state value.
from theano import shared
state = shared(0)
inc = T.iscalar('inc')
accumulator = function([inc], state, updates=[(state, state + inc)])
# This code introduces a few concepts. The shared function constructs so-
# called shared variables. These are hybrid symbolic and non-symbolic
# variables whose value may be shared between multiple functions. Shared
# variables can be used in symbolic expressions just like the objects
# returned by dmatrices(...) but they also have an internal value that
# defines the value taken by this symbolic variable in all the functions
# that use it. It is called a shared variable because its value is shared
# between many functions. The value can be accessed and modified by the
# .get_value() and .set_value() methods. We will come back to this soon.

# The other new thing in this code is the updates parameter of function.
# updates must be supplied with a list of pairs of the form (shared-
# variable, new expression). It can also be a dictionary whose keys are
# shared-variables and values are the new expression. Either way, it means
# "whenever this function runs, it will replace the .value of each shared
Example #49
0
    def __init__(self, nh, nc, ne, de, cs, em, init, featdim):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        em :: pre-trained embedding vectors, used to initialise rows of the
              embedding matrix when init is True
        init :: whether to initialise the embeddings from em
        featdim :: dimensionality of the extra per-word features concatenated
                   to the hidden state before the output layer
        '''
        # parameters of the model

        self.featdim = featdim

        tmp_emb = 0.2 * numpy.random.uniform(-1.0, 1.0, (ne + 1, de))
        if init:
            for row in xrange(ne + 1):
                if em[row] is not None:
                    tmp_emb[row] = em[row]

        self.emb = theano.shared(tmp_emb.astype(
            theano.config.floatX))  # add one for PADDING at the end

        # weights for LSTM
        n_in = de * cs
        print "de,cs", de, cs
        # print  "n_i",n_i
        n_hidden = n_i = n_c = n_o = n_f = nh
        n_y = nc
        print "n_y", n_y
        print "n_hidden, n_i, n_c, n_o,nh", n_hidden, n_i, n_c, n_o, nh

        self.W_xi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_i)).astype(dtype))
        self.W_hi = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_i)).astype(dtype))
        self.W_ci = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_i)).astype(dtype))
        self.b_i = theano.shared(numpy.cast[dtype](uniform(-0.5, .5,
                                                           size=n_i)))
        self.W_xf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_f)).astype(dtype))
        self.W_hf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_f)).astype(dtype))
        self.W_cf = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_f)).astype(dtype))
        self.b_f = theano.shared(numpy.cast[dtype](uniform(0, 1., size=n_f)))
        self.W_xc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_c)).astype(dtype))
        self.W_hc = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_c)).astype(dtype))
        self.b_c = theano.shared(numpy.zeros(n_c, dtype=dtype))
        self.W_xo = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_in, n_o)).astype(dtype))
        self.W_ho = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_hidden, n_o)).astype(dtype))
        self.W_co = theano.shared(0.2 * uniform(-1.0, 1.0,
                                                (n_c, n_o)).astype(dtype))
        self.b_o = theano.shared(numpy.cast[dtype](uniform(-0.5, .5,
                                                           size=n_o)))
        self.W_hy = theano.shared(
            0.2 * uniform(-1.0, 1.0, (n_hidden + featdim, n_y)).astype(dtype))
        self.b_y = theano.shared(numpy.zeros(n_y, dtype=dtype))

        self.c0 = theano.shared(numpy.zeros(n_hidden, dtype=dtype))
        self.h0 = T.tanh(self.c0)

        # bundle weights
        self.params = [self.emb, self.W_xi, self.W_hi, self.W_ci, self.b_i, self.W_xf, self.W_hf, \
                       self.W_cf, self.b_f, self.W_xc, self.W_hc, self.b_c, self.W_xo, self.W_ho, \
                       self.W_co, self.b_o, self.W_hy, self.b_y, self.c0]
        self.names  = ['embeddings', 'W_xi', 'W_hi', 'W_ci', 'b_i', 'W_xf', 'W_hf', 'W_cf', 'b_f', \
                       'W_xc', 'W_hc', 'b_c', 'W_xo', 'W_ho', 'W_co', 'b_o', 'W_hy', 'b_y', 'c0']

        # as many columns as the context window size, as many rows as words in the sentence
        idxs = T.imatrix()
        # print idxs.shape()
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        # print type(x), x.shape(), "details of x"
        f = T.matrix('f')
        f.reshape((idxs.shape[0], featdim))
        # print type(f), f.shape(), "details of f"
        y = T.iscalar('y')  # label

        # print type(y), y.shape(), "details of y"

        def recurrence(x_t, feat_t, h_tm1, c_tm1):
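            # One peephole-LSTM step over the sentence: the input, forget and
            # output gates see the (previous or current) cell state, the new
            # hidden state is concatenated with the hand-crafted features
            # feat_t when featdim > 0, and a softmax over classes is emitted
            # at every position.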
            i_t = sigma(
                theano.dot(x_t, self.W_xi) + theano.dot(h_tm1, self.W_hi) +
                theano.dot(c_tm1, self.W_ci) + self.b_i)
            f_t = sigma(
                theano.dot(x_t, self.W_xf) + theano.dot(h_tm1, self.W_hf) +
                theano.dot(c_tm1, self.W_cf) + self.b_f)
            c_t = f_t * c_tm1 + i_t * T.tanh(
                theano.dot(x_t, self.W_xc) + theano.dot(h_tm1, self.W_hc) +
                self.b_c)
            o_t = sigma(
                theano.dot(x_t, self.W_xo) + theano.dot(h_tm1, self.W_ho) +
                theano.dot(c_t, self.W_co) + self.b_o)
            h_t = o_t * T.tanh(c_t)

            if self.featdim > 0:
                all_t = T.concatenate([h_t, feat_t])
            else:
                all_t = h_t

            # print "all_t", type(all_t), T.shape(all_t)
            s_t = softmax(theano.dot(all_t, self.W_hy) + self.b_y)
            # print T.shape(h_t), T.shape(c_t), T.shape(s_t)
            return [h_t, c_t, s_t]

        # Initialization occurs in outputs_info
        # scan gives -- result, updates
        [h, _, s], _ = theano.scan(fn=recurrence,
                                   sequences=[x, f],
                                   outputs_info=[self.h0, self.c0, None],
                                   n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.mean(T.log(p_y_given_x_lastword)[y])
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs, f], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, f, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function(
            inputs=[],
            updates={
                self.emb:
                self.emb / T.sqrt(
                    (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
            })
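
        # A hypothetical training-loop sketch (the class name LstmTagger, the
        # sizes and the data layout are assumptions, not part of this snippet):
        # `idxs` is an int32 matrix of context windows (one row per word),
        # `feats` a float matrix of shape (n_words, featdim), and `label` the
        # integer class of the sentence's last word.
        #
        #   rnn = LstmTagger(nh=100, nc=5, ne=10000, de=50, cs=7,
        #                    em=pretrained_embeddings, init=True, featdim=20)
        #   for idxs, feats, label in training_data:
        #       cost = rnn.train(idxs, feats, label, 0.01)
        #       rnn.normalize()
        #   predictions = rnn.classify(idxs, feats)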
Example #50
0
    def test_grad_h(self):

        "tests that the gradients with respect to h_i are 0 after doing a mean field update of h_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V=X)
        init_Mu1 = e_step.init_S_hat(V=X)

        prev_setting = config.compute_test_value
        config.compute_test_value = 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(
            -5., 5., Mu1.shape))

        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        new_H = e_step.infer_H_hat(V=X, H_hat=H_var, S_hat=Mu1_var)
        h_idx = new_H[:, idx]

        updates_func = function([H_var, Mu1_var, idx], h_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var,  var_s0_hat = sigma0,
                             var_s1_hat = Sigma1)

        grad_H = T.grad(trunc_kl.sum(), H_var)

        assert len(grad_H.type.broadcastable) == 2

        #from theano.printing import min_informative_str
        #print min_informative_str(grad_H)

        #grad_H = Print('grad_H')(grad_H)

        #grad_H_idx = grad_H[:,idx]

        grad_func = function([H_var, Mu1_var], grad_H)

        failed = False

        for i in xrange(self.N):
            rval = updates_func(H, Mu1, i)
            H[:, i] = rval

            g = grad_func(H, Mu1)[:, i]

            assert not contains_nan(g)

            g_abs_max = np.abs(g).max()

            if g_abs_max > self.tol:
                #print "new values of H"
                #print H[:,i]
                #print "gradient on new values of H"
                #print g

                failed = True

                print 'iteration ', i
                #print 'max value of new H: ',H[:,i].max()
                #print 'H for failing g: '
                failing_h = H[np.abs(g) > self.tol, i]
                #print failing_h

                #from matplotlib import pyplot as plt
                #plt.scatter(H[:,i],g)
                #plt.show()

                #ignore failures extremely close to h=0 or h=1

                high_mask = failing_h > .001
                low_mask = failing_h < .999

                mask = high_mask * low_mask

                print 'masked failures: ', mask.shape[0], ' err ', g_abs_max

                if mask.sum() > 0:
                    print 'failing h passing the range mask'
                    print failing_h[mask.astype(bool)]
                    raise Exception(
                        'after mean field step, gradient of kl divergence'
                        ' wrt freshly updated variational parameter should be 0, '
                        'but here the max magnitude of a gradient element is '
                        + str(g_abs_max) + ' after updating h_' + str(i))
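
A note on the "truncated KL" used above; this restatement is a sketch of the standard mean-field argument, not text taken from the source. Dropping the terms that do not depend on the variational parameters,

    \mathrm{KL}(q \,\|\, p) \;\approx\; -\,\mathbb{H}_q[h, s] \;+\; \mathbb{E}_q\!\left[E(v, h, s)\right] \;+\; \text{const},

which is exactly the trunc_kl expression built from entropy_hs and expected_energy_vhs. Each coordinate update h_i <- infer_H_hat(...)[:, i] minimizes this objective over h_i, so for interior solutions the first-order condition dKL/dh_i = 0 holds at the freshly updated value; that is what the assertion checks, and it is also why failures pinned near h = 0 or h = 1 are masked out.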
    def __init__(self,
                 word_dim,
                 hidden_dim=5,
                 Nclass=4,
                 degree=2,
                 momentum=0.9,
                 trainable_embeddings=True,
                 labels_on_nonroot_nodes=False,
                 irregular_tree=True):
        assert word_dim > 1 and hidden_dim > 1
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.Nclass = Nclass
        self.degree = degree
        #self.learning_rate = learning_rate
        self.momentum = momentum
        self.irregular_tree = irregular_tree

        self.params = []

        #self.x = T.ivector(name='x')  # word indices
        #self.x_word = T.matrix(dtype=theano.config.floatX)  # word frequencies
        self.x_word = T.matrix(name='x_word')  # word frequencies
        self.x_index = T.imatrix(name='x_index')  # word indices
        self.tree = T.imatrix(name='tree')  # shape [None, self.degree]
        self.y = T.ivector(name='y')  # output shape [self.output_dim]
        self.num_parent = T.iscalar(name='num_parent')
        self.num_nodes = self.x_word.shape[
            0]  # total number of nodes (leaves + internal) in tree
        self.num_child = self.num_nodes - self.num_parent - 1
        #emb_x = self.embeddings[self.x]
        #emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x')  # zero-out non-existent embeddings

        self.tree_states = self.compute_tree(self.x_word, self.x_index,
                                             self.num_parent, self.tree)
        #self.final_state = self.tree_states.mean(axis=0)#self.tree_states[-1]
        #self.final_state = pool_2d(input=self.tree_states, ds=(self.num_child,1), ignore_border=True,mode='max')
        self.final_state = self.tree_states.max(axis=0)
        self.output_fn = self.create_output_fn()
        self.pred_y = self.output_fn(self.final_state)
        self.loss = self.loss_fn(self.y, self.pred_y)

        self.learning_rate = T.scalar('learning_rate')
        #updates = self.gradient_descent(self.loss, self.learning_rate)
        train_inputs = [
            self.x_word, self.x_index, self.num_parent, self.tree, self.y,
            self.learning_rate
        ]
        updates = self.gradient_descent(self.loss)

        #train_inputs = [self.x_word, self.x_index, self.tree, self.y]
        self._train = theano.function(train_inputs, [self.loss, self.pred_y],
                                      updates=updates)

        self._evaluate = theano.function(
            [self.x_word, self.x_index, self.num_parent, self.tree],
            self.final_state)
        self._evaluate2 = theano.function(
            [self.x_word, self.x_index, self.num_parent, self.tree],
            self.tree_states)
        #self._state = theano.function([self.x_word, self.x_index, self.num_child, self.tree], self.tree_states)

        self._predict = theano.function(
            [self.x_word, self.x_index, self.num_parent, self.tree],
            self.pred_y)

        self.tree_states_test = self.compute_tree_test(self.x_word,
                                                       self.x_index, self.tree)
        self._evaluate3 = theano.function(
            [self.x_word, self.x_index, self.tree], self.tree_states_test)
Example #52
def main(n_iter, n_batch, n_hidden, time_steps, learning_rate, savefile,
         scale_penalty, use_scale, reload_progress, model, n_hidden_lstm,
         n_gru_lr_proj, initial_b_u):

    np.random.seed(1234)
    #import pdb; pdb.set_trace()
    # --- Set optimization params --------

    # --- Set data params ----------------
    n_input = 1
    n_output = 10
    ##### MNIST processing ################################################

    # load and preprocess the data
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = cPickle.load(
        gzip.open("mnist.pkl.gz", 'rb'))
    n_data = train_x.shape[0]
    num_batches = n_data / n_batch

    # shuffle data order
    inds = range(n_data)
    np.random.shuffle(inds)
    train_x = np.ascontiguousarray(train_x[inds, :time_steps])
    train_y = np.ascontiguousarray(train_y[inds])
    n_data_valid = valid_x.shape[0]
    inds_valid = range(n_data_valid)
    np.random.shuffle(inds_valid)
    valid_x = np.ascontiguousarray(valid_x[inds_valid, :time_steps])
    valid_y = np.ascontiguousarray(valid_y[inds_valid])

    # reshape x
    train_x = np.reshape(train_x.T, (time_steps, n_data, 1))
    valid_x = np.reshape(valid_x.T, (time_steps, valid_x.shape[0], 1))

    # change y to one-hot encoding
    temp = np.zeros((n_data, n_output))
    # import pdb; pdb.set_trace()
    temp[np.arange(n_data), train_y] = 1
    train_y = temp.astype('float32')

    temp = np.zeros((n_data_valid, n_output))
    temp[np.arange(n_data_valid), valid_y] = 1
    valid_y = temp.astype('float32')

    # Random permutation of pixels
    P = np.random.permutation(time_steps)
    train_x = train_x[P, :, :]
    valid_x = valid_x[P, :, :]

    #######################################################################

    # --- Compile theano graph and gradients

    gradient_clipping = np.float32(1)
    if (model == 'LSTM'):
        #inputs, parameters, costs = LSTM(n_input, n_hidden_LSTM, n_output)
        inputs, parameters, costs = LSTM(n_input,
                                         n_hidden_lstm,
                                         n_output,
                                         initial_b_f=initial_b_u)

    #by AnvaMiba
    elif (model == 'GRU'):
        inputs, parameters, costs = GRU(n_input,
                                        n_hidden_lstm,
                                        n_output,
                                        initial_b_u=initial_b_u)

    #by AnvaMiba
    elif (model == 'GRU_LR'):
        inputs, parameters, costs = GRU_LR(n_input,
                                           n_hidden_lstm,
                                           n_output,
                                           n_gru_lr_proj,
                                           initial_b_u=initial_b_u)

    elif (model == 'complex_RNN'):
        gradient_clipping = np.float32(100000)
        inputs, parameters, costs = complex_RNN(n_input, n_hidden, n_output,
                                                scale_penalty)
    elif (model == 'complex_RNN_LSTM'):
        inputs, parameters, costs = complex_RNN_LSTM(n_input, n_hidden,
                                                     n_hidden_lstm, n_output,
                                                     scale_penalty)
    elif (model == 'IRNN'):
        inputs, parameters, costs = IRNN(n_input, n_hidden, n_output)
    elif (model == 'RNN'):
        inputs, parameters, costs = RNN(n_input, n_hidden, n_output)
    else:
        print >> sys.stderr, "Unsuported model:", model
        return

    gradients = T.grad(costs[0], parameters)

    #   GRADIENT CLIPPING
    gradients = gradients[:7] + [
        T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients[7:]
    ]

    s_train_x = theano.shared(train_x)
    s_train_y = theano.shared(train_y)

    s_valid_x = theano.shared(valid_x)
    s_valid_y = theano.shared(valid_y)

    # --- Compile theano functions --------------------------------------------------

    index = T.iscalar('i')

    updates, rmsprop = rms_prop(learning_rate, parameters, gradients)

    givens = {
        inputs[0]: s_train_x[:, n_batch * index:n_batch * (index + 1), :],
        inputs[1]: s_train_y[n_batch * index:n_batch * (index + 1), :]
    }

    givens_valid = {inputs[0]: s_valid_x, inputs[1]: s_valid_y}

    train = theano.function([index], [costs[0], costs[2]],
                            givens=givens,
                            updates=updates)
    valid = theano.function([], [costs[1], costs[2]], givens=givens_valid)

    #import pdb; pdb.set_trace()

    # --- Training Loop ---------------------------------------------------------------
    train_loss = []
    test_loss = []
    test_acc = []
    best_params = [p.get_value() for p in parameters]
    best_test_loss = 1e6
    for i in xrange(n_iter):
        #   pdb.set_trace()

        [cross_entropy, acc] = train(i % num_batches)
        train_loss.append(cross_entropy)
        print >> sys.stderr, "Iteration:", i
        print >> sys.stderr, "cross_entropy:", cross_entropy
        print >> sys.stderr, "accurracy", acc * 100
        print >> sys.stderr, ''

        #if (i % 100==0):
        if (i % 300 == 0):
            [valid_cross_entropy, valid_acc] = valid()
            print >> sys.stderr, ''
            print >> sys.stderr, "VALIDATION"
            print >> sys.stderr, "cross_entropy:", valid_cross_entropy
            print >> sys.stderr, "accurracy", valid_acc * 100
            print >> sys.stderr, ''
            test_loss.append(valid_cross_entropy)
            test_acc.append(valid_acc)

            if valid_cross_entropy < best_test_loss:
                print >> sys.stderr, "NEW BEST!"
                best_params = [p.get_value() for p in parameters]
                best_test_loss = valid_cross_entropy

            save_vals = {
                'parameters': [p.get_value() for p in parameters],
                'rmsprop': [r.get_value() for r in rmsprop],
                'train_loss': train_loss,
                'test_loss': test_loss,
                'best_params': best_params,
                'test_acc': test_acc,
                'best_test_loss': best_test_loss
            }

            cPickle.dump(save_vals, file(savefile, 'wb'),
                         cPickle.HIGHEST_PROTOCOL)
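
The train function above relies on the standard Theano idiom of keeping the whole data set in shared variables and selecting one minibatch with `givens` keyed on a symbolic index. A minimal sketch of just that idiom follows; the array shape, the batch size, and all of the names are illustrative, not taken from the example.

import numpy as np
import theano
import theano.tensor as T

n_batch = 16
# (time_steps, n_examples, n_features), matching the layout used above
data = np.random.randn(20, 128, 1).astype(theano.config.floatX)
s_data = theano.shared(data)

x = T.tensor3('x')
i = T.iscalar('i')

# any expression of x works here; the mean is just a stand-in for the cost
batch_mean = x.mean()
f = theano.function([i], batch_mean,
                    givens={x: s_data[:, n_batch * i:n_batch * (i + 1), :]})
print(f(0))  # mean of the first minibatch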
Example #53
    def build(self,
              dropout,
              char_dim,
              char_hidden_dim,
              char_bidirect,
              word_dim,
              word_hidden_dim,
              word_bidirect,
              tagger_hidden_dim,
              hamming_cost,
              L2_reg,
              lr_method,
              pre_word_emb,
              pre_char_emb,
              tagger,
              use_gaze,
              POS,
              # plot_cost,
              #cap_dim,
              training=True,
              **kwargs
              ):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_chars = len(self.id_to_char)
        n_tags = len(self.id_to_tag)
        # n_pos = len(self.id_to_pos) + 1

        # Number of capitalization features
        #if cap_dim:
        #    n_cap = 4

        # Network variables
        is_train = T.iscalar('is_train')  # declare integer scalar variable is_train
        word_ids = T.ivector(name='word_ids')  # integer 1-D vector of word ids
        char_for_ids = T.imatrix(name='char_for_ids')  # integer 2-D matrix
        char_rev_ids = T.imatrix(name='char_rev_ids')
        char_pos_ids = T.ivector(name='char_pos_ids')
        if use_gaze:
            gaze = T.imatrix(name='gaze')
        if POS:
            # pos_ids = T.ivector(name='pos_ids')
            pos_one_hot = T.imatrix(name= 'pos_one_hot')
        #hamming_cost = T.matrix('hamming_cost', theano.config.floatX)  # 2-D matrix
        tag_ids = T.ivector(name='tag_ids')
        #if cap_dim:
        #    cap_ids = T.ivector(name='cap_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]  # number of words in the sentence

        # Final input (all word features)
        input_dim = 0
        inputs = []
        L2_norm = 0.0

        theano.config.compute_test_value = 'off'
        #
        # Word inputs
        #
        if word_dim:
            print("word_dim:", word_dim)
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)
            # Initialize with pretrained embeddings
            if pre_word_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained word embeddings from %s...' % pre_word_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_word_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid word embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        new_weights[i] = pretrained[word]
                        c_found += 1
                    elif word.lower() in pretrained:
                        new_weights[i] = pretrained[word.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', word) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', word)
                        ]
                        c_zeros += 1
                word_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained word embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) words have been initialized with '
                       'pretrained word embeddings.') % (
                            c_found + c_lower + c_zeros, n_words,
                            100. * (c_found + c_lower + c_zeros) / n_words
                      )
                print ('%i found directly, %i after lowercasing + zero.') % (c_found, c_lower + c_zeros)
            L2_norm += (word_layer.embeddings ** 2).sum()


        #
        # Chars inputs
        #
        if char_dim:
            print("char_dim:", char_dim)
            input_dim += char_dim
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            char_for_input = char_layer.link(char_for_ids)
        
            # Initialize with pretrained char embeddings
            if pre_char_emb and training:
                new_weights = char_layer.embeddings.get_value()
                print 'Loading pretrained char embeddings from %s...' % pre_char_emb
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_char_emb, 'r', 'utf-8', 'ignore')):
                    line = line.rstrip().split()
                    if len(line) == char_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]
                        ).astype(np.float32)
                    else:
                        emb_invalid += 1
                if emb_invalid > 0:
                    print 'WARNING: %i invalid char embedding lines' % emb_invalid
                c_found = 0
                c_lower = 0
                c_zeros = 0
                # Lookup table initialization
                for i in xrange(n_chars):
                    char = self.id_to_char[i]
                    if char in pretrained:
                        new_weights[i] = pretrained[char]
                        c_found += 1
                    elif char.lower() in pretrained:
                        new_weights[i] = pretrained[char.lower()]
                        c_lower += 1
                    elif re.sub('\d', '0', char) in pretrained:
                        new_weights[i] = pretrained[
                            re.sub('\d', '0', char)
                        ]
                        c_zeros += 1
                char_layer.embeddings.set_value(new_weights)
                print 'Loaded %i pretrained char embeddings.' % len(pretrained)
                print ('%i / %i (%.4f%%) chars have been initialized with '
                       'pretrained char embeddings.') % (
                            c_found + c_lower + c_zeros, n_chars,
                            100. * (c_found + c_lower + c_zeros) / n_chars
                      )
                print ('%i found directly, %i after lowercasing + zero.') % (c_found, c_lower + c_zeros)
            L2_norm += (char_layer.embeddings ** 2).sum()

            wc_layer = CW_EmbeddingLayer(char_dim, word_dim + char_dim, bias= True, name= 'wc_layer')
            wc_comp_input = wc_layer.link(char_for_input, word_input)

            for param in wc_layer.params:
                L2_norm += (param ** 2).sum()

            print(word_input.ndim)
            print(wc_comp_input.ndim)
            # new_word_input, _ = theano.scan(lambda x_t, y_t: T.max([x_t, y_t], axis= 0), sequences= [word_input, wc_comp_input], n_steps= word_input.shape[0])
            # print(new_word_input.ndim)
            inputs.append(wc_comp_input)
            
        # if POS:
            # pos_dim = 20
            # input_dim += pos_dim
            # pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer')
            # pos_input = pos_layer.link(pos_ids)
            # inputs.append(pos_input)
            # L2_norm += (pos_layer.embeddings ** 2).sum()


        #if len(inputs) != 1:
        inputs = T.concatenate(inputs, axis= 1)

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)  # conditional: dropout at train time, rescaling at test time

        # if POS:
        #     inputs = T.concatenate([inputs, pos_one_hot], axis= 1)
        #     input_dim += 6

        # LSTM for words
        print("input_dim:", input_dim)
        print("word_hidden_dim:", word_hidden_dim)
        word_lstm_for = LSTM(input_dim, word_hidden_dim, with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim, word_hidden_dim, with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)          # word order: I like dog
        word_lstm_rev.link(inputs[::-1, :]) # word order: dog like I
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]

        for param in word_lstm_for.params[:8]:
            L2_norm += (param ** 2).sum()

        if word_bidirect:
            final_output = T.concatenate(
                [word_for_output, word_rev_output],
                axis=1
            )

            tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim,
                                     name='tanh_layer', activation='tanh')
            final_output = tanh_layer.link(final_output)
            for param in word_lstm_rev.params[:8]:
                L2_norm += (param ** 2).sum()

        else:
            final_output = word_for_output


        dims = word_hidden_dim
        if use_gaze:
            final_output = T.concatenate([final_output, gaze], axis= 1)
            dims = word_hidden_dim + n_tags


        if POS:
            final_output = T.concatenate([final_output, pos_one_hot], axis=1)
            dims += 6




        # if word_bidirect:
        #     final_output = T.concatenate(
        #         [word_for_output, word_rev_output],
        #         axis=1
        #     )
        #     tanh_layer = HiddenLayer(2 * word_hidden_dim, word_hidden_dim,
        #                              name='tanh_layer', activation='tanh')
        #     final_output = tanh_layer.link(final_output)
        # else:
        #     final_output = word_for_output

        # Sentence to Named Entity tags
        ## final_layer = HiddenLayer(dims, n_tags, name='final_layer',
        ##                           activation=(None if crf else 'softmax'))
        # final_layer = HiddenLayer(word_hidden_dim, n_tags, name='final_layer',
        #                           activation=(None if crf else 'softmax'))
        ## tags_scores = final_layer.link(final_output)
        ## L2_norm += (final_layer.params[0] ** 2).sum()

        # No CRF
        if tagger == 'lstm':
            tagger_layer = LSTM_d(dims, tagger_hidden_dim, with_batch= False, name='LSTM_d')
            tagger_layer.link(final_output)
            final_output = tagger_layer.t

            dims = tagger_hidden_dim

            for param in tagger_layer.params[:8]:
                L2_norm += (param ** 2).sum()

        final_layer = HiddenLayer(dims, n_tags, name='final_layer',
                                  activation=(None if tagger == 'crf' else 'softmax'))
        tags_scores = final_layer.link(final_output)
        L2_norm += (final_layer.params[0] ** 2).sum()

        if tagger != 'crf':
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')

            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))],
                axis=1
            )
            observations = T.concatenate(
                [b_s, observations, e_s],
                axis=0
            )

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()  # sum of the corresponding elements of P

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()  # sum of the corresponding elements of A
            all_paths_scores = forward(observations, transitions, hamming_cost=hamming_cost, n_tags=n_tags, padded_tags_ids=padded_tags_ids)
            L2_norm += (transitions ** 2).sum()
            cost = - (real_path_score - all_paths_scores) + L2_reg * L2_norm
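            # Gloss on the cost above: real_path_score is the linear-chain CRF
            # score of the gold tag sequence, sum_i P[i, y_i] + sum_i A[y_i, y_{i+1}]
            # over the padded sequence; all_paths_scores is the log-sum-exp of the
            # scores of all tag sequences (computed by forward); and the cost is the
            # negative log-likelihood -(real - all) plus the L2 penalty.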

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if char_dim:
            self.add_component(wc_layer)
            params.extend(wc_layer.params)

        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)

        # if POS:
        #     self.add_component(pos_layer)
        #     params.extend(pos_layer.params)

        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)


        self.add_component(final_layer)
        params.extend(final_layer.params)

        if tagger == 'lstm':
            self.add_component(tagger_layer)
            params.extend(tagger_layer.params)
        elif tagger == 'crf':
            self.add_component(transitions)
            params.append(transitions)

        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)



        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if use_gaze:
            eval_inputs.append(gaze)
        if POS:
            # eval_inputs.append(pos_ids)
            eval_inputs.append(pos_one_hot)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_bidirect:
                eval_inputs.append(char_rev_ids)
            eval_inputs.append(char_pos_ids)
        #if cap_dim:
        #    eval_inputs.append(cap_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}



        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(
                inputs=train_inputs,
                outputs=cost,
                updates=updates,
                givens=({is_train: np.cast['int32'](1)} if dropout else {}),
                on_unused_input='warn'
            )
        else:
            f_train = None

        # if plot_cost:
        #     f_plot_cost = theano.function(
        #         inputs=train_inputs,
        #         outputs=cost,
        #         givens=({is_train: np.cast['int32'](1)} if dropout else {}),
        #         on_unused_input='warn'
        #     )
        # else:
        #     f_plot_cost = None

        # Compile evaluation function
        if tagger != 'crf':
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=tags_scores,
                givens=({is_train: np.cast['int32'](0)} if dropout else {}),
                on_unused_input='warn'
            )
        else:
            f_eval = theano.function(
                inputs=eval_inputs,
                outputs=forward(observations, transitions, hamming_cost= 0, n_tags= None, padded_tags_ids= None, viterbi=True,
                                return_alpha=False, return_best_sequence=True),
                givens=({is_train: np.cast['int32'](0)} if dropout else {}),
                on_unused_input='warn'
            )

        return f_train, f_eval#, f_plot_cost
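
The lr_method parsing block near the end of build() implies a string format of the form "<name>-<param>_<value>-<param>_<value>...". A plain-Python illustration follows; the concrete string 'sgd-lr_0.005' is made up for this example.

lr_method = 'sgd-lr_0.005'
if '-' in lr_method:
    lr_method_name = lr_method[:lr_method.find('-')]
    lr_method_parameters = {}
    for item in lr_method[lr_method.find('-') + 1:].split('-'):
        key, value = item.split('_')
        lr_method_parameters[key] = float(value)
else:
    lr_method_name, lr_method_parameters = lr_method, {}
# lr_method_name == 'sgd', lr_method_parameters == {'lr': 0.005}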
Example #54
0
    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary (e.g. 572)
        de :: dimension of the word embeddings (e.g. 100)
        cs :: word window context size
        '''
        # parameters of the model
        self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
        self.Wx  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (de * cs, nh)).astype(theano.config.floatX))
        self.Wh  = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (nh, nh)).astype(theano.config.floatX))
        self.W   = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,\
                   (nh, nc)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
        self.b = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
        self.h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

        # bundle
        self.params = [
            self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0
        ]
        self.names = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0']
        idxs = T.imatrix(
        )  # as many columns as context window size/lines as words in the sentence
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        y = T.iscalar('y')  # label

        def Relu(x):
            out_dtype = scalar.upgrade_to_float(
                scalar.Scalar(dtype=x.dtype))[0].dtype
            a = T.constant(0.5, dtype=out_dtype)
            # ab = T.constant(abs(x), dtype=out_dtype)
            # x = (x * slope) + shift
            y = (x + abs(x)) * a
            r = T.clip(y, 0, 1)
            return r

        def PRelu(x):
            out_dtype = scalar.upgrade_to_float(
                scalar.Scalar(dtype=x.dtype))[0].dtype
            a = T.constant(0.625, dtype=out_dtype)
            b = T.constant(0.375, dtype=out_dtype)
            # x = (x * slope) + shift
            y = x * a + abs(x) * b
            r = T.clip(y, 0, 1)
            return r

        def my_tanh(x):
            #return 2*T.nnet.sigmoid(2*x)-1
            return T.nnet.sigmoid(x)

        def sigmoid_sigmoid(x):
            return 0.8 * T.nnet.sigmoid(x) + 0.2 * T.nnet.hard_sigmoid(x)

        def recurrence(x_t, h_tm1):
            temp = T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh
            #h_t = T.nnet.hard_sigmoid(temp)  # the t moment output of the hidden layer
            #h_t = T.tanh(temp)

            h_t = T.nnet.sigmoid(temp)

            #h_t=T.nnet.relu(temp,0.2)#relu=T.maximum(0, temp)
            s_t = T.nnet.softmax(
                T.dot(h_t, self.W) +
                self.b)  # the t moment output of the output layer
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence, \
            sequences=x, outputs_info=[self.h0, None], \
            n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)
        #print 'y_pred', y_pred
        #print ' p_y_given_x_sentence', p_y_given_x_sentence

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.log(p_y_given_x_lastword)[y]  #negative log-likelihood(NLL)
        gradients = T.grad(nll, self.params)
        updates = OrderedDict(
            (p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.myclassify = theano.function(inputs=[idxs],
                                          outputs=p_y_given_x_sentence)
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function( inputs = [],
                         updates = {self.emb:\
                         self.emb/T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0,'x')})
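
The source does not show how this class is driven, so the following is only a hedged usage sketch; the class name ElmanRNN, the concrete sizes, and the variable names are assumptions consistent with the docstring and the compiled functions above.

# model = ElmanRNN(nh=100, nc=5, ne=572, de=100, cs=7)   # hypothetical name and sizes
# idxs: int32 matrix of shape (n_words_in_sentence, cs), one row of
#       context-window word indices per word
# y:    integer class label of the last word, in [0, nc)
# nll = model.train(idxs, y, 0.01)   # one SGD step, returns the NLL of the last word
# model.normalize()                  # re-project each embedding row onto the unit sphere
# probs = model.myclassify(idxs)     # per-word class distributions
# preds = model.classify(idxs)       # per-word argmax labels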
Example #55
def main(n_iter, n_batch, n_hidden, time_steps, learning_rate, savefile, scale_penalty, use_scale,
         model, n_hidden_lstm, loss_function):

    #import pdb; pdb.set_trace()
 
    # --- Set optimization params --------
    gradient_clipping = np.float32(50000)

    # --- Set data params ----------------
    n_input = 2
    n_output = 1
  

    # --- Manage data --------------------
    n_train = int(1e5)  # ints so they can be used directly as array shapes below
    n_test = int(1e4)
    num_batches = n_train / n_batch
    
    train_x = np.asarray(np.zeros((time_steps, n_train, 2)),
                         dtype=theano.config.floatX)
    

    train_x[:,:,0] = np.asarray(np.random.uniform(low=0.,
                                                  high=1.,
                                                  size=(time_steps, n_train)),
                                dtype=theano.config.floatX)
    
#    inds = np.asarray([np.random.choice(time_steps, 2, replace=False) for i in xrange(train_x.shape[1])])    
    inds = np.asarray(np.random.randint(time_steps/2, size=(train_x.shape[1],2)))
    inds[:, 1] += time_steps/2  
    
    for i in range(train_x.shape[1]):
        train_x[inds[i, 0], i, 1] = 1.0
        train_x[inds[i, 1], i, 1] = 1.0
 
    train_y = (train_x[:,:,0] * train_x[:,:,1]).sum(axis=0)
    train_y = np.reshape(train_y, (n_train, 1))

    test_x = np.asarray(np.zeros((time_steps, n_test, 2)),
                        dtype=theano.config.floatX)
    

    test_x[:,:,0] = np.asarray(np.random.uniform(low=0.,
                                                 high=1.,
                                                 size=(time_steps, n_test)),
                                dtype=theano.config.floatX)
    
    inds = np.asarray([np.random.choice(time_steps, 2, replace=False) for i in xrange(test_x.shape[1])])    
    for i in range(test_x.shape[1]):
        test_x[inds[i, 0], i, 1] = 1.0
        test_x[inds[i, 1], i, 1] = 1.0
 
   
    test_y = (test_x[:,:,0] * test_x[:,:,1]).sum(axis=0)
    test_y = np.reshape(test_y, (n_test, 1)) 


   #######################################################################

    gradient_clipping = np.float32(1)

    if (model == 'LSTM'):   
        inputs, parameters, costs = LSTM(n_input, n_hidden_lstm, n_output, loss_function=loss_function)
        gradients = T.grad(costs[0], parameters)
        gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients]

    elif (model == 'complex_RNN'):
        inputs, parameters, costs = complex_RNN(n_input, n_hidden, n_output, scale_penalty, loss_function=loss_function)
        if use_scale is False:
            parameters.pop()
        gradients = T.grad(costs[0], parameters)

    elif (model == 'complex_RNN_LSTM'):
        inputs, parameters, costs = complex_RNN_LSTM(n_input, n_hidden, n_hidden_lstm, n_output, scale_penalty, loss_function=loss_function)

    elif (model == 'IRNN'):
        inputs, parameters, costs = IRNN(n_input, n_hidden, n_output, loss_function=loss_function)
        gradients = T.grad(costs[0], parameters)
        gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients]

    elif (model == 'RNN'):
        inputs, parameters, costs = tanhRNN(n_input, n_hidden, n_output, loss_function=loss_function)
        gradients = T.grad(costs[0], parameters)
        gradients = [T.clip(g, -gradient_clipping, gradient_clipping) for g in gradients]

    else:
        print "Unsuported model:", model
        return
 

   




    s_train_x = theano.shared(train_x)
    s_train_y = theano.shared(train_y)

    s_test_x = theano.shared(test_x)
    s_test_y = theano.shared(test_y)


    # --- Compile theano functions --------------------------------------------------

    index = T.iscalar('i')

    updates, rmsprop = rms_prop(learning_rate, parameters, gradients)

    givens = {inputs[0] : s_train_x[:, n_batch * index : n_batch * (index + 1), :],
              inputs[1] : s_train_y[n_batch * index : n_batch * (index + 1), :]}

    givens_test = {inputs[0] : s_test_x,
                   inputs[1] : s_test_y}
    
   
    
    train = theano.function([index], costs[0], givens=givens, updates=updates)
    test = theano.function([], costs[1], givens=givens_test)

    # --- Training Loop ---------------------------------------------------------------

    # f1 = file('/data/lisatmp3/shahamar/adding/complexRNN_400.pkl', 'rb')
    # data1 = cPickle.load(f1)
    # f1.close()
    # train_loss = data1['train_loss']
    # test_loss = data1['test_loss']
    # best_params = data1['best_params']
    # best_test_loss = data1['best_test_loss']

    # for i in xrange(len(parameters)):
    #     parameters[i].set_value(data1['parameters'][i])

    # for i in xrange(len(parameters)):
    #     rmsprop[i].set_value(data1['rmsprop'][i])
    
#    import pdb; pdb.set_trace()

    train_loss = []
    test_loss = []
    best_params = [p.get_value() for p in parameters]
    best_test_loss = 1e6
    for i in xrange(n_iter):
#        start_time = timeit.default_timer()


        if (i % int(num_batches) == 0):  # reshuffle the training set at the start of each epoch
            #import pdb; pdb.set_trace()
            inds = np.random.permutation(int(n_train))
            data_x = s_train_x.get_value()
            s_train_x.set_value(data_x[:,inds,:])
            data_y = s_train_y.get_value()
            s_train_y.set_value(data_y[inds,:])


        mse = train(i % int(num_batches))
        train_loss.append(mse)
        print "Iteration:", i
        print "mse:", mse
        print

        if (i % 50==0):
            mse = test()
            print
            print "TEST"
            print "mse:", mse
            print 
            test_loss.append(mse)

            if mse < best_test_loss:
                best_params = [p.get_value() for p in parameters]
                best_test_loss = mse

            
            save_vals = {'parameters': [p.get_value() for p in parameters],
                         'rmsprop': [r.get_value() for r in rmsprop],
                         'train_loss': train_loss,
                         'test_loss': test_loss,
                         'best_params': best_params,
                         'best_test_loss': best_test_loss,
                         'model': model,
                         'time_steps': time_steps}

            cPickle.dump(save_vals,
                         file(savefile, 'wb'),
                         cPickle.HIGHEST_PROTOCOL)
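
For reference, the data built above is the classic "adding problem": the second input channel marks exactly two time steps, and the target is the sum of the first-channel values at those positions, y = sum_t x[t, 0] * x[t, 1]. Since each marked value is uniform on [0, 1], always predicting the mean of 1.0 yields an MSE of about 1/6, roughly 0.167, which is the usual baseline this task is measured against.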
Example #56
def test_notex_print():

    tt_normalrv_noname_expr = tt.scalar("b") * NormalRV(
        tt.scalar("\\mu"), tt.scalar("\\sigma"))
    expected = textwrap.dedent(r"""
    b in R, \mu in R, \sigma in R
    a ~ N(\mu, \sigma**2) in R
    (b * a)
    """)
    assert tt_pprint(tt_normalrv_noname_expr) == expected.strip()

    # Make sure the constant shape is shown in values and not symbols.
    tt_normalrv_name_expr = tt.scalar("b") * NormalRV(
        tt.scalar("\\mu"), tt.scalar("\\sigma"), size=[2, 1], name="X")
    expected = textwrap.dedent(r"""
    b in R, \mu in R, \sigma in R
    X ~ N(\mu, \sigma**2) in R**(2 x 1)
    (b * X)
    """)
    assert tt_pprint(tt_normalrv_name_expr) == expected.strip()

    tt_2_normalrv_noname_expr = tt.matrix("M") * NormalRV(
        tt.scalar("\\mu_2"), tt.scalar("\\sigma_2"))
    tt_2_normalrv_noname_expr *= tt.scalar("b") * NormalRV(
        tt_2_normalrv_noname_expr, tt.scalar("\\sigma")) + tt.scalar("c")
    expected = textwrap.dedent(r"""
    M in R**(N^M_0 x N^M_1), \mu_2 in R, \sigma_2 in R
    b in R, \sigma in R, c in R
    a ~ N(\mu_2, \sigma_2**2) in R, d ~ N((M * a), \sigma**2) in R**(N^d_0 x N^d_1)
    ((M * a) * ((b * d) + c))
    """)
    assert tt_pprint(tt_2_normalrv_noname_expr) == expected.strip()

    expected = textwrap.dedent(r"""
    b in Z, c in Z, M in R**(N^M_0 x N^M_1)
    M[b, c]
    """)
    # TODO: "c" should be "1".
    assert (tt_pprint(
        tt.matrix("M")[tt.iscalar("a"),
                       tt.constant(1, dtype="int")]) == expected.strip())

    expected = textwrap.dedent(r"""
    M in R**(N^M_0 x N^M_1)
    M[1]
    """)
    assert tt_pprint(tt.matrix("M")[1]) == expected.strip()

    expected = textwrap.dedent(r"""
    M in N**(N^M_0)
    M[2:4:0]
    """)
    assert tt_pprint(tt.vector("M", dtype="uint32")[0:4:2]) == expected.strip()

    norm_rv = NormalRV(tt.scalar("\\mu"), tt.scalar("\\sigma"))
    rv_obs = observed(tt.constant(1.0, dtype=norm_rv.dtype), norm_rv)

    expected = textwrap.dedent(r"""
    \mu in R, \sigma in R
    a ~ N(\mu, \sigma**2) in R
    a = 1.0
        """)
    assert tt_pprint(rv_obs) == expected.strip()
Example #57
    def __init__(self,
                 n_in,
                 hidden_layer_size,
                 n_out,
                 L1_reg,
                 L2_reg,
                 hidden_layer_type,
                 output_type='LINEAR',
                 dropout_rate=0.0,
                 optimizer='sgd',
                 loss_function='MMSE',
                 rnn_batch_training=False):
        """ This function initialises a neural network

        :param n_in: Dimensionality of input features
        :type n_in: Integer
        :param hidden_layer_size: The layer size for each hidden layer
        :type hidden_layer_size: A list of integers
        :param n_out: Dimensionality of output features
        :type n_out: Integer
        :param hidden_layer_type: the activation type of each hidden layer, e.g. TANH, LSTM, GRU, BLSTM
        :param L1_reg: the L1 regularisation weight
        :param L2_reg: the L2 regularisation weight
        :param output_type: the activation type of the output layer; by default 'LINEAR', i.e. linear regression.
        :param dropout_rate: probability of dropout, a float number between 0 and 1.
        """

        logger = logging.getLogger("DNN initialization")

        self.n_in = int(n_in)
        self.n_out = int(n_out)

        self.n_layers = len(hidden_layer_size)

        self.dropout_rate = dropout_rate
        self.optimizer = optimizer
        self.loss_function = loss_function
        self.is_train = T.iscalar('is_train')
        self.rnn_batch_training = rnn_batch_training

        assert len(hidden_layer_size) == len(hidden_layer_type)

        self.list_of_activations = [
            'TANH', 'SIGMOID', 'SOFTMAX', 'RELU', 'RESU'
        ]

        if self.rnn_batch_training:
            self.x = T.tensor3('x')
            self.y = T.tensor3('y')
        else:
            self.x = T.matrix('x')
            self.y = T.matrix('y')

        self.L1_reg = L1_reg
        self.L2_reg = L2_reg

        self.rnn_layers = []
        self.params = []
        self.delta_params = []

        rng = np.random.RandomState(123)

        for i in range(self.n_layers):
            if i == 0:
                input_size = n_in
            else:
                input_size = hidden_layer_size[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.rnn_layers[i - 1].output
                if hidden_layer_type[i - 1] == 'BSLSTM' or hidden_layer_type[
                        i - 1] == 'BLSTM':
                    input_size = hidden_layer_size[i - 1] * 2

            if hidden_layer_type[i] in self.list_of_activations:
                hidden_activation = hidden_layer_type[i].lower()
                hidden_layer = GeneralLayer(rng,
                                            layer_input,
                                            input_size,
                                            hidden_layer_size[i],
                                            activation=hidden_activation,
                                            p=self.dropout_rate,
                                            training=self.is_train)
            elif hidden_layer_type[i] == 'TANH_LHUC':
                hidden_layer = SigmoidLayer_LHUC(rng,
                                                 layer_input,
                                                 input_size,
                                                 hidden_layer_size[i],
                                                 activation=T.tanh,
                                                 p=self.dropout_rate,
                                                 training=self.is_train)
            elif hidden_layer_type[i] == 'SLSTM':
                hidden_layer = SimplifiedLstm(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'SGRU':
                hidden_layer = SimplifiedGRU(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'GRU':
                hidden_layer = GatedRecurrentUnit(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NFG':
                hidden_layer = LstmNFG(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NOG':
                hidden_layer = LstmNOG(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NIG':
                hidden_layer = LstmNIG(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_NPH':
                hidden_layer = LstmNoPeepholes(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM':
                hidden_layer = VanillaLstm(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BSLSTM':
                hidden_layer = BidirectionSLstm(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'BLSTM':
                hidden_layer = BidirectionLstm(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'RNN':
                hidden_layer = VanillaRNN(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            elif hidden_layer_type[i] == 'LSTM_LHUC':
                hidden_layer = VanillaLstm_LHUC(
                    rng,
                    layer_input,
                    input_size,
                    hidden_layer_size[i],
                    p=self.dropout_rate,
                    training=self.is_train,
                    rnn_batch_training=self.rnn_batch_training)
            else:
                logger.critical(
                    "This hidden layer type: %s is not supported right now! \n Please use one of the following: SLSTM, BSLSTM, TANH, SIGMOID\n"
                    % (hidden_layer_type[i]))
                sys.exit(1)

            self.rnn_layers.append(hidden_layer)
            self.params.extend(hidden_layer.params)

        input_size = hidden_layer_size[-1]
        if hidden_layer_type[-1] == 'BSLSTM' or hidden_layer_type[
                -1] == 'BLSTM':
            input_size = hidden_layer_size[-1] * 2

        output_activation = output_type.lower()
        if output_activation == 'linear':
            self.final_layer = LinearLayer(rng, self.rnn_layers[-1].output,
                                           input_size, self.n_out)
        elif output_activation == 'recurrent':
            self.final_layer = RecurrentOutputLayer(
                rng,
                self.rnn_layers[-1].output,
                input_size,
                self.n_out,
                rnn_batch_training=self.rnn_batch_training)
        elif output_type.upper() in self.list_of_activations:
            self.final_layer = GeneralLayer(rng,
                                            self.rnn_layers[-1].output,
                                            input_size,
                                            self.n_out,
                                            activation=output_activation)
        else:
            logger.critical(
                "This output layer type: %s is not supported right now! \n Please use one of the following: LINEAR, BSLSTM\n"
                % (output_type))
            sys.exit(1)

        self.params.extend(self.final_layer.params)

        self.updates = {}
        for param in self.params:
            self.updates[param] = theano.shared(
                value=np.zeros(param.get_value(borrow=True).shape,
                               dtype=theano.config.floatX),
                name='updates')

        if self.loss_function == 'CCE':
            self.finetune_cost = self.categorical_crossentropy_loss(
                self.final_layer.output, self.y)
            self.errors = self.categorical_crossentropy_loss(
                self.final_layer.output, self.y)
        elif self.loss_function == 'Hinge':
            self.finetune_cost = self.multiclass_hinge_loss(
                self.final_layer.output, self.y)
            self.errors = self.multiclass_hinge_loss(self.final_layer.output,
                                                     self.y)
        elif self.loss_function == 'MMSE':
            if self.rnn_batch_training:
                self.y_mod = T.reshape(self.y, (-1, n_out))
                self.final_layer_output = T.reshape(self.final_layer.output,
                                                    (-1, n_out))

                nonzero_rows = T.any(self.y_mod, 1).nonzero()

                self.y_mod = self.y_mod[nonzero_rows]
                self.final_layer_output = self.final_layer_output[nonzero_rows]

                self.finetune_cost = T.mean(
                    T.sum((self.final_layer_output - self.y_mod)**2, axis=1))
                self.errors = T.mean(
                    T.sum((self.final_layer_output - self.y_mod)**2, axis=1))
            else:
                self.finetune_cost = T.mean(
                    T.sum((self.final_layer.output - self.y)**2, axis=1))
                self.errors = T.mean(
                    T.sum((self.final_layer.output - self.y)**2, axis=1))
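
As a hedged illustration of the constructor signature documented above (the class name and the concrete sizes below are assumptions, not taken from the source):

# dnn = DeepNeuralNetwork(
#     n_in=425, hidden_layer_size=[512, 512, 256], n_out=187,
#     L1_reg=0.0, L2_reg=1e-5,
#     hidden_layer_type=['TANH', 'TANH', 'LSTM'],   # one entry per hidden layer
#     output_type='LINEAR', dropout_rate=0.0,
#     optimizer='sgd', loss_function='MMSE')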
Example #58
    def test_value_s(self):

        "tests that the value of the kl divergence decreases with each update to s_i "

        model = self.model
        e_step = self.e_step
        X = self.X

        assert X.shape[0] == self.m

        init_H = e_step.init_H_hat(V=X)
        init_Mu1 = e_step.init_S_hat(V=X)

        prev_setting = config.compute_test_value
        config.compute_test_value = 'off'
        H, Mu1 = function([], outputs=[init_H, init_Mu1])()
        config.compute_test_value = prev_setting

        H = broadcast(H, self.m)
        Mu1 = broadcast(Mu1, self.m)

        H = np.cast[config.floatX](self.model.rng.uniform(0., 1., H.shape))
        Mu1 = np.cast[config.floatX](self.model.rng.uniform(
            -5., 5., Mu1.shape))

        H_var = T.matrix(name='H_var')
        H_var.tag.test_value = H
        Mu1_var = T.matrix(name='Mu1_var')
        Mu1_var.tag.test_value = Mu1
        idx = T.iscalar()
        idx.tag.test_value = 0

        S = e_step.infer_S_hat(V=X, H_hat=H_var, S_hat=Mu1_var)

        s_idx = S[:, idx]

        s_i_func = function([H_var, Mu1_var, idx], s_idx)

        sigma0 = 1. / model.alpha
        Sigma1 = e_step.infer_var_s1_hat()
        mu0 = T.zeros_like(model.mu)

        #by truncated KL, I mean that I am dropping terms that don't depend on H and Mu1
        # (they don't affect the outcome of this test and some of them are intractable )
        trunc_kl = - model.entropy_hs(H_hat = H_var, var_s0_hat = sigma0, var_s1_hat = Sigma1) + \
                     model.expected_energy_vhs(V = X, H_hat = H_var, S_hat = Mu1_var, var_s0_hat = sigma0, var_s1_hat = Sigma1)

        trunc_kl_func = function([H_var, Mu1_var], trunc_kl)

        for i in xrange(self.N):
            prev_kl = trunc_kl_func(H, Mu1)

            Mu1[:, i] = s_i_func(H, Mu1, i)

            new_kl = trunc_kl_func(H, Mu1)

            increase = new_kl - prev_kl

            mx = increase.max()

            if mx > 1e-3:
                raise Exception(
                    'after mean field step in s, kl divergence should decrease, but some elements increased by as much as '
                    + str(mx) + ' after updating s_' + str(i))
Example #59
from __future__ import print_function
import numpy as np
import theano
import theano.tensor as T

N = T.iscalar('N')


def calc(n, fn1, fn2):
    return fn1 + fn2, fn1


outputs, _ = theano.scan(fn=calc,
                         sequences=T.arange(N),
                         n_steps=N,
                         outputs_info=[1., 1.])

fibonacci = theano.function(inputs=[N], outputs=outputs)

print(fibonacci(8))
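
For reference: calc receives the current element of T.arange(N) (unused) followed by the previous values of the two recurrent outputs, and returns their new values (fn1 + fn2, fn1), so the first output sequence runs one step ahead of the second. With outputs_info=[1., 1.], fibonacci(8) should print two arrays along the lines of [2, 3, 5, 8, 13, 21, 34, 55] and [1, 2, 3, 5, 8, 13, 21, 34].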
Example #60
    def build(self,
              dropout,
              char_dim,
              char_lstm_dim,
              char_bidirect,
              slb_dim,
              slb_lstm_dim,
              slb_bidirect,
              word_dim,
              word_lstm_dim,
              word_bidirect,
              lr_method,
              pre_emb,
              crf,
              pos_dim,
              lexicon_dim,
              training=True,
              **kwargs):
        """
        Build the network.
        """
        # Training parameters
        n_words = len(self.id_to_word)
        n_tags = len(self.id_to_tag)

        # Number of features
        if slb_dim:
            n_slbs = len(self.id_to_slb)
        if char_dim:
            n_chars = len(self.id_to_char)
        if pos_dim:
            n_pos = len(self.id_to_pos) + 2
        if lexicon_dim:
            n_lex = lexicon_dim

        # Network variables
        is_train = T.iscalar('is_train')
        word_ids = T.ivector(name='word_ids')
        if slb_dim:
            slb_for_ids = T.imatrix(name='slb_for_ids')
        if slb_lstm_dim:
            slb_rev_ids = T.imatrix(name='slb_rev_ids')
            if slb_bidirect:
                slb_pos_ids = T.ivector(name='slb_pos_ids')
        if char_dim:
            char_for_ids = T.imatrix(name='char_for_ids')
        if char_lstm_dim:
            char_rev_ids = T.imatrix(name='char_rev_ids')
            if char_bidirect:
                char_pos_ids = T.ivector(name='char_pos_ids')
        if pos_dim:
            pos_ids = T.ivector(name='pos_ids')
        if lexicon_dim:
            lex_ids = T.fmatrix(name='lex_ids')
        tag_ids = T.ivector(name='tag_ids')

        # Sentence length
        s_len = (word_ids if word_dim else char_pos_ids).shape[0]

        # Final input (all word features)
        input_dim = 0
        inputs = []

        #
        # Word inputs
        #
        if word_dim:
            input_dim += word_dim
            word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer')
            word_input = word_layer.link(word_ids)
            inputs.append(word_input)

            # Initialize with pretrained embeddings
            if pre_emb and training:
                new_weights = word_layer.embeddings.get_value()
                print 'Loading pretrained embeddings...'
                pretrained = {}
                emb_invalid = 0
                for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')):
                    line = line.rstrip().split()
                    if len(line) == word_dim + 1:
                        pretrained[line[0]] = np.array(
                            [float(x) for x in line[1:]]).astype(np.float32)
                    else:
                        emb_invalid += 1
                # if emb_invalid > 0:
                #     print 'WARNING: %i invalid lines' % emb_invalid
                # Lookup table initialization
                for i in xrange(n_words):
                    word = self.id_to_word[i]
                    if word in pretrained:
                        # print word
                        new_weights[i] = pretrained[word]
                word_layer.embeddings.set_value(new_weights)

        #
        # Syllable inputs
        #
        if slb_dim:
            slb_layer = EmbeddingLayer(n_slbs, slb_dim, name='slb_layer')
            if slb_lstm_dim:
                input_dim += slb_lstm_dim
                slb_lstm_for = LSTM(slb_dim,
                                    slb_lstm_dim,
                                    with_batch=True,
                                    name='slb_lstm_for')
                slb_lstm_rev = LSTM(slb_dim,
                                    slb_lstm_dim,
                                    with_batch=True,
                                    name='slb_lstm_rev')
                slb_lstm_for.link(slb_layer.link(slb_for_ids))
                slb_lstm_rev.link(slb_layer.link(slb_rev_ids))
                slb_for_input = slb_lstm_for.h.dimshuffle(
                    (1, 0, 2))[T.arange(s_len), slb_pos_ids]
                slb_rev_input = slb_lstm_rev.h.dimshuffle(
                    (1, 0, 2))[T.arange(s_len), slb_pos_ids]
                inputs.append(slb_for_input)
                if slb_bidirect:
                    inputs.append(slb_rev_input)
                    input_dim += slb_lstm_dim

        #
        # Chars inputs
        #
        if char_dim:
            char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer')
            if char_lstm_dim:
                input_dim += char_lstm_dim
                char_lstm_for = LSTM(char_dim,
                                     char_lstm_dim,
                                     with_batch=True,
                                     name='char_lstm_for')
                char_lstm_rev = LSTM(char_dim,
                                     char_lstm_dim,
                                     with_batch=True,
                                     name='char_lstm_rev')
                char_lstm_for.link(char_layer.link(char_for_ids))
                char_lstm_rev.link(char_layer.link(char_rev_ids))
                char_for_input = char_lstm_for.h.dimshuffle(
                    (1, 0, 2))[T.arange(s_len), char_pos_ids]
                char_rev_input = char_lstm_rev.h.dimshuffle(
                    (1, 0, 2))[T.arange(s_len), char_pos_ids]
                inputs.append(char_for_input)
                if char_bidirect:
                    inputs.append(char_rev_input)
                    input_dim += char_lstm_dim

        #
        # PoS & Lexicon feature
        #
        if pos_dim:
            input_dim += pos_dim
            pos_layer = EmbeddingLayer(n_pos, pos_dim, name='pos_layer')
            inputs.append(pos_layer.link(pos_ids))

        if lexicon_dim:
            input_dim += lexicon_dim
            lex_layer = HiddenLayer(n_lex,
                                    lexicon_dim,
                                    name='lex_layer',
                                    activation=None)
            inputs.append(lex_layer.link(lex_ids))

        # Prepare final input
        if len(inputs) != 1:
            inputs = T.concatenate(inputs, axis=1)
        else:
            inputs = inputs[0]

        #
        # Dropout on final input
        #
        if dropout:
            dropout_layer = DropoutLayer(p=dropout)
            input_train = dropout_layer.link(inputs)
            input_test = (1 - dropout) * inputs
            inputs = T.switch(T.neq(is_train, 0), input_train, input_test)
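            # At training time the DropoutLayer masks activations; at test time
            # the full input is kept but scaled by (1 - dropout) so the expected
            # activation magnitude matches training. The is_train flag (set via
            # `givens` when compiling below) selects between the two branches.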

        # LSTM for words
        word_lstm_for = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_for')
        word_lstm_rev = LSTM(input_dim,
                             word_lstm_dim,
                             with_batch=False,
                             name='word_lstm_rev')
        word_lstm_for.link(inputs)
        word_lstm_rev.link(inputs[::-1, :])
        word_for_output = word_lstm_for.h
        word_rev_output = word_lstm_rev.h[::-1, :]
        if word_bidirect:
            final_output = T.concatenate([word_for_output, word_rev_output],
                                         axis=1)
            tanh_layer = HiddenLayer(2 * word_lstm_dim,
                                     word_lstm_dim,
                                     name='tanh_layer',
                                     activation='tanh')
            final_output = tanh_layer.link(final_output)
        else:
            final_output = word_for_output

        # Sentence to Named Entity tags - Score
        final_layer = HiddenLayer(word_lstm_dim,
                                  n_tags,
                                  name='final_layer',
                                  activation=(None if crf else 'softmax'))
        tags_scores = final_layer.link(final_output)

        # No CRF
        if not crf:
            cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean()
        # CRF
        else:
            transitions = shared((n_tags + 2, n_tags + 2), 'transitions')
            small = -1000
            b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32)
            e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32)
            observations = T.concatenate(
                [tags_scores, small * T.ones((s_len, 2))], axis=1)
            observations = T.concatenate([b_s, observations, e_s], axis=0)
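            # observations is now (s_len + 2) x (n_tags + 2): two extra columns
            # for the virtual begin/end tags, plus the b_s and e_s rows that
            # force every path to start in the begin tag and end in the end tag.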

            # Score from tags
            real_path_score = tags_scores[T.arange(s_len), tag_ids].sum()

            # Score from transitions
            b_id = theano.shared(value=np.array([n_tags], dtype=np.int32))
            e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32))
            padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0)
            real_path_score += transitions[
                padded_tags_ids[T.arange(s_len + 1)],
                padded_tags_ids[T.arange(s_len + 1) + 1]
            ].sum()

            all_paths_scores = forward(observations, transitions)
            cost = -(real_path_score - all_paths_scores)
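            # Standard CRF negative log-likelihood: forward() is assumed to
            # return the log-sum-exp score over all tag paths, so the cost is
            # -(log-score of the gold path - log-partition).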

        # Network parameters
        params = []
        if word_dim:
            self.add_component(word_layer)
            params.extend(word_layer.params)
        if slb_dim:
            self.add_component(slb_layer)
            params.extend(slb_layer.params)
            if slb_lstm_dim:
                self.add_component(slb_lstm_for)
                params.extend(slb_lstm_for.params)
                if slb_bidirect:
                    self.add_component(slb_lstm_rev)
                    params.extend(slb_lstm_rev.params)
        if char_dim:
            self.add_component(char_layer)
            params.extend(char_layer.params)
            if char_lstm_dim:
                self.add_component(char_lstm_for)
                params.extend(char_lstm_for.params)
                if char_bidirect:
                    self.add_component(char_lstm_rev)
                    params.extend(char_lstm_rev.params)
        self.add_component(word_lstm_for)
        params.extend(word_lstm_for.params)
        if word_bidirect:
            self.add_component(word_lstm_rev)
            params.extend(word_lstm_rev.params)
        if pos_dim:
            self.add_component(pos_layer)
            params.extend(pos_layer.params)
        if lexicon_dim:
            self.add_component(lex_layer)
            params.extend(lex_layer.params)
        self.add_component(final_layer)
        params.extend(final_layer.params)
        if crf:
            self.add_component(transitions)
            params.append(transitions)
        if word_bidirect:
            self.add_component(tanh_layer)
            params.extend(tanh_layer.params)

        # Prepare train and eval inputs
        eval_inputs = []
        if word_dim:
            eval_inputs.append(word_ids)
        if slb_dim:
            eval_inputs.append(slb_for_ids)
            if slb_lstm_dim:
                if slb_bidirect:
                    eval_inputs.append(slb_rev_ids)
                eval_inputs.append(slb_pos_ids)
        if char_dim:
            eval_inputs.append(char_for_ids)
            if char_lstm_dim:
                if char_bidirect:
                    eval_inputs.append(char_rev_ids)
                eval_inputs.append(char_pos_ids)
        if pos_dim:
            eval_inputs.append(pos_ids)
        if lexicon_dim:
            eval_inputs.append(lex_ids)
        train_inputs = eval_inputs + [tag_ids]

        # Parse optimization method parameters
        if "-" in lr_method:
            lr_method_name = lr_method[:lr_method.find('-')]
            lr_method_parameters = {}
            for x in lr_method[lr_method.find('-') + 1:].split('-'):
                split = x.split('_')
                assert len(split) == 2
                lr_method_parameters[split[0]] = float(split[1])
        else:
            lr_method_name = lr_method
            lr_method_parameters = {}

        # Compile training function
        print 'Compiling...'
        if training:
            updates = Optimization(clip=5.0).get_updates(
                lr_method_name, cost, params, **lr_method_parameters)
            f_train = theano.function(inputs=train_inputs,
                                      outputs=cost,
                                      updates=updates,
                                      givens=({
                                          is_train: np.cast['int32'](1)
                                      } if dropout else {}))
        else:
            f_train = None

        # Compile evaluation function
        if not crf:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=tags_scores,
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))
        else:
            f_eval = theano.function(inputs=eval_inputs,
                                     outputs=forward(
                                         observations,
                                         transitions,
                                         viterbi=True,
                                         return_alpha=False,
                                         return_best_sequence=True),
                                     givens=({
                                         is_train: np.cast['int32'](0)
                                     } if dropout else {}))

        return f_train, f_eval
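A minimal usage sketch for the two functions returned above, assuming a model built with only word features and crf=False; the array names are illustrative, not taken from the repository:

import numpy as np

# Hypothetical encoded sentence: word indices and gold tag indices.
word_ids = np.array([12, 7, 431, 3], dtype=np.int32)
tag_ids = np.array([0, 2, 2, 1], dtype=np.int32)

# f_train consumes the eval inputs plus the gold tags and returns the cost;
# f_eval returns per-word tag scores (or the Viterbi sequence when crf=True).
cost = f_train(word_ids, tag_ids)
scores = f_eval(word_ids)
predicted = scores.argmax(axis=1)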