def test_path_layer_train():
    init_rng()
    class_num = 3
    trans_mat = np.array([
        [0.8, 0.1, 0.1],
        [0.1, 0.1, 0.8],
        [0.1, 0.1, 0.8],
    ])
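    # row i gives P(y_t = j | y_{t-1} = i): state 0 is sticky, while states 1
    # and 2 both jump to state 2 with probability 0.8, so label 2 should dominate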

    def gen_train(trans_mat, sample_num, class_num):
        X = []
        Y = []
        x = [1.0 / class_num for i in xrange(class_num)]
        pre_y = 0
        for _i in xrange(sample_num):
            X.append(x)
            trans_prob = trans_mat[pre_y]
            y = np.random.choice(class_num, 1, p=trans_prob)[0]
            Y.append(y)
            pre_y = y
        return np.asarray(X, dtype=np.float32), np.asarray(Y, dtype=np.int32)

    X, Y = gen_train(trans_mat, 100, class_num)
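    # every emission row in X is uniform (1 / class_num), so the observations
    # carry no information and the layer can only fit the transition structure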
    print Counter(Y)
    trans_mat_prior = np.zeros((class_num + 1, class_num))
    trans_mat_prior[0] = X[0]
    trans_mat_prior[1:] = trans_mat
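    # row 0 is the initial-state prior; row i + 1 holds the transitions out of
    # state i, matching the (class_num + 1, class_num) layout the layer expects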
    print "init trains mat"
    print trans_mat_prior
    # layer = PathTransitionLayer(class_num,trans_mat_prior)
    layer = PathTransitionLayer(class_num)
    train_x = T.fmatrix("x")
    train_y = T.ivector("y")
    cost = layer.cost(train_x, train_y)

    params = layer.params()
    gparams = T.grad(cost, params)
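    # plain SGD: step each parameter against its gradient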
    updates = []
    learning_rate = 0.01
    for param, gparam in zip(params, gparams):
        updates.append((param, param - learning_rate * gparam))

    # compile the training function once; recompiling it inside the loop is wasteful
    foo = theano.function(inputs=[train_x, train_y], outputs=[cost], updates=updates)
    iternum = 10
    for i in xrange(iternum):
        X, Y = gen_train(trans_mat, 100, class_num)
        print "%d,cost=%f" % (i, foo(X, Y)[0])
        trained_trans_mat = layer.params()[0].eval()
        print trained_trans_mat
    print "training done."


def test_path_transition_layer():
    init_rng()
    sample_num = 5
    class_num = 10

    X = theano.tensor.nnet.softmax(np.random.random((sample_num, class_num)))
    y = (9, 9, 9, 9, 9)

    layer = PathTransitionLayer(class_num)

    cost = layer.cost(X, y).eval()
    y_pred = layer.predict(X).eval()

    y_pred_cost = layer.cost(X, y_pred).eval()

    print "optimized cost = ", cost
    print "y_pred = ", y_pred
    print "y_pred_cost", y_pred_cost

    # the predicted (best-scoring) path can never cost more than the gold path
    assert y_pred_cost <= cost

    y_score = 1000
    logadd_score = 0.0

    trans_mat = theano.tensor.nnet.softmax(layer.tag_trans_matrix).eval()
    X = X.eval()
    for path in itertools.product(range(class_num), repeat=sample_num):
        score = trans_mat[0, path[0]] + X[0, path[0]]
        for idx in range(1, sample_num):
            score += trans_mat[path[idx - 1] + 1, path[idx]] + X[idx, path[idx]]

        logadd_score += math.exp(score)

        if path == y:
            y_score = score
    logadd_score = math.log(logadd_score)
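    # Summing raw exp(score) as above can overflow for longer sequences; an
    # equivalent, numerically stable variant (a sketch, assuming scipy is
    # available) would collect all path scores in a list and take
    # scipy.special.logsumexp(all_path_scores) instead.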

    bruteforce_cost = logadd_score - y_score

    print "bruteforce cost = {0} with logadd = {1} and selected_path_score = {2}".format(
        bruteforce_cost, logadd_score, y_score
    )

    # trans_mat is effectively constant here (untrained), so transition scores do
    # not discriminate between paths and the best path maximizes each emission
    bruteforce_y_pred = np.argmax(X, axis=1)

    print "bruteforce y_pred = ", bruteforce_y_pred

    assert math.fabs(bruteforce_cost - cost) < 1e-6

    assert np.array_equal(y_pred, bruteforce_y_pred)


def test_path_transition_layer2():
    init_rng()
    sample_num = 5
    class_num = 10

    y1 = (9, 9, 9, 9, 9)
    X1 = np.zeros((sample_num, class_num))
    X1[range(sample_num), y1] = 1

    y2 = (0, 1, 2, 3, 4)
    X2 = np.zeros((sample_num, class_num))
    X2[range(sample_num), y2] = 1
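    # X1 and X2 are one-hot emissions that exactly match their gold paths y1 and y2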

    layer = PathTransitionLayer(class_num)
    cost1 = layer.cost(X1, y1).eval()
    cost2 = layer.cost(X2, y2).eval()

    cost1_2 = layer.cost(X1, y2).eval()
    cost2_1 = layer.cost(X2, y1).eval()

    y_pred1 = layer.predict(X1).eval()
    y_pred2 = layer.predict(X2).eval()

    print "X1 = ", X1
    print "X2 = ", X2

    print "y1 = ", y1
    print "y2 = ", y2

    print "cost1 = ", cost1
    print "cost2 = ", cost2
    print "cost1_2 = ", cost1_2
    print "cost2_1 = ", cost2_1
    print "y_pred1 = ", y_pred1
    print "y_pred2 = ", y_pred2


    def __init__(self, rng, x, y, sent_length, masks, model_params):
        # x shape: (batch_size, max_term_per_sent + 3, max_sentence_length),
        # where max_sentence_length = max_term_per_sent + window_size - 1
        self.L1_reg = model_params['L1_reg']
        self.L2_reg = model_params['L2_reg']

        self.x = x
        self.y = y
        self.sent_length = sent_length
        self.masks = masks

        self.max_sentence_length = model_params['max_sentence_length']
        self.window_size = model_params['window_size']
        self.max_term_per_sent = self.max_sentence_length - self.window_size + 1
        self.word_num = model_params['word_num']
        self.POS_num = model_params['POS_num']
        self.verbpos_num = model_params['verbpos_num']
        self.wordpos_num = model_params['wordpos_num']

        self.word_feature_num = model_params['word_feature_num']
        self.POS_feature_num = model_params['POS_feature_num']
        self.wordpos_feature_num = model_params['wordpos_feature_num']
        self.verbpos_feature_num = model_params['verbpos_feature_num']

        self.conv_window = model_params['conv_window']
        self.conv_hidden_feature_num = model_params['conv_hidden_feature_num']

        self.hidden_layer_size = model_params['hidden_layer_size']
        self.tags_num = model_params['tags_num']

        # we have 4 lookup tables here:
        # 1,word vector
        #   output shape: (batch size,1,max_sentence_length * word_feature_num)
        # 2,POS tag vector
        #   output shape: (batch size,1,max_sentence_length * POS_feature_num)
        # 3,verb position vector
        #   output shape: (batch size,1,max_sentence_length * verbpos_feature_num)
        # 4,word position vector
        #   output shape: (batch size,max_term_per_sent,1,max_sentence_length * wordpos_feature_num)
        self.wordvec = LookupTableLayer(inputs = x[:,0:1,:], table_size = self.word_num,
                window_size = self.max_sentence_length, feature_num = self.word_feature_num,
                reshp = (x.shape[0],1,1,x.shape[2] * self.word_feature_num))

        self.POSvec = LookupTableLayer(inputs = x[:,1:2,:], table_size = self.POS_num,
                window_size = self.max_sentence_length, feature_num = self.POS_feature_num,
                reshp = (x.shape[0],1,1,x.shape[2] * self.POS_feature_num))

        self.verbpos_vec = LookupTableLayer(inputs = x[:,2:3,:], table_size = self.verbpos_num,
                window_size = self.max_sentence_length, feature_num = self.verbpos_feature_num,
                reshp = (x.shape[0],1,1,x.shape[2] * self.verbpos_feature_num))

        self.wordpos_vec = LookupTableLayer(inputs = x[:,3:,:], table_size = self.wordpos_num,
                window_size = self.max_sentence_length, feature_num = self.wordpos_feature_num,
                reshp = (x.shape[0],self.max_term_per_sent,1,x.shape[2] * self.wordpos_feature_num))

        # conv_word.out.shape = (batch_size,1,conv_hidden_feature_num,max_sentence_length-conv_window+1)
        # conv_POS.out.shape = (batch_size,1,conv_hidden_feature_num,max_sentence_length-conv_window+1)
        # conv_verbpos.out.shape = (batch_size,1,conv_hidden_feature_num,max_sentence_length-conv_window+1)
        # conv_wordpos.out.shape = (batch_size,max_sentence_length,conv_hidden_feature_num,max_sentence_length-conv_window+1)
        # NOTE: all of the outputs above have already been dimshuffled
        self.conv_word = Conv1DLayer('conv_word',rng,self.wordvec.output,\
                self.conv_hidden_feature_num,1,self.conv_window,self.word_feature_num)

        self.conv_POS = Conv1DLayer('conv_POS',rng,self.POSvec.output,\
                self.conv_hidden_feature_num,1,self.conv_window,self.POS_feature_num)

        self.conv_verbpos = Conv1DLayer('conv_verbpos',rng,self.verbpos_vec.output,\
                self.conv_hidden_feature_num,1,self.conv_window,self.verbpos_feature_num)

        self.conv_wordpos = Conv1DLayer('conv_wordpos',rng,self.wordpos_vec.output,\
                self.conv_hidden_feature_num,self.max_term_per_sent,self.conv_window,self.wordpos_feature_num)


        # in conv_out, the first max_term_per_sent axis indexes the word being
        # predicted; the last axis indexes the convolution output positions
        # conv_out shape: (batch_size,max_term_per_sent,conv_hidden_feature_num,max_term_per_sent)
        self.conv_out = self.conv_word.output + self.conv_POS.output + self.conv_verbpos.output + self.conv_wordpos.output
        self.conv_out = self.conv_out.dimshuffle(1,0,2,3,4).reshape((x.shape[0],self.max_term_per_sent,self.conv_hidden_feature_num,-1))

        # max_out shape: (batch_size,max_term_per_sent,conv_hidden_feature_num)
        self.max_out = T.max(self.conv_out,axis=3).reshape((self.conv_out.shape[0],self.max_term_per_sent,-1))
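        # the max over convolution positions is max-over-time pooling, as in
        # Collobert et al.'s SENNA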


        # hidden layer
        # The hidden layer performs one linear map and one nonlinear transform;
        # the likelihood layer then performs another linear map. This is what
        # SENNA does (p. 7, Figure 1).
        # hidden_layer output shape: (batch_size, max_term_per_sent, hidden_layer_size)
        self.hidden_layer = HiddenLayer(rng=rng, input=self.max_out,
                n_in = self.conv_hidden_feature_num,
                n_out = self.hidden_layer_size,
                activation=T.tanh)

        # TODO: we use a pointwise likelihood here
        self.sentence_loglikelihood = PathTransitionLayer(rng,self.hidden_layer.output,
                self.y,self.masks,
                self.max_term_per_sent,
                self.hidden_layer_size,
                self.tags_num)

        self._likelihood = self.sentence_loglikelihood.negative_log_likelihood_pointwise()
        self._errors = self.sentence_loglikelihood.errors()
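        # A pointwise NLL scores each token independently (per-position softmax
        # cross-entropy), unlike the sentence-level path likelihood. A rough
        # sketch of that form, with illustrative names (`scores` of shape
        # (n_tokens, tags_num), integer labels `y`), not the layer's actual
        # internals:
        #     p = T.nnet.softmax(scores)
        #     nll = -T.mean(T.log(p[T.arange(y.shape[0]), y]))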