Example #1
def __init__(self, **kwargs):
    # init parent attributes
    GRU.__init__(self, **kwargs)
    with tf.name_scope('IAGRU%d' % self.back_wards):
        self.M_qz = tf.Variable(
            tf.truncated_normal(shape=[self.hidden_size, self.hidden_size], stddev=self.init_scale),
            name='M_qz')
        self.M_qr = tf.Variable(
            tf.truncated_normal(shape=[self.hidden_size, self.hidden_size], stddev=self.init_scale),
            name='M_qr')
    self.attention = tf.Variable(tf.zeros([1, self.hidden_size]), trainable=False)
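The parent GRU class and its gate equations are not shown in this example. Below is a minimal NumPy sketch of how question-to-gate matrices like M_qz and M_qr are typically used in an inner-attention GRU: the question representation biases the update and reset gates. The names q, h_prev, and x_gate are hypothetical stand-ins, not identifiers from the snippet.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

hidden_size = 4
rng = np.random.RandomState(0)
M_qz = rng.normal(scale=0.1, size=(hidden_size, hidden_size))
M_qr = rng.normal(scale=0.1, size=(hidden_size, hidden_size))

q = rng.normal(size=(1, hidden_size))        # question summary vector (hypothetical)
x_gate = rng.normal(size=(1, hidden_size))   # stand-in for W_z x_t + U_z h_prev

# question-biased gates: M_qz and M_qr let the question modulate how much
# new information is written (z) and how much history is reset (r)
z = sigmoid(x_gate + q.dot(M_qz))
r = sigmoid(x_gate + q.dot(M_qr))
print z.shape, r.shape   # (1, 4) (1, 4)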
Example #2
def predict(datasets,
            U,                      # pre-trained word embeddings
            n_epochs=5, batch_size=20, max_l=100, hidden_size=100,   # optimization / model hyperparameters
            word_embedding_size=100, session_hidden_size=50,
            session_input_size=50, model_name='SMN_last.bin'):
    """
    Loads the parameters saved in model_name, scores the dev split
    (datasets[1]), writes one "predicted probability <tab> ground-truth
    label" line per example to result.txt, and prints the average dev loss.
    """
    hiddensize = hidden_size
    U = U.astype(dtype=theano.config.floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l,max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())

    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(Words[T.cast(lx[i].flatten(), dtype="int32")]
                             .reshape((lx[i].shape[0], lx[i].shape[1], Words.shape[1])))

    # input: word embeddings of the mini batch
    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1], Words.shape[1]))


    train_set, dev_set, test_set = datasets[0], datasets[1], datasets[2]

    train_set_lx = []
    train_set_lx_mask = []
    q_embedding = []
    offset = 2 * lsize
    for i in range(max_turn):
        train_set_lx.append(theano.shared(
            np.asarray(train_set[:, offset * i:offset * i + lsize],
                       dtype=theano.config.floatX), borrow=True))
        train_set_lx_mask.append(theano.shared(
            np.asarray(train_set[:, offset * i + lsize:offset * i + 2 * lsize],
                       dtype=theano.config.floatX), borrow=True))
    train_set_rx = theano.shared(
        np.asarray(train_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=theano.config.floatX), borrow=True)
    train_set_rx_mask = theano.shared(
        np.asarray(train_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=theano.config.floatX), borrow=True)
    train_set_session_mask = theano.shared(
        np.asarray(train_set[:, -max_turn - 1:-1],
                   dtype=theano.config.floatX), borrow=True)
    train_set_y = theano.shared(np.asarray(train_set[:, -1], dtype="int32"), borrow=True)

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(theano.shared(
            np.asarray(dev_set[:, offset * i:offset * i + lsize],
                       dtype=theano.config.floatX), borrow=True))
        val_set_lx_mask.append(theano.shared(
            np.asarray(dev_set[:, offset * i + lsize:offset * i + 2 * lsize],
                       dtype=theano.config.floatX), borrow=True))

    val_set_rx = theano.shared(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=theano.config.floatX), borrow=True)
    val_set_rx_mask = theano.shared(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=theano.config.floatX), borrow=True)
    val_set_session_mask = theano.shared(
        np.asarray(dev_set[:, -max_turn - 1:-1],
                   dtype=theano.config.floatX), borrow=True)
    val_set_y = theano.shared(np.asarray(dev_set[:, -1], dtype="int32"), borrow=True)

    dic = {}
    for i in range(max_turn):
        dic[lx[i]] = train_set_lx[i][index*batch_size:(index+1)*batch_size]
        dic[lxmask[i]] = train_set_lx_mask[i][index*batch_size:(index+1)*batch_size]
    dic[rx] = train_set_rx[index*batch_size:(index+1)*batch_size]
    dic[sessionmask] = train_set_session_mask[index*batch_size:(index+1)*batch_size]
    dic[rxmask] = train_set_rx_mask[index*batch_size:(index+1)*batch_size]
    dic[y] = train_set_y[index*batch_size:(index+1)*batch_size]

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index*batch_size:(index+1)*batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index*batch_size:(index+1)*batch_size]
    val_dic[rx] = val_set_rx[index*batch_size:(index+1)*batch_size]
    val_dic[sessionmask] = val_set_session_mask[index*batch_size:(index+1)*batch_size]
    val_dic[rxmask] = val_set_rx_mask[index*batch_size:(index+1)*batch_size]
    val_dic[y] = val_set_y[index*batch_size:(index+1)*batch_size]


    sentence2vec = GRU(n_in=word_embedding_size, n_hidden=hiddensize, n_out=hiddensize)

    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng, max_l, session_input_size, hidden_size=hiddensize)

    poolingoutput = []

    for i in range(max_turn):
        poolingoutput.append(pooling_layer(llayer0_input[i], rlayer0_input,
                                           q_embedding[i], r_embedding))

    session2vec = GRU(n_in=session_input_size, n_hidden=session_hidden_size, n_out=session_hidden_size)
    res = session2vec(T.stack(poolingoutput, 1), sessionmask, True)

    # turn-level attention: score each turn against the final question state,
    # mask and softmax-normalize the scores, then take the weighted sum of
    # the session GRU states
    W = theano.shared(ortho_weight(50), borrow=True)
    W2 = theano.shared(glorot_uniform((100, 50)), borrow=True)
    b = theano.shared(value=np.zeros((50,), dtype='float32'), borrow=True)
    U_s = theano.shared(glorot_uniform((50, 1)), borrow=True)

    final = T.dot(T.tanh(T.dot(res, W) + T.dot(T.stack(q_embedding, 1)[:, :, -1, :], W2) + b), U_s)

    weight = T.exp(T.max(final, 2)) * sessionmask
    weight2 = weight / T.sum(weight, 1)[:, None]

    final2 = T.sum(res * weight2[:, :, None], 1)

    classifier = LogisticRegression(final2, session_hidden_size, 2, rng)


    # debug check: shape and value of the attention-pooled session vector
    test = theano.function([index], final2,
                           givens=val_dic, on_unused_input='ignore')
    print test(0).shape
    print test(0)

    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words,W,b,W2,U_s]

    load_params(params,model_name)

    predict = classifier.predict_prob

    val_model = theano.function([index], [y, predict, cost, error],
                                givens=val_dic, on_unused_input='ignore')
    f = open('result.txt', 'w')
    loss = 0.
    for minibatch_index in xrange(datasets[1].shape[0] / batch_size):
        a, b, c, d = val_model(minibatch_index)
        print c
        loss += c
        for i in range(batch_size):
            f.write(str(b[i][1]))  # predicted probability of the positive class
            f.write('\t')
            f.write(str(a[i]))     # ground-truth label
            f.write('\n')
    f.close()
    print loss / (datasets[1].shape[0] / batch_size)
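A hedged usage sketch for predict: it expects datasets to be a list of three 2-D arrays (train, dev, test) and U to be the embedding matrix. The pickle file name and layout below are assumptions for illustration; the real preprocessing step is not part of this example.

import cPickle

# hypothetical: an earlier preprocessing step pickled the splits and the
# embedding matrix together as (datasets, U)
with open('smn_data.bin', 'rb') as fin:
    datasets, U = cPickle.load(fin)

predict(datasets, U,
        batch_size=20, max_l=100, hidden_size=100, word_embedding_size=100,
        session_hidden_size=50, session_input_size=50,
        model_name='SMN_last.bin')
# per-example "probability<TAB>label" lines are written to result.txt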
Example #3
def train(datasets,
          U,                      # pre-trained word embeddings
          n_epochs=5, batch_size=20, max_l=100, hidden_size=100,
          word_embedding_size=100, session_hidden_size=50,
          session_input_size=50, model_name='SMN_last.bin'):
    hiddensize = hidden_size
    U = U.astype(dtype=theano.config.floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l,max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())

    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(Words[T.cast(lx[i].flatten(), dtype="int32")]
                             .reshape((lx[i].shape[0], lx[i].shape[1], Words.shape[1])))

    # input: word embeddings of the mini batch
    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1], Words.shape[1]))


    train_set, dev_set, test_set = datasets[0], datasets[1], datasets[2]

    train_set_lx = []
    train_set_lx_mask = []
    q_embedding = []
    offset = 2 * lsize
    for i in range(max_turn):
        train_set_lx.append(theano.shared(
            np.asarray(train_set[:, offset * i:offset * i + lsize],
                       dtype=theano.config.floatX), borrow=True))
        train_set_lx_mask.append(theano.shared(
            np.asarray(train_set[:, offset * i + lsize:offset * i + 2 * lsize],
                       dtype=theano.config.floatX), borrow=True))
    train_set_rx = theano.shared(
        np.asarray(train_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=theano.config.floatX), borrow=True)
    train_set_rx_mask = theano.shared(
        np.asarray(train_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=theano.config.floatX), borrow=True)
    train_set_session_mask = theano.shared(
        np.asarray(train_set[:, -max_turn - 1:-1],
                   dtype=theano.config.floatX), borrow=True)
    train_set_y = theano.shared(np.asarray(train_set[:, -1], dtype="int32"), borrow=True)

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(theano.shared(
            np.asarray(dev_set[:, offset * i:offset * i + lsize],
                       dtype=theano.config.floatX), borrow=True))
        val_set_lx_mask.append(theano.shared(
            np.asarray(dev_set[:, offset * i + lsize:offset * i + 2 * lsize],
                       dtype=theano.config.floatX), borrow=True))

    val_set_rx = theano.shared(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=theano.config.floatX), borrow=True)
    val_set_rx_mask = theano.shared(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=theano.config.floatX), borrow=True)
    val_set_session_mask = theano.shared(
        np.asarray(dev_set[:, -max_turn - 1:-1],
                   dtype=theano.config.floatX), borrow=True)
    val_set_y = theano.shared(np.asarray(dev_set[:, -1], dtype="int32"), borrow=True)

    dic = {}
    for i in range(max_turn):
        dic[lx[i]] = train_set_lx[i][index*batch_size:(index+1)*batch_size]
        dic[lxmask[i]] = train_set_lx_mask[i][index*batch_size:(index+1)*batch_size]
    dic[rx] = train_set_rx[index*batch_size:(index+1)*batch_size]
    dic[sessionmask] = train_set_session_mask[index*batch_size:(index+1)*batch_size]
    dic[rxmask] = train_set_rx_mask[index*batch_size:(index+1)*batch_size]
    dic[y] = train_set_y[index*batch_size:(index+1)*batch_size]

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index*batch_size:(index+1)*batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index*batch_size:(index+1)*batch_size]
    val_dic[rx] = val_set_rx[index*batch_size:(index+1)*batch_size]
    val_dic[sessionmask] = val_set_session_mask[index*batch_size:(index+1)*batch_size]
    val_dic[rxmask] = val_set_rx_mask[index*batch_size:(index+1)*batch_size]
    val_dic[y] = val_set_y[index*batch_size:(index+1)*batch_size]


    sentence2vec = GRU(n_in=word_embedding_size, n_hidden=hiddensize, n_out=hiddensize)

    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng, max_l, session_input_size, hidden_size=hiddensize)

    poolingoutput = []

    for i in range(max_turn):
        poolingoutput.append(pooling_layer(llayer0_input[i], rlayer0_input,
                                           q_embedding[i], r_embedding))

    session2vec = GRU(n_in=session_input_size, n_hidden=session_hidden_size, n_out=session_hidden_size)
    res = session2vec(T.stack(poolingoutput, 1), sessionmask, True)

    # turn-level attention: score each turn against the final question state,
    # mask and softmax-normalize the scores, then take the weighted sum of
    # the session GRU states
    W = theano.shared(ortho_weight(50), borrow=True)
    W2 = theano.shared(glorot_uniform((100, 50)), borrow=True)
    b = theano.shared(value=np.zeros((50,), dtype='float32'), borrow=True)
    U_s = theano.shared(glorot_uniform((50, 1)), borrow=True)

    final = T.dot(T.tanh(T.dot(res, W) + T.dot(T.stack(q_embedding, 1)[:, :, -1, :], W2) + b), U_s)

    weight = T.exp(T.max(final, 2)) * sessionmask
    weight2 = weight / T.sum(weight, 1)[:, None]

    final2 = T.sum(res * weight2[:, :, None], 1)

    classifier = LogisticRegression(final2, session_hidden_size, 2, rng)


    # debug check: shape and value of the attention-pooled session vector
    test = theano.function([index], final2,
                           givens=val_dic, on_unused_input='ignore')
    print test(0).shape
    print test(0)

    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words,W,b,W2,U_s]

    grad_updates = opt.Adam(cost=cost, params=params, lr=0.001)  # alternative: opt.sgd_updates_adadelta(params, cost, lr_decay, 1e-8, sqr_norm_lim)

    train_model = theano.function([index], cost, updates=grad_updates,
                                  givens=dic, on_unused_input='ignore')
    val_model = theano.function([index], [cost, error],
                                givens=val_dic, on_unused_input='ignore')
    best_dev = 1.
    n_train_batches = datasets[0].shape[0]/batch_size
    for i in xrange(n_epochs):
        cost = 0
        total = 0.
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            batch_cost = train_model(minibatch_index)
            total = total + 1
            cost = cost + batch_cost
            if total % 50 == 0:
                print total, cost/total
        cost = cost / n_train_batches
        print "echo %d loss %f" % (i,cost)

        cost=0
        errors = 0
        j = 0
        for minibatch_index in xrange(datasets[1].shape[0]/batch_size):
            tcost, terr = val_model(minibatch_index)
            cost += tcost
            errors += terr
            j = j+1
        cost = cost / j
        errors = errors / j
        if cost < best_dev:
            best_dev = cost
            save_params(params,model_name)
        print  "echo %d dev_loss %f" % (i,cost)
        print  "echo %d dev_accuracy %f" % (i,1 - errors)
Example #4
def train(datasets,
          U,
          n_epochs=5,
          batch_size=20,
          max_l=50,
          hidden_size=200,
          word_embedding_size=200,
          session_hidden_size=50,
          session_input_size=50,
          model_name='SMN_last.bin'):
    # set hiddensize, lsize, rsize, initialize the RNG, and declare the symbolic variables
    hiddensize = hidden_size
    U = U.astype(dtype=theano.config.floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l, max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())

    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    """
    第一层GRU的输入,llayer0_input和rlayer0_input
    llayer0_input是多轮对话构成的词级的向量,list类型,len=10,每一条是(200,50,200)的矩阵;
    rlayer0_input是答句构成的词级的向量,(200,50,200)的矩阵
    """
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(Words[T.cast(lx[i].flatten(),
                                          dtype="int32")].reshape(
                                              (lx[i].shape[0], lx[i].shape[1],
                                               Words.shape[1])))

    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1], Words.shape[1]))

    # unpack the training, validation, and test sets from datasets
    train_set, dev_set, test_set = datasets[0], datasets[1], datasets[2]

    train_set_lx = []
    train_set_lx_mask = []
    q_embedding = []
    offset = 2 * lsize
    """
    划分训练集的lx和rx,即train_set_lx,train_set_rx
    划分lxmask,rxmask,sessionmask和y
    """
    for i in range(max_turn):
        print(offset * i, offset * i + lsize)
        train_set_lx.append(
            theano.shared(np.asarray(train_set[:,
                                               offset * i:offset * i + lsize],
                                     dtype=theano.config.floatX),
                          borrow=True))
        train_set_lx_mask.append(
            theano.shared(np.asarray(train_set[:, offset * i +
                                               lsize:offset * i + 2 * lsize],
                                     dtype=theano.config.floatX),
                          borrow=True))

    train_set_rx = theano.shared(np.asarray(
        train_set[:, offset * max_turn:offset * max_turn + lsize],
        dtype=theano.config.floatX),
                                 borrow=True)
    train_set_rx_mask = theano.shared(np.asarray(
        train_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
        dtype=theano.config.floatX),
                                      borrow=True)
    train_set_session_mask = theano.shared(np.asarray(
        train_set[:, -max_turn - 1:-1], dtype=theano.config.floatX),
                                           borrow=True)
    train_set_y = theano.shared(np.asarray(train_set[:, -1], dtype="int32"),
                                borrow=True)

    val_set_lx = []
    val_set_lx_mask = []
    """
    验证集同训练集
    """
    for i in range(max_turn):
        val_set_lx.append(
            theano.shared(np.asarray(dev_set[:, offset * i:offset * i + lsize],
                                     dtype=theano.config.floatX),
                          borrow=True))
        val_set_lx_mask.append(
            theano.shared(np.asarray(dev_set[:, offset * i + lsize:offset * i +
                                             2 * lsize],
                                     dtype=theano.config.floatX),
                          borrow=True))

    val_set_rx = theano.shared(np.asarray(
        dev_set[:, offset * max_turn:offset * max_turn + lsize],
        dtype=theano.config.floatX),
                               borrow=True)
    val_set_rx_mask = theano.shared(np.asarray(
        dev_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
        dtype=theano.config.floatX),
                                    borrow=True)
    val_set_session_mask = theano.shared(np.asarray(
        dev_set[:, -max_turn - 1:-1], dtype=theano.config.floatX),
                                         borrow=True)
    val_set_y = theano.shared(np.asarray(dev_set[:, -1], dtype="int32"),
                              borrow=True)
    """
    构造训练集字典输入
    """
    dic = {}
    for i in range(max_turn):
        dic[lx[i]] = train_set_lx[i][index * batch_size:(index + 1) *
                                     batch_size]
        dic[lxmask[i]] = train_set_lx_mask[i][index * batch_size:(index + 1) *
                                              batch_size]
    dic[rx] = train_set_rx[index * batch_size:(index + 1) * batch_size]
    dic[sessionmask] = train_set_session_mask[index * batch_size:(index + 1) *
                                              batch_size]
    dic[rxmask] = train_set_rx_mask[index * batch_size:(index + 1) *
                                    batch_size]
    dic[y] = train_set_y[index * batch_size:(index + 1) * batch_size]
    """
    构造验证集字典输入
    """
    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index * batch_size:(index + 1) *
                                       batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rx] = val_set_rx[index * batch_size:(index + 1) * batch_size]
    val_dic[sessionmask] = val_set_session_mask[index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rxmask] = val_set_rx_mask[index * batch_size:(index + 1) *
                                      batch_size]
    val_dic[y] = val_set_y[index * batch_size:(index + 1) * batch_size]
    """
    第一次GRU,q_embedding和r_embedding是输出的结果
    q_embedding是多轮对话构成的句子级的向量,list类型,len=10,每一条是(200,50,200)的矩阵;
    rlayer0_input是答句构成的句子级的向量,(200,50,200)的矩阵
    """
    sentence2vec = GRU(n_in=word_embedding_size,
                       n_hidden=hiddensize,
                       n_out=hiddensize)

    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng,
                            max_l,
                            session_input_size,
                            hidden_size=hiddensize)
    """
    卷积,输入是llayer0_input和rlayer0_input,q_embedding和r_embedding
    过程是llayer0_input中的每一条数据和rlayer0_input做卷积
    q_embedding中的每一条数据和r_embedding做卷积,合并结果
    输出是poolingoutput,list类型,len=10,每一条是(200,50)的矩阵;
    """
    poolingoutput = []

    for i in range(max_turn):
        poolingoutput.append(
            pooling_layer(llayer0_input[i], rlayer0_input, q_embedding[i],
                          r_embedding))

    # debug leftover: this shape check calls sys.exit() and would stop
    # training; keep it commented out unless debugging the convolution layer
    # con_test = theano.function([index],
    #                            pooling_layer(llayer0_input[0], rlayer0_input,
    #                                          q_embedding[0], r_embedding),
    #                            givens=dic,
    #                            on_unused_input='ignore')
    # print con_test(0).shape
    # sys.exit()
    """
    第二次GRU,输入是poolingoutput
    输出是res,(200,50)的矩阵,用一个50维的向量表示一个session
    """
    session2vec = GRU(n_in=session_input_size,
                      n_hidden=session_hidden_size,
                      n_out=session_hidden_size)

    res = session2vec(T.stack(poolingoutput, 1), sessionmask)
    """
    逻辑回归分类:输入是res,标签是y
    损失是negative_log_likelihood
    """
    classifier = LogisticRegression(res, session_hidden_size, 2, rng)

    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)

    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words]

    grad_updates = opt.Adam(cost=cost, params=params, lr=0.001)

    # define the training function
    train_model = theano.function([index],
                                  cost,
                                  updates=grad_updates,
                                  givens=dic,
                                  on_unused_input='ignore')

    # define the validation function
    val_model = theano.function([index], [cost, error],
                                givens=val_dic,
                                on_unused_input='ignore')
    best_dev = 1.

    n_train_batches = datasets[0].shape[0] / batch_size
    """
    开始灌数据输入,n_epochs是训练次数
    """
    for i in xrange(n_epochs):
        cost = 0
        total = 0.
        # training loop
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            batch_cost = train_model(minibatch_index)
            print "minibatch_index"
            print minibatch_index
            total = total + 1
            cost = cost + batch_cost
            if total % 2 == 0:
                print total, cost / total
        cost = cost / n_train_batches
        print "echo %d loss %f" % (i, cost)

        # validation loop
        cost = 0
        errors = 0
        j = 0
        for minibatch_index in xrange(datasets[1].shape[0] / batch_size):
            tcost, terr = val_model(minibatch_index)
            cost += tcost
            errors += terr
            j = j + 1
        cost = cost / j
        errors = errors / j
        if cost < best_dev:
            best_dev = cost
            save_params(params, model_name)
        print "echo %d dev_loss %f" % (i, cost)
        print "echo %d dev_accuracy %f" % (i, 1 - errors)