def ff(self, x):
        """Run a forward pass and return the output-layer scores for ``x``.

        ``x`` is first validated with ``check_dimension``. It is then pushed
        through every hidden layer (``self.w[i]``, ``self.b[i]`` followed by
        the ``self.hidden_active`` activation) and finally through the last
        affine layer (``self.lw``, ``self.lb``). No activation is applied to
        that final affine output in this method.

        ``x`` may be a scipy sparse matrix or a dense 2-D array.
        """
        self.check_dimension(x)

        batch_size, _ = x.shape

        activation = x
        for depth, weight in enumerate(self.w):
            if depth == 0 and sp.isspmatrix(x):
                # Sparse input: use the ``*`` operator instead of np.dot
                # for the first layer's product.
                activation = activation * weight
            else:
                activation = np.dot(activation, weight)
            # Add the layer bias to every row of the batch.
            activation += np.tile(self.b[depth], [batch_size, 1])
            activation = active(activation, self.hidden_active)

        scores = np.dot(activation, self.lw)
        scores += np.tile(self.lb, [batch_size, 1])
        return scores
    def ff(self, x):
        """Feed ``x`` forward and return the raw output of the last layer.

        After ``check_dimension`` has accepted ``x``, each hidden layer
        computes ``active(signal . w[i] + b[i], self.hidden_active)``. The
        value returned is ``signal . lw + lb``; this method does not apply
        an output activation.

        ``x`` is either a scipy sparse matrix or a dense 2-D array; only the
        first layer's product is affected by that distinction.
        """
        self.check_dimension(x)

        num_rows, _ = x.shape
        sparse_input = sp.isspmatrix(x)

        signal = x
        for layer_no, layer_w in enumerate(self.w):
            # Only the very first product can involve the sparse input.
            use_sparse_product = layer_no == 0 and sparse_input
            signal = (signal * layer_w if use_sparse_product
                      else np.dot(signal, layer_w))
            signal += np.tile(self.b[layer_no], [num_rows, 1])
            signal = active(signal, self.hidden_active)

        prediction = np.dot(signal, self.lw)
        prediction += np.tile(self.lb, [num_rows, 1])
        return prediction
    def bp(self, x, y, idx):
        """Back-propagate one batch and store the gradients on ``self``.

        The method runs a forward pass through the hidden layers, evaluates
        the output layer only at the positions where ``idx`` is non-zero,
        and then fills ``self.grad_lw``, ``self.grad_lb``, ``self.grad_w[i]``
        and ``self.grad_b[i]``. Every gradient is divided by
        ``idx.shape[0]``.

        Args:
            x: 2-D input batch, either a scipy sparse matrix or a dense array.
            y: targets; handed to ``grad`` together with the activated output.
            idx: 2-D mask (anything exposing ``nonzero()`` and ``shape``).
                Its non-zero positions select which output entries are
                computed. When the fraction of selected entries is below
                ``self.sparse_thr`` the output is kept as a CSR matrix,
                otherwise as a dense array.

        Returns:
            None. The results are written to the ``grad_*`` attributes.
        """
        self.check_dimension(x, y)

        # ---------------------------------------------------------------
        # Forward pass through the hidden layers; each layer's output is
        # kept because it is the input of the next layer's weight gradient.
        # ---------------------------------------------------------------
        n, _ = x.shape
        x_is_sparse = sp.isspmatrix(x)
        hidden_output = []
        tmp = x
        for i, w in enumerate(self.w):
            if i == 0 and x_is_sparse:
                tmp = tmp * w
            else:
                tmp = np.dot(tmp, w)
            tmp += np.tile(self.b[i], [n, 1])
            tmp = active(tmp, self.hidden_active)
            hidden_output.append(tmp)
        ins_factor = tmp

        # ---------------------------------------------------------------
        # Output layer, evaluated only at the positions selected by idx.
        # ---------------------------------------------------------------
        rows, cols = idx.nonzero()
        mo, no = idx.shape
        if len(rows) * 1.0 / (mo * no) < self.sparse_thr:
            # One dot product per selected (row, col) pair.
            data = np.einsum('ik,ki->i', ins_factor[rows, :], self.lw[:, cols])
            data += self.lb[cols]
            output = sp.csr_matrix((data, (rows, cols)), idx.shape)
        else:
            output = np.zeros(idx.shape)
            for i, j in zip(rows, cols):
                output[i, j] = np.dot(ins_factor[i, :], self.lw[:, j]) \
                               + self.lb[j]
        output = active(output, self.output_active)

        # ---------------------------------------------------------------
        # Gradient of the output layer.
        # ---------------------------------------------------------------
        grad_type = self.output_active + "_" + self.loss
        output_grad = grad(output, y, grad_type)

        num_rates, _ = idx.shape
        if sp.isspmatrix(output_grad):
            self.grad_lw = sp.csr_matrix(np.transpose(ins_factor)) * output_grad
            self.grad_lb = np.asarray(output_grad.sum(0))[0, :]
        else:
            # zeros_like instead of ``a - a``: the latter yields NaN wherever
            # the parameter itself is inf/NaN.
            self.grad_lw = np.zeros_like(self.lw)
            self.grad_lb = np.zeros_like(self.lb)
            for i, j in zip(rows, cols):
                self.grad_lw[:, j] += output_grad[i, j] * ins_factor[i, :]
                self.grad_lb[j] += output_grad[i, j]
        self.grad_lw /= num_rates
        self.grad_lb /= num_rates

        # ---------------------------------------------------------------
        # Gradient with respect to the last hidden output (instance factor).
        # ---------------------------------------------------------------
        ins_factor_grad = np.zeros_like(ins_factor)
        if sp.isspmatrix(output_grad):
            # Take coordinates and values from the gradient matrix itself so
            # each value is paired with its own (row, col); pairing
            # ``output_grad.data`` with ``idx.nonzero()`` is only correct
            # when both share exactly the same stored pattern and order.
            entries = output_grad.tocoo()
            for i, j, v in zip(entries.row, entries.col, entries.data):
                ins_factor_grad[i, :] += v * self.lw[:, j]
        else:
            for i, j in zip(rows, cols):
                ins_factor_grad[i, :] += output_grad[i, j] * self.lw[:, j]

        # ---------------------------------------------------------------
        # Backward pass through the hidden layers, last layer first.
        # ---------------------------------------------------------------
        tmp = ins_factor_grad
        for i in reversed(range(len(self.w))):
            # NOTE(review): the activation derivative is evaluated at the
            # back-propagated signal ``tmp`` rather than at
            # ``hidden_output[i]`` -- confirm this is what ``grad`` expects.
            tmp = tmp * grad(tmp, grad_type=self.hidden_active)
            if i == 0:
                if x_is_sparse:
                    t1, t2 = x.shape
                    if len(x.nonzero()[0]) * 1.0 / (t1 * t2) < self.sparse_thr:
                        # Sparse enough: keep the first-layer gradient sparse.
                        sparse_grad = np.transpose(x) * sp.csr_matrix(tmp) \
                                      / num_rates
                        self.grad_w[i] = sparse_grad.tocsr()
                    else:
                        self.grad_w[i] = np.transpose(x) * tmp / num_rates
                else:
                    self.grad_w[i] = np.dot(np.transpose(x), tmp) / num_rates
            else:
                self.grad_w[i] = np.dot(np.transpose(hidden_output[i - 1]), tmp) \
                                 / num_rates

            self.grad_b[i] = np.sum(tmp, 0) / num_rates
            if i > 0:
                # Propagate the signal to the previous layer's output.
                tmp = np.dot(tmp, np.transpose(self.w[i]))
    def bp(self, x, y, idx):
        """Back-propagate one batch and store the gradients on ``self``.

        The method runs a forward pass through the hidden layers, evaluates
        the output layer (fully when ``idx`` is ``None``, otherwise only at
        the non-zero positions of ``idx``), and then fills ``self.grad_lw``,
        ``self.grad_lb``, ``self.grad_w[i]`` and ``self.grad_b[i]``. Every
        gradient is divided by ``y.shape[0]``.

        Args:
            x: 2-D input batch, either a scipy sparse matrix or a dense array.
            y: 2-D targets; handed to ``grad`` together with the activated
                output.
            idx: ``None`` to use every output entry, or a 2-D mask (anything
                exposing ``nonzero()`` and ``shape``) whose non-zero positions
                select the output entries to compute. With a mask, the output
                is kept as a CSR matrix when the fraction of selected entries
                is below ``self.sparse_thr`` and as a dense array otherwise.

        Returns:
            None. The results are written to the ``grad_*`` attributes.
        """
        self.check_dimension(x, y)

        # ---------------------------------------------------------------
        # Forward pass through the hidden layers; each layer's output is
        # kept because it is the input of the next layer's weight gradient.
        # ---------------------------------------------------------------
        n, _ = x.shape
        x_is_sparse = sp.isspmatrix(x)
        hidden_output = []
        tmp = x
        for i, w in enumerate(self.w):
            if i == 0 and x_is_sparse:
                tmp = tmp * w
            else:
                tmp = np.dot(tmp, w)
            tmp += np.tile(self.b[i], [n, 1])
            tmp = active(tmp, self.hidden_active)
            hidden_output.append(tmp)
        ins_factor = tmp

        # ---------------------------------------------------------------
        # Output layer.
        # ---------------------------------------------------------------
        # ``is None`` rather than ``None == idx``: the latter is an
        # element-wise comparison for array-like masks.
        if idx is None:
            output = np.dot(ins_factor, self.lw)
            # Include the output bias, as ff() and the masked branches do;
            # grad_lb is computed for this path as well.
            output += np.tile(self.lb, [n, 1])
        else:
            rows, cols = idx.nonzero()
            mo, no = idx.shape
            if len(rows) * 1.0 / (mo * no) < self.sparse_thr:
                # One dot product per selected (row, col) pair.
                data = np.einsum('ik,ki->i',
                                 ins_factor[rows, :], self.lw[:, cols])
                data += self.lb[cols]
                output = sp.csr_matrix((data, (rows, cols)), idx.shape)
            else:
                output = np.zeros(idx.shape)
                for i, j in zip(rows, cols):
                    output[i, j] = np.dot(ins_factor[i, :], self.lw[:, j]) \
                                   + self.lb[j]

        output = active(output, self.output_active, idx)

        # ---------------------------------------------------------------
        # Gradient of the output layer.
        # ---------------------------------------------------------------
        grad_type = actlo2grad(self.output_active, self.loss)
        output_grad = grad(output, y, grad_type)

        num_rates, _ = y.shape
        if sp.isspmatrix(output_grad):
            self.grad_lw = sp.csr_matrix(
                np.transpose(ins_factor)) * output_grad
            self.grad_lb = np.asarray(output_grad.sum(0))[0, :]
        elif idx is None:
            self.grad_lw = np.asarray(
                np.dot(np.transpose(ins_factor), output_grad))
            # ravel() yields a 1-D vector whether ``output_grad`` is an
            # np.matrix (2-D column sum) or a plain ndarray (1-D column sum).
            self.grad_lb = np.asarray(output_grad.sum(0)).ravel()
        else:
            # zeros_like instead of ``a - a``: the latter yields NaN wherever
            # the parameter itself is inf/NaN.
            self.grad_lw = np.zeros_like(self.lw)
            self.grad_lb = np.zeros_like(self.lb)
            for i, j in zip(rows, cols):
                self.grad_lw[:, j] += output_grad[i, j] * ins_factor[i, :]
                self.grad_lb[j] += output_grad[i, j]
        self.grad_lw /= num_rates
        self.grad_lb /= num_rates

        # ---------------------------------------------------------------
        # Gradient with respect to the last hidden output (instance factor).
        # ---------------------------------------------------------------
        if sp.isspmatrix(output_grad):
            ins_factor_grad = np.zeros_like(ins_factor)
            # Read coordinates and values from the same COO view so they stay
            # aligned; ``nonzero()`` skips explicitly stored zeros while
            # ``.data`` keeps them.
            entries = output_grad.tocoo()
            for i, j, v in zip(entries.row, entries.col, entries.data):
                ins_factor_grad[i, :] += v * self.lw[:, j]
        elif idx is None:
            ins_factor_grad = np.asarray(
                np.dot(output_grad, np.transpose(self.lw)))
        else:
            ins_factor_grad = np.zeros_like(ins_factor)
            for i, j in zip(rows, cols):
                ins_factor_grad[i, :] += output_grad[i, j] * self.lw[:, j]

        # ---------------------------------------------------------------
        # Backward pass through the hidden layers, last layer first.
        # ---------------------------------------------------------------
        tmp = ins_factor_grad
        for i in reversed(range(len(self.w))):
            # NOTE(review): the activation derivative is evaluated at the
            # back-propagated signal ``tmp`` rather than at
            # ``hidden_output[i]`` -- confirm this is what ``grad`` expects.
            tmp = tmp * grad(tmp, grad_type=self.hidden_active)
            if i == 0:
                if x_is_sparse:
                    t1, t2 = x.shape
                    if len(x.nonzero()[0]) * 1.0 / (t1 * t2) < self.sparse_thr:
                        # Sparse enough: keep the first-layer gradient sparse.
                        sparse_grad = np.transpose(x) * sp.csr_matrix(tmp) \
                                      / num_rates
                        self.grad_w[i] = sparse_grad.tocsr()
                    else:
                        self.grad_w[i] = np.transpose(x) * tmp / num_rates
                else:
                    self.grad_w[i] = np.dot(np.transpose(x), tmp) / num_rates
            else:
                self.grad_w[i] = np.dot(np.transpose(hidden_output[i - 1]), tmp) \
                                 / num_rates

            self.grad_b[i] = np.sum(tmp, 0) / num_rates
            if i > 0:
                # Propagate the signal to the previous layer's output.
                tmp = np.dot(tmp, np.transpose(self.w[i]))