Example #1
 def ApplyActivation(self):
     state = self.state
     if self.activation == deepnet_pb2.Hyperparams.LOGISTIC:
         cm.sigmoid(state)
     elif self.activation == deepnet_pb2.Hyperparams.TANH:
         cm.tanh(state)
     elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR:
         state.greater_than(0, target=self.temp)
         state.mult(self.temp)
     elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR_SMOOTH:
         cm.log_1_plus_exp(state)
     elif self.activation == deepnet_pb2.Hyperparams.LINEAR:
         pass
     elif self.activation == deepnet_pb2.Hyperparams.SOFTMAX:
         state.max(axis=0, target=self.temp)
         state.add_row_mult(self.temp, -1)
         cm.exp(state)
         state.sum(axis=0, target=self.temp)
         self.temp.reciprocal()
         state.mult_by_row(self.temp)
     elif self.activation == deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
         state.max(axis=0, target=self.temp)
         state.add_row_mult(self.temp, -1)
         cm.exp(state)
         state.sum(axis=0, target=self.temp)
         self.NN.divide(self.temp, target=self.temp)
         state.mult_by_row(self.temp)
     else:
         raise Exception("Unknown activation")
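The SOFTMAX branch above follows the standard numerically stable recipe: subtract the per-column maximum before calling cm.exp so no exponential can overflow, then normalize. A minimal NumPy sketch of the same computation, assuming the units-by-cases layout used throughout these examples (the helper name is mine):

import numpy as np

def stable_softmax_columns(state):
    # softmax over each column of a (num_units, num_cases) array
    shifted = state - state.max(axis=0, keepdims=True)  # subtract the column max
    e = np.exp(shifted)                                  # all exponentials are now <= 1
    return e / e.sum(axis=0, keepdims=True)              # normalize each column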
Example #2
def maskedSingleSoftmax(netInputs, tempMatrix, sMask, notSMask, onesCol,
                        tempRow):
    """
    We now assume we have a single k way softmax and some number of
    Gaussian units.  So we only want to apply the softmax activation
    function to the first k rows of netInputs.
    """
    assert (onesCol.shape[0] == netInputs.shape[0])
    assert (tempRow.shape[1] == netInputs.shape[1])
    assert (tempRow.shape[0] == onesCol.shape[1])
    assert (netInputs.shape == tempMatrix.shape == sMask.shape ==
            notSMask.shape)

    c = num.finfo(num.float32).min / 16
    assert (num.exp(c + 200) == 0.0)
    netInputs.mult(sMask, target=tempMatrix)
    tempMatrix.add_mult(notSMask, c)
    tempMatrix.max(axis=0, target=tempRow)
    #onesCol.assign_scalar(1)
    tempMatrix.subtract_dot(onesCol, tempRow)
    cm.exp(tempMatrix)
    tempMatrix.sum(axis=0, target=tempRow)
    tempRow.reciprocal()
    tempMatrix.mult_by_row(tempRow)
    netInputs.mult(notSMask)
    tempMatrix.mult(sMask)
    netInputs.add(tempMatrix)
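The masking trick above pushes the non-softmax (Gaussian) rows down to a hugely negative constant c, so their exponentials underflow to exactly zero and drop out of the normalizer; the original values on those rows are restored at the end. A rough NumPy rendering, assuming notSMask is simply 1 - sMask as the names suggest (helper name and shapes are mine):

import numpy as np

def masked_softmax_rows(net_inputs, s_mask):
    # s_mask is 1 on the k softmax rows and 0 on the Gaussian rows
    c = np.finfo(np.float32).min / 16
    t = net_inputs * s_mask + (1.0 - s_mask) * c      # mask out the Gaussian rows
    t -= t.max(axis=0, keepdims=True)                 # per-column max for stability
    e = np.exp(t)                                     # masked rows underflow to 0
    p = e / e.sum(axis=0, keepdims=True)
    # softmax on the masked rows, untouched inputs everywhere else
    return p * s_mask + net_inputs * (1.0 - s_mask)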
Example #3
def ExactZ_binary_binary(model):
    assert len(model.layer) == 2, 'Only implemented for RBMs.'
    steps = len(schedule)
    input_layer = model.layer[0]
    hidden_layer = model.layer[1]
    edge = model.edge[0]
    w = edge.params['weight']
    a = hidden_layer.params['bias']
    b = input_layer.params['bias']
    numvis, numhid = w.shape
    batchsize = 2**numvis
    input_layer.AllocateBatchsizeDependentMemory(batchsize)
    hidden_layer.AllocateBatchsizeDependentMemory(batchsize)
    all_inputs = GetAll(numvis)
    w_ais = cm.CUDAMatrix(np.zeros((1, batchsize)))
    input_layer.sample.overwrite(all_inputs)
    cm.dot(w.T, input_layer.sample, target=hidden_layer.state)
    hidden_layer.state.add_col_vec(a)
    cm.log_1_plus_exp(hidden_layer.state)
    w_ais.add_sums(hidden_layer.state, axis=0)
    w_ais.add_dot(b.T, input_layer.state)
    offset = float(w_ais.asarray().max())
    w_ais.subtract(offset)
    cm.exp(w_ais)
    z = offset + np.log(w_ais.asarray().sum())
    return z
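This computes the exact log partition function of a small binary-binary RBM by brute force: enumerate every visible configuration, sum out the hidden units with a softplus (cm.log_1_plus_exp), add the visible bias term, and finish with a max-shifted log-sum-exp so cm.exp cannot overflow. A NumPy sketch of the same calculation for a toy RBM, with bias vectors flattened to 1-D (not the deepnet API):

import numpy as np

def exact_log_z(w, a, b):
    # w: (numvis, numhid) weights, a: (numhid,) hidden biases, b: (numvis,) visible biases
    numvis = w.shape[0]
    # all 2**numvis binary visible vectors, one per column
    v = np.array([[(i >> k) & 1 for i in range(2 ** numvis)]
                  for k in range(numvis)], dtype=float)
    # unnormalized log-probability of each column after summing out the hidden units
    log_p = np.logaddexp(0.0, w.T.dot(v) + a[:, None]).sum(axis=0) + b.dot(v)
    # log Z via the same offset trick used above
    m = log_p.max()
    return m + np.log(np.exp(log_p - m).sum())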
Example #4
def maskedSingleSoftmax(netInputs, tempMatrix, sMask, notSMask, onesCol, tempRow):
    """
    We now assume we have a single k way softmax and some number of
    Gaussian units.  So we only want to apply the softmax activation
    function to the first k rows of netInputs.
    """
    assert(onesCol.shape[0] == netInputs.shape[0])
    assert(tempRow.shape[1] == netInputs.shape[1])
    assert(tempRow.shape[0] == onesCol.shape[1])
    assert(netInputs.shape == tempMatrix.shape == sMask.shape == notSMask.shape)
    
    c = num.finfo(num.float32).min/16
    assert(num.exp(c+200) == 0.0)
    netInputs.mult(sMask, target = tempMatrix)
    tempMatrix.add_mult(notSMask, c)
    tempMatrix.max(axis = 0, target = tempRow)
    #onesCol.assign_scalar(1)
    tempMatrix.subtract_dot(onesCol, tempRow)
    cm.exp(tempMatrix)
    tempMatrix.sum(axis = 0, target = tempRow)
    tempRow.reciprocal()
    tempMatrix.mult_by_row(tempRow)
    netInputs.mult(notSMask)
    tempMatrix.mult(sMask)
    netInputs.add(tempMatrix)
Example #5
    def log(self, x):
        self.tmp1.assign(x)

        self.tmp1.mult(-1)
        cm.exp(self.tmp1, target=self.tmp1)
        self.tmp1.add(1)
        cm.pow(self.tmp1, -1)
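Despite its name (probably short for "logistic"), the sequence above leaves the logistic sigmoid of x in self.tmp1: negate, exponentiate, add one, and raise to the power -1 gives 1/(1 + exp(-x)). A plain NumPy equivalent for reference (the helper name is mine):

import numpy as np

def logistic(x):
    # same steps as above: -x, exp, +1, elementwise reciprocal
    return 1.0 / (1.0 + np.exp(-x))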
Example #6
File: ais.py Project: ANB2/deepnet
def ExactZ_binary_binary(model):
  assert len(model.layer) == 2, 'Only implemented for RBMs.'
  steps = len(schedule)
  input_layer = model.layer[0]
  hidden_layer = model.layer[1]
  edge = model.edge[0]
  w = edge.params['weight']
  a = hidden_layer.params['bias']
  b = input_layer.params['bias']
  numvis, numhid = w.shape
  batchsize = 2**numvis
  input_layer.AllocateBatchsizeDependentMemory(batchsize)
  hidden_layer.AllocateBatchsizeDependentMemory(batchsize)
  all_inputs = GetAll(numvis)
  w_ais = cm.CUDAMatrix(np.zeros((1, batchsize)))
  input_layer.sample.overwrite(all_inputs)
  cm.dot(w.T, input_layer.sample, target=hidden_layer.state)
  hidden_layer.state.add_col_vec(a)
  cm.log_1_plus_exp(hidden_layer.state)
  w_ais.add_sums(hidden_layer.state, axis=0)
  w_ais.add_dot(b.T, input_layer.state)
  offset = float(w_ais.asarray().max())
  w_ais.subtract(offset)
  cm.exp(w_ais)
  z = offset + np.log(w_ais.asarray().sum())
  return z
Example #7
    def compute_output(self,gpu_data):
        """
        Computes p(y|x). Puts the result in self.gpu_p_y_given_x.
        """
        
        cm.dot(self.W,gpu_data,self.gpu_act_from_x)
        self.gpu_act_from_x.add_col_vec(self.c)
        for c in range(self.n_classes):
            cm.dot(self.U,self.gpu_target_vectors.slice(c,c+1),self.gpu_act_from_y)
            # to avoid memory creation, using gpu_h
            # and gpu_h_sample for these computations
            self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y,target=self.gpu_h)
            cm.exp(self.gpu_h,self.gpu_h_sample)
            self.gpu_h_sample.add(1.)
            cm.log(self.gpu_h_sample,self.gpu_h)
            self.gpu_h.sum(axis=0,target=self.gpu_negative_free_energy_for_y)
            cm.dot(self.d.T,self.gpu_target_vectors.slice(c,c+1),target=self.gpu_bias_from_y)
            self.gpu_negative_free_energy_for_y.add_col_vec(self.gpu_bias_from_y)
            self.gpu_negative_free_energy_for_y.transpose(target=self.gpu_negative_free_energy.slice(c,c+1))
        # Subtracting mean for more stable softmax computation
        self.gpu_negative_free_energy.sum(axis=1,target=self.gpu_mean_negative_free_energy)
        self.gpu_mean_negative_free_energy.divide(-self.n_classes)
        self.gpu_negative_free_energy.add_col_vec(self.gpu_mean_negative_free_energy)

        cm.exp(self.gpu_negative_free_energy,target=self.gpu_negative_free_energy)
        self.gpu_negative_free_energy.sum(axis=1,target=self.gpu_p_y_given_x_norm)
        for c in range(self.n_classes):
            self.gpu_negative_free_energy.slice(c,c+1).divide(self.gpu_p_y_given_x_norm,
                                                              target=self.gpu_p_y_given_x.slice(c,c+1))
        self.gpu_p_y_given_x.transpose(target=self.gpu_p_y_given_x_trans)
Example #8
 def ApplyActivation(self):
     state = self.state
     if self.activation == deepnet_pb2.Hyperparams.LOGISTIC:
         cm.sigmoid(state)
     elif self.activation == deepnet_pb2.Hyperparams.TANH:
         cm.tanh(state)
     elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR:
         state.greater_than(0, target=self.temp)
         state.mult(self.temp)
     elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR_SMOOTH:
         cm.log_1_plus_exp(state)
     elif self.activation == deepnet_pb2.Hyperparams.LINEAR:
         pass
     elif self.activation == deepnet_pb2.Hyperparams.SOFTMAX:
         state.max(axis=0, target=self.temp)
         state.add_row_mult(self.temp, -1)
         cm.exp(state)
         state.sum(axis=0, target=self.temp)
         self.temp.reciprocal()
         state.mult_by_row(self.temp)
     elif self.activation == deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
         state.max(axis=0, target=self.temp)
         state.add_row_mult(self.temp, -1)
         cm.exp(state)
         state.sum(axis=0, target=self.temp)
         self.NN.divide(self.temp, target=self.temp)
         state.mult_by_row(self.temp)
     else:
         raise Exception('Unknown activation')
Example #9
    def HMCSample(self, hActs=None):
        if hActs == None:
            hActs = self.hActs

        epsilon = self.hmcStepSize
        if self.stepSizeIsMean:
            epsilon = -self.hmcStepSize * num.log(1.0 - num.random.rand())

        self.negVis.assign(self.vis)
        #sample a velocity and temporal direction
        self.vel.fill_with_randn()
        timeDir = 2 * num.random.randint(2) - 1

        self.Hamiltonian(self.prevHamil)

        #half-step
        self.acceleration()  #updates self.accel
        self.vel.add_mult(self.accel, -0.5 * timeDir * epsilon)
        self.negVis.add_mult(self.vel, timeDir * epsilon)
        #full leap-frog steps
        for s in range(self.hmcSteps - 1):
            self.acceleration()
            self.vel.add_mult(self.accel, -timeDir * epsilon)
            self.negVis.add_mult(self.vel, timeDir * epsilon)
        #final half-step
        self.acceleration()
        self.vel.add_mult(self.accel, -0.5 * timeDir * epsilon)
        self.negVis.add_mult(self.vel, timeDir * epsilon)

        self.Hamiltonian(self.hamil)

        #compute rejections
        self.prevHamil.subtract(
            self.hamil, target=self.thresh
        )  #don't really need this new variable, but it is small
        cm.exp(self.thresh)
        self.tempRow.fill_with_rand()
        self.tempRow.less_than(
            self.thresh, target=self.tempRow
        )  #tempRow entries are 0 for reject and 1 for accept
        self.tempRow.copy_to_host()
        rejRate = self.tempRow.numpy_array.sum() / float(self.mbsz)
        rejRate = 1 - rejRate
        self.negVis.mult_by_row(self.tempRow)  #zero out rejected columns
        negate(
            self.tempRow)  #tempRow entries are 1 for reject and 0 for accept
        self.vis.mult_by_row(self.tempRow, target=self.tempVisMB)
        self.negVis.add(self.tempVisMB)

        smoothing = 0.9
        self.runningAvRej = smoothing * self.runningAvRej + (
            1.0 - smoothing) * rejRate
        tol = 0.05
        #perhaps add this in later? right now the step size HAS to change unless it hits a max or min
        #if self.runningAvRej < self.targetRejRate*(1-tol) or self.runningAvRej < self.targetRejRate*(1+tol):
        #    pass
        if self.runningAvRej < self.targetRejRate:
            self.hmcStepSize = min(self.hmcStepSize * 1.01, self.maxStepSize)
        else:
            self.hmcStepSize = max(self.hmcStepSize * 0.99, self.minStepSize)
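The accept/reject step above is the usual Metropolis test: thresh holds exp(H_old - H_new) per column (the Hamiltonian already includes the kinetic term), a uniform draw below that ratio accepts the proposal, and the two mult_by_row calls splice accepted proposals and rejected originals back into one minibatch. A compact NumPy restatement of just the test (names are mine):

import numpy as np

def metropolis_accept(prev_hamil, hamil):
    # accept column j with probability min(1, exp(H_old_j - H_new_j))
    thresh = np.exp(prev_hamil - hamil)
    return np.random.rand(*thresh.shape) < thresh   # True = accept, False = reject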
Example #10
    def costAndGrad(self,data,labels):
        
        batchSize = data.shape[1]
        self.setViews(batchSize)
        
        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in self.stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            if i <= len(self.layerSizes):
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)
        self.deltasC.assign(cm.CUDAMatrix(deltas))

        if skip:
            return cost,self.grad,skip

        # back prop
        nl = len(self.layerSizes)
        i = nl 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(self.stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=self.grad[i][0])
            deltasIn.sum(axis=1,target=self.grad[i][1])

            # compute next layer deltas
            if i > 0:
                self.hActs[i].sign(target=self.tmpGrad)
                cm.dot(w.T,deltasIn,target=deltasOut)
                deltasOut.mult(self.tmpGrad)

            if i == nl:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
Example #11
    def HMCSample(self, hActs = None):
        if hActs == None:
            hActs = self.hActs

        epsilon = self.hmcStepSize
        if self.stepSizeIsMean:
            epsilon = -self.hmcStepSize*num.log(1.0-num.random.rand())
        
        self.negVis.assign(self.vis)
        #sample a velocity and temporal direction
        self.vel.fill_with_randn()
        timeDir = 2*num.random.randint(2)-1
        
        self.Hamiltonian(self.prevHamil)
        
        #half-step
        self.acceleration() #updates self.accel
        self.vel.add_mult(self.accel, -0.5*timeDir*epsilon)
        self.negVis.add_mult(self.vel, timeDir*epsilon)
        #full leap-frog steps
        for s in range(self.hmcSteps-1):
            self.acceleration()
            self.vel.add_mult(self.accel, -timeDir*epsilon)
            self.negVis.add_mult(self.vel, timeDir*epsilon)
        #final half-step
        self.acceleration()
        self.vel.add_mult(self.accel, -0.5*timeDir*epsilon)
        self.negVis.add_mult(self.vel, timeDir*epsilon)
        
        self.Hamiltonian(self.hamil)
        
        #compute rejections
        self.prevHamil.subtract(self.hamil, target = self.thresh) #don't really need this new variable, but it is small
        cm.exp(self.thresh)
        self.tempRow.fill_with_rand()
        self.tempRow.less_than(self.thresh, target = self.tempRow) #tempRow entries are 0 for reject and 1 for accept
        self.tempRow.copy_to_host()
        rejRate = self.tempRow.numpy_array.sum()/float(self.mbsz)
        rejRate = 1-rejRate
        self.negVis.mult_by_row(self.tempRow) #zero out rejected columns
        negate(self.tempRow) #tempRow entries are 1 for reject and 0 for accept
        self.vis.mult_by_row(self.tempRow, target = self.tempVisMB)
        self.negVis.add(self.tempVisMB)

        smoothing = 0.9
        self.runningAvRej = smoothing*self.runningAvRej + (1.0-smoothing)*rejRate
        tol = 0.05
        #perhaps add this in later? right now the step size HAS to change unless it hits a max or min
        #if self.runningAvRej < self.targetRejRate*(1-tol) or self.runningAvRej < self.targetRejRate*(1+tol):
        #    pass
        if self.runningAvRej < self.targetRejRate:
            self.hmcStepSize = min(self.hmcStepSize*1.01, self.maxStepSize)
        else:
            self.hmcStepSize = max(self.hmcStepSize*0.99, self.minStepSize)
Example #12
def logOnePlusExp(x, temp, targ = None):
    """
    When this function is done, x should contain log(1+exp(x)).  We
    clobber the value of temp.  We compute log(1+exp(x)) as x +
    log(1+exp(-x)), which will hopefully be more finite-precision
    friendly.
    """
    assert(x.shape == temp.shape)
    x.mult(-1, target = temp)
    cm.exp(temp)
    temp.add(1)
    cm.log(temp)
    x.add(temp, target = targ)
Example #13
def logOnePlusExp(x, temp, targ=None):
    """
    When this function is done, x should contain log(1+exp(x)).  We
    clobber the value of temp.  We compute log(1+exp(x)) as x +
    log(1+exp(-x)), which will hopefully be more finite-precision
    friendly.
    """
    assert (x.shape == temp.shape)
    x.mult(-1, target=temp)
    cm.exp(temp)
    temp.add(1)
    cm.log(temp)
    x.add(temp, target=targ)
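The rewrite used here is the identity log(1 + exp(x)) = x + log(1 + exp(-x)), which keeps the exponential from overflowing when x is large and positive; fully sign-safe softplus implementations additionally branch on the sign of x. A quick NumPy check of the sign-safe form against a reference, for illustration only:

import numpy as np

x = np.array([-50.0, -5.0, 0.0, 5.0, 50.0])
stable = np.maximum(x, 0.0) + np.log1p(np.exp(-np.abs(x)))  # sign-safe softplus
reference = np.logaddexp(0.0, x)                            # log(1 + exp(x))
assert np.allclose(stable, reference)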
Example #14
    def negative_free_energy(self, gpu_data):
        """
        Computes the negative free-energy.
        Outputs a reference to a pre-allocated GPU variable
        containing the result.
        """

        cm.dot(self.W, gpu_data, self.gpu_h)
        self.gpu_h.add_col_vec(self.c)
        # to avoid memory creation, using gpu_h
        # and gpu_h_sample for these computations
        cm.exp(self.gpu_h, self.gpu_h_sample)
        self.gpu_h_sample.add(1.)
        cm.log(self.gpu_h_sample, self.gpu_h)
        self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy)
        self.gpu_negative_free_energy.add_dot(self.b.T, gpu_data)
        return self.gpu_negative_free_energy
Example #15
    def negative_free_energy(self,gpu_data):
        """
        Computes the negative free-energy.
        Outputs a reference to a pre-allocated GPU variable
        containing the result.
        """

        cm.dot(self.W,gpu_data,self.gpu_h)
        self.gpu_h.add_col_vec(self.c)
        # to avoid memory creation, using gpu_h
        # and gpu_h_sample for these computations
        cm.exp(self.gpu_h,self.gpu_h_sample)
        self.gpu_h_sample.add(1.)
        cm.log(self.gpu_h_sample,self.gpu_h)
        self.gpu_h.sum(axis=0,target=self.gpu_negative_free_energy)
        self.gpu_negative_free_energy.add_dot(self.b.T,gpu_data)
        return self.gpu_negative_free_energy
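The exp / add(1.) / log sequence above is just softplus, log(1 + exp(h)), reusing gpu_h_sample as scratch space; cudamat also exposes the same quantity as a single call, cm.log_1_plus_exp, which the ApplyActivation examples above use. A small sketch of the equivalence, assuming cudamat is installed and a GPU is available:

import numpy as np
import cudamat as cm

cm.cublas_init()
h = cm.CUDAMatrix(np.array(np.random.randn(4, 3), dtype=np.float32, order='F'))

# three-step version, as in negative_free_energy above
a = cm.empty(h.shape)
cm.exp(h, a)
a.add(1.)
cm.log(a)                        # a now holds log(1 + exp(h))

# fused version
b = cm.empty(h.shape)
cm.log_1_plus_exp(h, target=b)

print(np.max(np.abs(a.asarray() - b.asarray())))  # should be ~0
cm.cublas_shutdown()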
Example #16
def test_exp():
    m = 256
    n = 128
    a = np.array(np.random.randn(m, n), dtype=np.float32, order='F')
    b = np.array(np.random.randn(m, n), dtype=np.float32, order='F')

    c = np.exp(a)

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)
    cm.exp(m1, target = m2)
    cm.exp(m1)

    m1.copy_to_host()
    m2.copy_to_host()

    assert np.max(np.abs(c - m1.numpy_array)) < 10**-4, "Error in cudamat.exp exceeded threshold"
    assert np.max(np.abs(c - m2.numpy_array)) < 10**-4, "Error in cudamat.exp exceeded threshold"
Example #18
def singleSoftmax(netInputs, tempRow):
    """
    We modify netInputs in place to hold the softmax activation
    probabilities and compute them in a numerically stable way.
    """
    #assert(tempCol.shape[0] == netInputs.shape[0])
    #assert(tempRow.shape[0] == tempCol.shape[1])
    assert (tempRow.shape[1] == netInputs.shape[1])

    netInputs.max(axis=0, target=tempRow)
    #these two lines should be faster than the two below them and let us remove the tempCol param
    tempRow.mult(-1)
    netInputs.add_row_vec(tempRow)
    #tempCol.assign_scalar(1)
    #netInputs.subtract_dot(tempCol, tempRow)
    cm.exp(netInputs)
    netInputs.sum(axis=0, target=tempRow)
    tempRow.reciprocal()
    netInputs.mult_by_row(tempRow)
Example #19
 def compute_energy_mcRBM_visual(self, data, normdata, energy, VF, FH,
                                 bias_cov, bias_vis, w_mean, bias_mean, t1,
                                 t2, t6, feat, featsq, feat_mean, length,
                                 lengthsq, normcoeff, small, num_vis):
     # normalize input data vectors
     data.mult(data, target=t6)  # DxP (nr input dims x nr samples)
     t6.sum(axis=0, target=lengthsq)  # 1xP
     lengthsq.mult(0.5,
                   target=energy)  # energy of quadratic regularization term
     lengthsq.mult(1. /
                   num_vis)  # normalize by number of components (like std)
     lengthsq.add(small)  # small prevents division by 0
     cmt.sqrt(lengthsq, target=length)
     length.reciprocal(target=normcoeff)  # 1xP
     data.mult_by_row(normcoeff, target=normdata)  # normalized data
     ## potential
     # covariance contribution
     cmt.dot(VF.T, normdata, target=feat)  # HxP (nr factors x nr samples)
     feat.mult(feat, target=featsq)  # HxP
     cmt.dot(FH.T, featsq, target=t1)  # OxP (nr cov hiddens x nr samples)
     t1.mult(-0.5)
     t1.add_col_vec(bias_cov)  # OxP
     cmt.exp(t1)  # OxP
     t1.add(1, target=t2)  # OxP
     cmt.log(t2)
     t2.mult(-1)
     energy.add_sums(t2, axis=0)
     # mean contribution
     cmt.dot(w_mean.T, data,
             target=feat_mean)  # HxP (nr mean hiddens x nr samples)
     feat_mean.add_col_vec(bias_mean)  # HxP
     cmt.exp(feat_mean)
     feat_mean.add(1)
     cmt.log(feat_mean)
     feat_mean.mult(-1)
     energy.add_sums(feat_mean, axis=0)
     # visible bias term
     data.mult_by_col(bias_vis, target=t6)
     t6.mult(-1)  # DxP
     energy.add_sums(t6, axis=0)  # 1xP
     # kinetic
     data.mult(data, target=t6)
     energy.add_sums(t6, axis=0, mult=.5)
Example #20
def singleSoftmax(netInputs, tempRow):
    """
    We modify netInputs in place to hold the softmax activation
    probabilities and compute them in a numerically stable way.
    """
    #assert(tempCol.shape[0] == netInputs.shape[0])
    #assert(tempRow.shape[0] == tempCol.shape[1])
    assert(tempRow.shape[1] == netInputs.shape[1])
    
    netInputs.max(axis = 0, target = tempRow)
    #these two lines should be faster than the two below them and let us remove the tempCol param
    tempRow.mult(-1)
    netInputs.add_row_vec(tempRow)
    #tempCol.assign_scalar(1)
    #netInputs.subtract_dot(tempCol, tempRow)
    cm.exp(netInputs)
    netInputs.sum(axis = 0, target = tempRow)
    tempRow.reciprocal()
    netInputs.mult_by_row(tempRow)
Example #21
def softmax(eta):
    #temp = cm.empty((eta.shape[0],1))
    temp = cm.empty((1,eta.shape[1]))
    # always take the numerically stable branch: subtract the per-column max before exponentiating
    if True:
        eta.max(axis = 0, target = temp)
        #print eta.shape
        #print temp.shape
        temp.mult(-1)
        eta.add_row_vec(temp)
        cm.exp(eta)
        eta.sum(axis = 0, target = temp)
        temp.reciprocal()
        eta.mult_by_row(temp)
#    else:
#        cm.exp(eta)
#        eta.sum(axis = 0, target = temp)
#        temp.reciprocal()
#        eta.mult_by_col(temp)
        
Example #22
    def compute_output(self, gpu_data):
        """
        Computes p(y|x). Puts the result in self.gpu_p_y_given_x.
        """

        cm.dot(self.W, gpu_data, self.gpu_act_from_x)
        self.gpu_act_from_x.add_col_vec(self.c)
        for c in range(self.n_classes):
            cm.dot(self.U, self.gpu_target_vectors.slice(c, c + 1),
                   self.gpu_act_from_y)
            # to avoid memory creation, using gpu_h
            # and gpu_h_sample for these computations
            self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y,
                                            target=self.gpu_h)
            cm.exp(self.gpu_h, self.gpu_h_sample)
            self.gpu_h_sample.add(1.)
            cm.log(self.gpu_h_sample, self.gpu_h)
            self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy_for_y)
            cm.dot(self.d.T,
                   self.gpu_target_vectors.slice(c, c + 1),
                   target=self.gpu_bias_from_y)
            self.gpu_negative_free_energy_for_y.add_col_vec(
                self.gpu_bias_from_y)
            self.gpu_negative_free_energy_for_y.transpose(
                target=self.gpu_negative_free_energy.slice(c, c + 1))
        # Subtracting mean for more stable softmax computation
        self.gpu_negative_free_energy.sum(
            axis=1, target=self.gpu_mean_negative_free_energy)
        self.gpu_mean_negative_free_energy.divide(-self.n_classes)
        self.gpu_negative_free_energy.add_col_vec(
            self.gpu_mean_negative_free_energy)

        cm.exp(self.gpu_negative_free_energy,
               target=self.gpu_negative_free_energy)
        self.gpu_negative_free_energy.sum(axis=1,
                                          target=self.gpu_p_y_given_x_norm)
        for c in range(self.n_classes):
            self.gpu_negative_free_energy.slice(c, c + 1).divide(
                self.gpu_p_y_given_x_norm,
                target=self.gpu_p_y_given_x.slice(c, c + 1))
        self.gpu_p_y_given_x.transpose(target=self.gpu_p_y_given_x_trans)
Example #23
def project_words_gpu(projection_matrix, similarity_matrix, kernel_name,
                      hyperparam):
    import cudamat as cm
    if kernel_name == "poly":
        k = cm.pow(cm.CUDAMatrix(similarity_matrix), hyperparam)
    elif kernel_name == 'rbf':
        k = cm.exp((cm.pow(cm.CUDAMatrix(1 - similarity_matrix),
                           2)).mult(-hyperparam))
    else:
        raise NotImplementedError(f'{kernel_name} not yet implemented for GPU')

    return cm.dot(k, cm.CUDAMatrix(projection_matrix)).asarray()
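On the GPU the RBF kernel is assembled as exp applied elementwise to -hyperparam * (1 - S)^2, and the polynomial kernel as an elementwise power. A CPU reference of the same two kernels for comparison (helper name is mine; shapes as in the GPU version):

import numpy as np

def project_words_cpu(projection_matrix, similarity_matrix, kernel_name, hyperparam):
    if kernel_name == "poly":
        k = similarity_matrix ** hyperparam
    elif kernel_name == "rbf":
        k = np.exp(-hyperparam * (1.0 - similarity_matrix) ** 2)
    else:
        raise NotImplementedError(f"{kernel_name} not yet implemented")
    return k.dot(projection_matrix)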
Example #24
    def costAndGrad(self, data, labels=None, sentence=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf, _ = self.stack[-2]
            wtb, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf, _ = self.grad[-2]
                dwtb, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop #TODO copy to device here
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0, self.maxAct, col=0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
                for t in xrange(1, T):
                    cm.mvdot_col_slice(wtf,
                                       self.hActsFor,
                                       t - 1,
                                       self.hActsFor,
                                       t,
                                       beta=1.0)
                    self.hActsFor.minmax(0.0, self.maxAct, col=t)
                    cm.mvdot_col_slice(wtb,
                                       self.hActsBack,
                                       T - t,
                                       self.hActsBack,
                                       T - t - 1,
                                       beta=1.0)
                    self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
                self.hActsFor.add(self.hActsBack, target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm()**2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
                self.hActsBack.within(0.0,
                                      self.maxAct,
                                      target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
                self.deltasBack.mult_slice(0, self.tmpGradBack, 0)

                for t in xrange(1, T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,
                                       self.deltasFor,
                                       T - t,
                                       self.deltasFor,
                                       T - t - 1,
                                       beta=1.0)
                    cm.mvdot_col_slice(wtb.T,
                                       self.deltasBack,
                                       t - 1,
                                       self.deltasBack,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor,
                                              T - t - 1)
                    self.deltasBack.mult_slice(t, self.tmpGradBack, t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1, T),
                       self.hActsFor.get_col_slice(0, T - 1).T,
                       target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                       self.hActsBack.get_col_slice(1, T).T,
                       target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack, target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost, self.grad, skip
Example #25
def sinkhorn(a,
             b,
             M_GPU,
             reg,
             numItermax=1000,
             stopThr=1e-9,
             verbose=False,
             log=False,
             returnAsGPU=False):
    # init data
    Nini = len(a)
    Nfin = len(b)

    if log:
        log = {'err': []}

    # we assume that no distances are null except those of the diagonal of
    # distances
    u = (np.ones(Nini) / Nini).reshape((Nini, 1))
    u_GPU = cudamat.CUDAMatrix(u)
    a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
    ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
    v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
    v_GPU = cudamat.CUDAMatrix(v)
    b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))

    M_GPU.divide(-reg)

    K_GPU = cudamat.exp(M_GPU)

    ones_GPU.divide(a_GPU, target=a_GPU)
    Kp_GPU = cudamat.empty(K_GPU.shape)
    K_GPU.mult_by_col(a_GPU, target=Kp_GPU)

    tmp_GPU = cudamat.empty(K_GPU.shape)

    cpt = 0
    err = 1
    while (err > stopThr and cpt < numItermax):
        uprev_GPU = u_GPU.copy()
        vprev_GPU = v_GPU.copy()

        KtransposeU_GPU = K_GPU.transpose().dot(u_GPU)
        b_GPU.divide(KtransposeU_GPU, target=v_GPU)
        ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)

        if (np.any(KtransposeU_GPU.asarray() == 0) or not u_GPU.allfinite()
                or not v_GPU.allfinite()):
            # we have reached the machine precision
            # come back to previous solution and quit loop
            print('Warning: numerical errors at iteration', cpt)
            u_GPU = uprev_GPU.copy()
            v_GPU = vprev_GPU.copy()
            break
        if cpt % 10 == 0:
            # we can speed up the process by checking for the error only all
            # the 10th iterations
            K_GPU.mult_by_col(u_GPU, target=tmp_GPU)
            tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU)

            bcopy_GPU = b_GPU.copy().transpose()
            bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1)
            err = bcopy_GPU.euclid_norm()**2
            if log:
                log['err'].append(err)

            if verbose:
                if cpt % 200 == 0:
                    print(('{:5s}|{:12s}'.format('It.', 'Err') + '\n' +
                           '-' * 19))
                print(('{:5d}|{:8e}|'.format(cpt, err)))
        cpt += 1
    if log:
        log['u'] = u_GPU.asarray()
        log['v'] = v_GPU.asarray()

    K_GPU.mult_by_col(u_GPU, target=K_GPU)
    K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU)

    if returnAsGPU:
        res = K_GPU
    else:
        res = K_GPU.asarray()

    if log:
        return res, log
    else:
        return res
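The only exponential in the whole routine is the Gibbs kernel K = exp(-M / reg) built once before the loop; every Sinkhorn iteration after that is elementwise division and matrix-vector products. A toy NumPy version showing where the exp enters (the values match the docstring example in the documented copy of sinkhorn further down):

import numpy as np

M = np.array([[0.0, 1.0], [1.0, 0.0]])
a = np.array([[0.5], [0.5]])
b = np.array([[0.5], [0.5]])
reg = 1.0

K = np.exp(-M / reg)                 # same role as M_GPU.divide(-reg); cudamat.exp(M_GPU)
u = np.ones((2, 1)) / 2
v = np.ones((2, 1)) / 2
for _ in range(100):
    v = b / K.T.dot(u)
    u = a / K.dot(v)
gamma = u * K * v.T                  # ~[[0.3655, 0.1345], [0.1345, 0.3655]]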
Example #26
 def draw_HMC_samples(self, data, negdata, normdata, vel, gradient,
                      normgradient, new_energy, old_energy, VF, FH,
                      bias_cov, bias_vis, w_mean, bias_mean, hmc_step,
                      hmc_step_nr, hmc_ave_rej, hmc_target_ave_rej, t1, t2,
                      t3, t4, t5, t6, t7, thresh, feat, featsq, batch_size,
                      feat_mean, length, lengthsq, normcoeff, small,
                      num_vis):
     vel.fill_with_randn()
     negdata.assign(data)
     self.compute_energy_mcRBM(negdata, normdata, vel, old_energy, VF, FH,
                               bias_cov, bias_vis, w_mean, bias_mean, t1,
                               t2, t6, feat, featsq, feat_mean, length,
                               lengthsq, normcoeff, small, num_vis)
     self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov,
                                 bias_vis, w_mean, bias_mean, t1, t2, t3,
                                 t4, t6, feat, featsq, feat_mean, gradient,
                                 normgradient, length, lengthsq, normcoeff,
                                 small, num_vis)
     # half step
     vel.add_mult(gradient, -0.5 * hmc_step)
     negdata.add_mult(vel, hmc_step)
     # full leap-frog steps
     for ss in range(hmc_step_nr - 1):
         ## re-evaluate the gradient
         self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov,
                                     bias_vis, w_mean, bias_mean, t1, t2,
                                     t3, t4, t6, feat, featsq, feat_mean,
                                     gradient, normgradient, length,
                                     lengthsq, normcoeff, small, num_vis)
         # update variables
         vel.add_mult(gradient, -hmc_step)
         negdata.add_mult(vel, hmc_step)
     # final half-step
     self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov,
                                 bias_vis, w_mean, bias_mean, t1, t2, t3,
                                 t4, t6, feat, featsq, feat_mean, gradient,
                                 normgradient, length, lengthsq, normcoeff,
                                 small, num_vis)
     vel.add_mult(gradient, -0.5 * hmc_step)
     # compute new energy
     self.compute_energy_mcRBM(negdata, normdata, vel, new_energy, VF, FH,
                               bias_cov, bias_vis, w_mean, bias_mean, t1,
                               t2, t6, feat, featsq, feat_mean, length,
                               lengthsq, normcoeff, small, num_vis)
     # rejection
     old_energy.subtract(new_energy, target=thresh)
     cmt.exp(thresh)
     t4.fill_with_rand()
     t4.less_than(thresh)
     #    update negdata and rejection rate
     t4.mult(-1)
     t4.add(1)  # now 1's detect rejections
     t4.sum(axis=1, target=t5)
     t5.copy_to_host()
     rej = t5.numpy_array[0, 0] / batch_size
     data.mult_by_row(t4, target=t6)
     negdata.mult_by_row(t4, target=t7)
     negdata.subtract(t7)
     negdata.add(t6)
     hmc_ave_rej = 0.9 * hmc_ave_rej + 0.1 * rej
     if hmc_ave_rej < hmc_target_ave_rej:
         hmc_step = min(hmc_step * 1.01, 0.25)
     else:
         hmc_step = max(hmc_step * 0.99, .001)
     return hmc_step, hmc_ave_rej
Example #27
    def costAndGrad(self,data,labels=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1,T):
                    self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                    cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
                self.hActs[i].minmax(0.0,self.maxAct,col=T-1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T-1,0,-1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t,self.tmpGrad,t) 
                    self.deltaTemp.set_single_col(t-1,deltasOut,t)

 
                # Accumulate temporal gradient
                cm.dot(self.deltaTemp,self.hActs[i].T,
                        target=dwt)

                cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
                deltasOut.mult_slice(0,self.tmpGrad,0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
Example #28
def ff(x0_cpu):
    data_size = x0_cpu.shape[1]
    x_l0 = cm.empty((num_input, data_size))
    x_l0.assign(cm.CUDAMatrix(x0_cpu))
                
    x_l1 = cm.empty((num_hid, data_size))

    cm.dot(w1.T, x_l0, target = x_l1)
    x_l1.add_col_vec(b1)
    x_l1.apply_sigmoid()

    x_l2 = cm.empty((num_hid, data_size))
    del x_l0

    cm.dot(w2.T, x_l1, target = x_l2)
    x_l2.add_col_vec(b2)
    x_l2.apply_sigmoid()

    x_l3 = cm.empty((num_hid, data_size))
    del x_l1

    cm.dot(w3.T, x_l2, target = x_l3)
    x_l3.add_col_vec(b3)
    x_l3.apply_sigmoid()

    x_l4 = cm.empty((num_hid, data_size))
    del x_l2

    cm.dot(w4.T, x_l3, target = x_l4)
    x_l4.add_col_vec(b4)
    x_l4.apply_sigmoid()

    x_l5 = cm.empty((num_hid, data_size))
    del x_l3

    cm.dot(w5.T, x_l4, target = x_l5)
    x_l5.add_col_vec(b5)
    x_l5.apply_sigmoid()

    x_output = cm.empty((num_output, data_size))
    del x_l4

    tmp_x_output = cm.empty((num_output, data_size))
    tmp_x_output_sums = cm.empty((1, data_size))

    cm.dot(wo.T, x_l5, target = tmp_x_output)
    tmp_x_output.add_col_vec(bo)
    cm.exp(tmp_x_output)
    tmp_x_output.sum(axis=0, target = tmp_x_output_sums)
    tmp_x_output_sums.reciprocal()
    tmp_x_output.mult_by_row(tmp_x_output_sums)
    x_output.assign(tmp_x_output)

    x_output.mult_by_col(state_prior_gpu_rec)
    cm.log(x_output)

    x_output.mult(1./np.log(10))

    xo = x_output.asarray()

    return xo
Example #29
def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
                log=False, returnAsGPU=False):
    r"""
    Solve the entropic regularization optimal transport problem on GPU

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1= b

             \gamma\geq 0
    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the Sinkhorn-Knopp matrix scaling algorithm as proposed in [2]_


    Parameters
    ----------
    a : np.ndarray (ns,)
        samples weights in the source domain
    b : np.ndarray (nt,)
        samples in the target domain
    M_GPU : cudamat.CUDAMatrix (ns,nt)
        loss matrix
    reg : float
        Regularization term >0
    numItermax : int, optional
        Max number of iterations
    stopThr : float, optional
        Stop threshold on error (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True
    returnAsGPU : bool, optional
        return the OT matrix as a cudamat.CUDAMatrix

    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
    log : dict
        log dictionary return only if log==True in parameters

    Examples
    --------

    >>> import ot
    >>> a=[.5,.5]
    >>> b=[.5,.5]
    >>> M=[[0.,1.],[1.,0.]]
    >>> ot.sinkhorn(a,b,M,1)
    array([[ 0.36552929,  0.13447071],
           [ 0.13447071,  0.36552929]])


    References
    ----------

    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal Transport, Advances in Neural Information Processing Systems (NIPS) 26, 2013


    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.optim.cg : General regularized OT

    """
    # init data
    Nini = len(a)
    Nfin = len(b)

    if log:
        log = {'err': []}

    # we assume that no distances are null except those of the diagonal of
    # distances
    u = (np.ones(Nini) / Nini).reshape((Nini, 1))
    u_GPU = cudamat.CUDAMatrix(u)
    a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
    ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
    v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
    v_GPU = cudamat.CUDAMatrix(v)
    b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))

    M_GPU.divide(-reg)

    K_GPU = cudamat.exp(M_GPU)

    ones_GPU.divide(a_GPU, target=a_GPU)
    Kp_GPU = cudamat.empty(K_GPU.shape)
    K_GPU.mult_by_col(a_GPU, target=Kp_GPU)

    tmp_GPU = cudamat.empty(K_GPU.shape)

    cpt = 0
    err = 1
    while (err > stopThr and cpt < numItermax):
        uprev_GPU = u_GPU.copy()
        vprev_GPU = v_GPU.copy()

        KtransposeU_GPU = K_GPU.transpose().dot(u_GPU)
        b_GPU.divide(KtransposeU_GPU, target=v_GPU)
        ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)

        if (np.any(KtransposeU_GPU.asarray() == 0) or
                not u_GPU.allfinite() or not v_GPU.allfinite()):
            # we have reached the machine precision
            # come back to previous solution and quit loop
            print('Warning: numerical errors at iteration', cpt)
            u_GPU = uprev_GPU.copy()
            v_GPU = vprev_GPU.copy()
            break
        if cpt % 10 == 0:
            # we can speed up the process by checking for the error only all
            # the 10th iterations
            K_GPU.mult_by_col(u_GPU, target=tmp_GPU)
            tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU)

            bcopy_GPU = b_GPU.copy().transpose()
            bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1)
            err = bcopy_GPU.euclid_norm()**2
            if log:
                log['err'].append(err)

            if verbose:
                if cpt % 200 == 0:
                    print(
                        '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                print('{:5d}|{:8e}|'.format(cpt, err))
        cpt += 1
    if log:
        log['u'] = u_GPU.asarray()
        log['v'] = v_GPU.asarray()

    K_GPU.mult_by_col(u_GPU, target=K_GPU)
    K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU)

    if returnAsGPU:
        res = K_GPU
    else:
        res = K_GPU.asarray()

    if log:
        return res, log
    else:
        return res
Example #30
def ff(x0_cpu):
    data_size = x0_cpu.shape[1]
    x_l0 = cm.empty((num_input, data_size))
    x_l0.assign(cm.CUDAMatrix(x0_cpu))

    x_l1 = cm.empty((num_hid, data_size))

    cm.dot(w1.T, x_l0, target=x_l1)
    x_l1.add_col_vec(b1)
    x_l1.apply_sigmoid()

    x_l2 = cm.empty((num_hid, data_size))
    del x_l0

    cm.dot(w2.T, x_l1, target=x_l2)
    x_l2.add_col_vec(b2)
    x_l2.apply_sigmoid()

    x_l3 = cm.empty((num_hid, data_size))
    del x_l1

    cm.dot(w3.T, x_l2, target=x_l3)
    x_l3.add_col_vec(b3)
    x_l3.apply_sigmoid()

    x_l4 = cm.empty((num_hid, data_size))
    del x_l2

    cm.dot(w4.T, x_l3, target=x_l4)
    x_l4.add_col_vec(b4)
    x_l4.apply_sigmoid()

    x_l5 = cm.empty((num_hid, data_size))
    del x_l3

    cm.dot(w5.T, x_l4, target=x_l5)
    x_l5.add_col_vec(b5)
    x_l5.apply_sigmoid()

    x_output = cm.empty((num_output, data_size))
    del x_l4

    tmp_x_output = cm.empty((num_output, data_size))
    tmp_x_output_sums = cm.empty((1, data_size))

    cm.dot(wo.T, x_l5, target=tmp_x_output)
    tmp_x_output.add_col_vec(bo)
    cm.exp(tmp_x_output)
    tmp_x_output.sum(axis=0, target=tmp_x_output_sums)
    tmp_x_output_sums.reciprocal()
    tmp_x_output.mult_by_row(tmp_x_output_sums)
    x_output.assign(tmp_x_output)

    x_output.mult_by_col(state_prior_gpu_rec)
    cm.log(x_output)

    x_output.mult(1. / np.log(10))

    xo = x_output.asarray()

    return xo
Example #31
    def score_samples(self, X, temp_gpu_mem=None):
        '''Return the per-sample likelihood of the data under the model.

        Compute the log probability of X under the model and
        return the posterior probability of each
        mixture component for each element of X.

        Parameters
        ----------
        X: numpy.ndarray, shape (n_samples, n_dimensions)
            Array of n_samples data points. Each row
            corresponds to a single data point.

        Returns
        -------
        logprob_Nx1 : array_like, shape (n_samples,)
            Log probabilities of each data point in X.

        posteriors : array_like, shape (n_samples, n_components)
            Posterior probability of each mixture component for each
            sample
        '''
        if None in (self.weights, self.means, self.covars):
            raise ValueError('GMM parameters have not been initialized')

        if X.shape[1] != self.n_dimensions:
            raise ValueError(
                'input data matrix X is of shape %s, should be %s'
                % (X.shape, (X.shape[0], self.n_dimensions)))

        N = X.shape[0]

        if temp_gpu_mem is None:
            temp_gpu_mem = TempGPUMem()
        temp_gpu_mem.alloc(N, self.n_components, self.n_dimensions)

        # lpr = log_multivariate_normal_density()
        #        + np.log(self.weights)[None, :]
        # -----------------------------------------------------
        posteriors_NxK = log_multivariate_normal_density(
            X, self.means, self.covars,
            self.covariance_type, temp_gpu_mem)
        # lpr += np.log(self.weights)
        temp_Kx1 = temp_gpu_mem['temp_Kx1']
        cm.log(self.weights, target=temp_Kx1)
        temp_Kx1.reshape((1, self.n_components))  # transpose
        posteriors_NxK.add_row_vec(temp_Kx1)
        temp_Kx1.reshape((self.n_components, 1))  # original shape
        # in use: lpr -> 'NxK'

        # logprob_Nx1 = np.log(np.sum(np.exp(lpr - vmax), axis=1))
        # logprob_Nx1 += vmax
        # ---------------------------------------------------------
        vmax_Nx1 = temp_gpu_mem['vmax_Nx1']
        logprob_Nx1 = temp_gpu_mem['logprob_Nx1']
        # vmax_Nx1 = np.max(lpr, axis=1)
        posteriors_NxK.max(axis=1, target=vmax_Nx1)
        # lpr -= vmax_Nx1[:, None]
        posteriors_NxK.add_col_mult(vmax_Nx1, -1.0)
        # posteriors_NxK = np.exp(posteriors_NxK)
        cm.exp(posteriors_NxK)
        # logprob_Nx1 = np.sum(posteriors_NxK, axis=1)
        posteriors_NxK.sum(axis=1, target=logprob_Nx1)
        # posteriors_NxK /= logprob_Nx1[:, None]
        posteriors_NxK.div_by_col(logprob_Nx1)

        # logprob_Nx1 = np.log(logprob_Nx1)
        cm.log(logprob_Nx1, target=logprob_Nx1)
        # logprob_Nx1 += vmax_Nx1
        logprob_Nx1.add(vmax_Nx1)

        return logprob_Nx1, posteriors_NxK
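The block between cm.exp and cm.log above is a row-wise log-sum-exp: pull out the per-row maximum so the exponentials cannot overflow, normalize to get the posteriors, then add the maximum back to recover the log-probabilities. The equivalent NumPy pattern (helper name is mine):

import numpy as np

def logsumexp_rows(lpr):
    # lpr: (n_samples, n_components) joint log-probabilities
    vmax = lpr.max(axis=1, keepdims=True)     # per-row max
    e = np.exp(lpr - vmax)                    # safe exponentials
    s = e.sum(axis=1, keepdims=True)
    return np.log(s) + vmax, e / s            # (log-probabilities, posteriors)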
Example #32
    def rbm_update(self,gpu_data,gpu_target_for_data=None,n_first_update=None):

        is_labeled = gpu_target_for_data != None

        if n_first_update != None:
            gpu_data.slice(n_first_update,self.minibatch_size).assign(0)

        self.dW.mult(self.momentum)
        self.dU.mult(self.momentum)
        self.dc.mult(self.momentum)
        self.db.mult(self.momentum)
        self.dd.mult(self.momentum)

        # Computes p(y|x). This methods fills in
        # self.gpu_p_y_given_x and self.gpu_p_y_given_x_trans with the result.
        # It also computes self.gpu_act_from_x.
        self.compute_output(gpu_data)

        if gpu_target_for_data != None:
            # Compute discriminative gradient
            self.gpu_p_y_given_x_trans.subtract(gpu_target_for_data,self.gpu_doutput)
            if n_first_update != None:
                # Making sure gradient is non-zero only for n_first_update first examples
                self.gpu_doutput.slice(n_first_update,self.minibatch_size).assign(0)

            self.gpu_doutput.sum(axis=1,target=self.gpu_doutput_sum)
            self.dd.add_dot(self.gpu_target_vectors,self.gpu_doutput_sum)
            self.gpu_dhidact.assign(0)
            self.gpu_doutput.transpose(self.gpu_doutput_trans)
            for c in range(self.n_classes):
                cm.dot(self.U,self.gpu_target_vectors.slice(c,c+1),self.gpu_act_from_y)
                # to avoid memory creation, using gpu_h
                # and gpu_h_sample for these computations
                self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y,target=self.gpu_h)
                self.gpu_h.apply_sigmoid()
                self.gpu_doutput_trans.slice(c,c+1).transpose(target=self.gpu_doutput_row)
                self.gpu_h.mult_by_row(self.gpu_doutput_row)
                self.gpu_dhidact.add(self.gpu_h)
                self.dc.add_sums(self.gpu_h,axis=1)
                self.gpu_h.sum(axis=1,target=self.gpu_dhidact_sum)
                self.dU.add_dot(self.gpu_dhidact_sum,self.gpu_target_vectors.slice(c,c+1).T)

            self.dW.add_dot(self.gpu_dhidact,gpu_data.T)

        else:
            # Sample a y according to p(y|x)
            # ... actually, we use the softmax probs, it's much simpler
            gpu_target_for_data = self.gpu_p_y_given_x_trans

        if (is_labeled and self.gen_learning_weight > 0) or (not is_labeled and self.semisup_learning_weight > 0):

            self.cd_W.assign(0)
            self.cd_U.assign(0)
            self.cd_c.assign(0)
            self.cd_b.assign(0)
            self.cd_d.assign(0)

            # Positive phase
            cm.dot(self.W,gpu_data,self.gpu_h)
            self.gpu_h.add_col_vec(self.c)
            cm.dot(self.gpu_target_vectors,gpu_target_for_data,self.gpu_target_vec_pos)
            self.gpu_h.add_dot(self.U,self.gpu_target_vec_pos)
            self.gpu_h.apply_sigmoid()
            
            if n_first_update != None:
                # A simple fix for having a non-zero gradient only for n_first_update examples
                self.gpu_target_vec_pos.slice(n_first_update,self.minibatch_size).assign(0)
                self.gpu_h.slice(n_first_update,self.minibatch_size).assign(0)

            self.cd_W.subtract_dot(self.gpu_h,gpu_data.T)
            self.cd_U.subtract_dot(self.gpu_h,self.gpu_target_vec_pos.T)
            self.cd_c.add_sums(self.gpu_h,axis=1,mult=-1.)
            self.cd_b.add_sums(gpu_data,axis=1,mult=-1.)
            self.cd_d.add_sums(self.gpu_target_vec_pos,axis=1,mult=-1.)

            if self.use_persistent_chain:
                cm.dot(self.W,self.gpu_x_persistent,self.gpu_h)
                self.gpu_h.add_col_vec(self.c)
                self.gpu_h.add_dot(self.U,self.gpu_y_persistent)
                self.gpu_h.apply_sigmoid()
            
            for it in range(self.n_gibbs_steps):
                self.gpu_h_sample.fill_with_rand()
                self.gpu_h_sample.less_than(self.gpu_h)
            
                # Down pass
                cm.dot(self.W.T,self.gpu_h_sample,self.gpu_x)
                self.gpu_x.add_col_vec(self.b)
                self.gpu_x.apply_sigmoid()
                self.gpu_x_sample.fill_with_rand()
                self.gpu_x_sample.less_than(self.gpu_x)

                cm.dot(self.U.T,self.gpu_h_sample,self.gpu_y)
                self.gpu_y.add_col_vec(self.d)
                cm.dot(self.gpu_target_vectors.T,self.gpu_y,self.gpu_target_vec_neg)
                self.gpu_target_vec_neg.transpose(self.gpu_y_trans)
                self.gpu_y_trans.sum(axis=1,target=self.gpu_y_trans_mean)
                self.gpu_y_trans_mean.divide(-self.n_classes)
                self.gpu_y_trans.add_col_vec(self.gpu_y_trans_mean)
                cm.exp(self.gpu_y_trans,target=self.gpu_y_trans)
                self.gpu_y_trans.sum(axis=1,target=self.gpu_y_trans_norm)
                for c in range(self.n_classes):                
                    self.gpu_y_trans.slice(c,c+1).divide(self.gpu_y_trans_norm)
                self.gpu_y_trans.transpose(self.gpu_y)

                # Up pass
                cm.dot(self.W,self.gpu_x_sample,self.gpu_h)
                self.gpu_h.add_col_vec(self.c)
                cm.dot(self.gpu_target_vectors,self.gpu_y,self.gpu_target_vec_neg)
                self.gpu_h.add_dot(self.U,self.gpu_target_vec_neg)
                self.gpu_h.apply_sigmoid()
            
            if self.use_persistent_chain:
                # Remember Gibbs chain's state
                self.gpu_x_persistent.assign(self.gpu_x_sample)
                self.gpu_y_persistent.assign(self.gpu_target_vec_neg)

            if n_first_update != None:
                self.gpu_x_sample.slice(n_first_update,self.minibatch_size).assign(0)
                self.gpu_target_vec_neg.slice(n_first_update,self.minibatch_size).assign(0)
                self.gpu_h.slice(n_first_update,self.minibatch_size).assign(0)

            
            self.cd_W.add_dot(self.gpu_h,self.gpu_x_sample.T)
            self.cd_U.add_dot(self.gpu_h,self.gpu_target_vec_neg.T)
            self.cd_c.add_sums(self.gpu_h,axis=1)
            self.cd_b.add_sums(self.gpu_x_sample,axis=1)
            self.cd_d.add_sums(self.gpu_target_vec_neg,axis=1)

            # Update RBM
            if is_labeled:
                alpha = self.gen_learning_weight
            else:
                alpha = self.semisup_learning_weight

            self.dW.add_mult(self.cd_W,alpha=alpha)
            self.dU.add_mult(self.cd_U,alpha=alpha)
            self.dc.add_mult(self.cd_c,alpha=alpha)
            self.db.add_mult(self.cd_b,alpha=alpha)
            self.dd.add_mult(self.cd_d,alpha=alpha)

        if n_first_update is None:
            lr = self.learning_rate/self.minibatch_size
        else:
            lr = self.learning_rate/n_first_update

        self.W.add_mult(self.dW,alpha=-lr)
        self.U.add_mult(self.dU,alpha=-lr)
        self.c.add_mult(self.dc,alpha=-lr)
        self.b.add_mult(self.db,alpha=-lr)
        self.d.add_mult(self.dd,alpha=-lr)
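
A note on the example above: it mixes a discriminative gradient for p(y|x) with contrastive-divergence (CD) statistics for the joint RBM over (x, y), weighted by gen_learning_weight / semisup_learning_weight. For reference, here is a minimal NumPy sketch of plain CD-k for a binary-binary RBM without the label units. The names (W, b, c, cd_k_update) and the textbook sign convention are mine; the cudamat code above instead accumulates negated statistics into cd_* and applies a negative learning rate, which amounts to the same update.

import numpy as np

rng = np.random.default_rng(0)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def cd_k_update(W, b, c, x, lr=0.01, k=1):
    """One CD-k step for a binary-binary RBM (k >= 1).

    W: (n_hid, n_vis) weights, b: (n_vis,) visible bias, c: (n_hid,) hidden bias,
    x: (n_vis, batch) binary inputs, one example per column (as in the GPU code).
    """
    # Positive phase: hidden probabilities given the data
    h_pos = sigmoid(W @ x + c[:, None])

    # Negative phase: k steps of blocked Gibbs sampling
    h_sample = (rng.random(h_pos.shape) < h_pos).astype(x.dtype)
    for _ in range(k):
        x_prob = sigmoid(W.T @ h_sample + b[:, None])
        x_sample = (rng.random(x_prob.shape) < x_prob).astype(x.dtype)
        h_neg = sigmoid(W @ x_sample + c[:, None])
        h_sample = (rng.random(h_neg.shape) < h_neg).astype(x.dtype)

    # Gradient estimate: positive statistics minus negative statistics
    batch = x.shape[1]
    W += lr * (h_pos @ x.T - h_neg @ x_sample.T) / batch
    b += lr * (x - x_sample).mean(axis=1)
    c += lr * (h_pos - h_neg).mean(axis=1)
    return W, b, c

# Shape-only usage example (random data, not meaningful training)
W = 0.01 * rng.standard_normal((8, 6))
b, c = np.zeros(6), np.zeros(8)
x = (rng.random((6, 4)) < 0.5).astype(float)
cd_k_update(W, b, c, x, lr=0.05, k=1)
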
Ejemplo n.º 33
0
    def rbm_update(self,
                   gpu_data,
                   gpu_target_for_data=None,
                   n_first_update=None):

        is_labeled = gpu_target_for_data is not None

        if n_first_update is not None:
            gpu_data.slice(n_first_update, self.minibatch_size).assign(0)

        self.dW.mult(self.momentum)
        self.dU.mult(self.momentum)
        self.dc.mult(self.momentum)
        self.db.mult(self.momentum)
        self.dd.mult(self.momentum)

        # Computes p(y|x). This method fills in
        # self.gpu_p_y_given_x and self.gpu_p_y_given_x_trans with the result.
        # It also computes self.gpu_act_from_x.
        self.compute_output(gpu_data)

        if gpu_target_for_data is not None:
            # Compute discriminative gradient
            self.gpu_p_y_given_x_trans.subtract(gpu_target_for_data,
                                                self.gpu_doutput)
            if n_first_update is not None:
                # Make sure the gradient is non-zero only for the first n_first_update examples
                self.gpu_doutput.slice(n_first_update,
                                       self.minibatch_size).assign(0)

            self.gpu_doutput.sum(axis=1, target=self.gpu_doutput_sum)
            self.dd.add_dot(self.gpu_target_vectors, self.gpu_doutput_sum)
            self.gpu_dhidact.assign(0)
            self.gpu_doutput.transpose(self.gpu_doutput_trans)
            for c in range(self.n_classes):
                cm.dot(self.U, self.gpu_target_vectors.slice(c, c + 1),
                       self.gpu_act_from_y)
                # To avoid allocating new memory, reuse gpu_h
                # and gpu_h_sample for these computations
                self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y,
                                                target=self.gpu_h)
                self.gpu_h.apply_sigmoid()
                self.gpu_doutput_trans.slice(
                    c, c + 1).transpose(target=self.gpu_doutput_row)
                self.gpu_h.mult_by_row(self.gpu_doutput_row)
                self.gpu_dhidact.add(self.gpu_h)
                self.dc.add_sums(self.gpu_h, axis=1)
                self.gpu_h.sum(axis=1, target=self.gpu_dhidact_sum)
                self.dU.add_dot(self.gpu_dhidact_sum,
                                self.gpu_target_vectors.slice(c, c + 1).T)

            self.dW.add_dot(self.gpu_dhidact, gpu_data.T)

        else:
            # Sample a y according to p(y|x)
            # ... in practice we just use the softmax probabilities, which is simpler
            gpu_target_for_data = self.gpu_p_y_given_x_trans

        if (is_labeled and self.gen_learning_weight > 0) or (
                not is_labeled and self.semisup_learning_weight > 0):

            self.cd_W.assign(0)
            self.cd_U.assign(0)
            self.cd_c.assign(0)
            self.cd_b.assign(0)
            self.cd_d.assign(0)

            # Positive phase
            cm.dot(self.W, gpu_data, self.gpu_h)
            self.gpu_h.add_col_vec(self.c)
            cm.dot(self.gpu_target_vectors, gpu_target_for_data,
                   self.gpu_target_vec_pos)
            self.gpu_h.add_dot(self.U, self.gpu_target_vec_pos)
            self.gpu_h.apply_sigmoid()

            if n_first_update is not None:
                # Simple fix: keep a non-zero gradient only for the first n_first_update examples
                self.gpu_target_vec_pos.slice(n_first_update,
                                              self.minibatch_size).assign(0)
                self.gpu_h.slice(n_first_update, self.minibatch_size).assign(0)

            self.cd_W.subtract_dot(self.gpu_h, gpu_data.T)
            self.cd_U.subtract_dot(self.gpu_h, self.gpu_target_vec_pos.T)
            self.cd_c.add_sums(self.gpu_h, axis=1, mult=-1.)
            self.cd_b.add_sums(gpu_data, axis=1, mult=-1.)
            self.cd_d.add_sums(self.gpu_target_vec_pos, axis=1, mult=-1.)

            if self.use_persistent_chain:
                cm.dot(self.W, self.gpu_x_persistent, self.gpu_h)
                self.gpu_h.add_col_vec(self.c)
                self.gpu_h.add_dot(self.U, self.gpu_y_persistent)
                self.gpu_h.apply_sigmoid()

            for it in range(self.n_gibbs_steps):
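                # One Gibbs step: sample h, reconstruct x (sampled) and y (soft probabilities), then recompute h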
                self.gpu_h_sample.fill_with_rand()
                self.gpu_h_sample.less_than(self.gpu_h)

                # Down pass
                cm.dot(self.W.T, self.gpu_h_sample, self.gpu_x)
                self.gpu_x.add_col_vec(self.b)
                self.gpu_x.apply_sigmoid()
                self.gpu_x_sample.fill_with_rand()
                self.gpu_x_sample.less_than(self.gpu_x)

                cm.dot(self.U.T, self.gpu_h_sample, self.gpu_y)
                self.gpu_y.add_col_vec(self.d)
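                # Class scores via the target vectors, then a mean-shifted softmax gives p(y | h)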
                cm.dot(self.gpu_target_vectors.T, self.gpu_y,
                       self.gpu_target_vec_neg)
                self.gpu_target_vec_neg.transpose(self.gpu_y_trans)
                self.gpu_y_trans.sum(axis=1, target=self.gpu_y_trans_mean)
                self.gpu_y_trans_mean.divide(-self.n_classes)
                self.gpu_y_trans.add_col_vec(self.gpu_y_trans_mean)
                cm.exp(self.gpu_y_trans, target=self.gpu_y_trans)
                self.gpu_y_trans.sum(axis=1, target=self.gpu_y_trans_norm)
                for c in range(self.n_classes):
                    self.gpu_y_trans.slice(c,
                                           c + 1).divide(self.gpu_y_trans_norm)
                self.gpu_y_trans.transpose(self.gpu_y)

                # Up pass
                cm.dot(self.W, self.gpu_x_sample, self.gpu_h)
                self.gpu_h.add_col_vec(self.c)
                cm.dot(self.gpu_target_vectors, self.gpu_y,
                       self.gpu_target_vec_neg)
                self.gpu_h.add_dot(self.U, self.gpu_target_vec_neg)
                self.gpu_h.apply_sigmoid()

            if self.use_persistent_chain:
                # Remember Gibbs chain's state
                self.gpu_x_persistent.assign(self.gpu_x_sample)
                self.gpu_y_persistent.assign(self.gpu_target_vec_neg)

            if n_first_update is not None:
                self.gpu_x_sample.slice(n_first_update,
                                        self.minibatch_size).assign(0)
                self.gpu_target_vec_neg.slice(n_first_update,
                                              self.minibatch_size).assign(0)
                self.gpu_h.slice(n_first_update, self.minibatch_size).assign(0)

            self.cd_W.add_dot(self.gpu_h, self.gpu_x_sample.T)
            self.cd_U.add_dot(self.gpu_h, self.gpu_target_vec_neg.T)
            self.cd_c.add_sums(self.gpu_h, axis=1)
            self.cd_b.add_sums(self.gpu_x_sample, axis=1)
            self.cd_d.add_sums(self.gpu_target_vec_neg, axis=1)

            # Update RBM
            if is_labeled:
                alpha = self.gen_learning_weight
            else:
                alpha = self.semisup_learning_weight

            self.dW.add_mult(self.cd_W, alpha=alpha)
            self.dU.add_mult(self.cd_U, alpha=alpha)
            self.dc.add_mult(self.cd_c, alpha=alpha)
            self.db.add_mult(self.cd_b, alpha=alpha)
            self.dd.add_mult(self.cd_d, alpha=alpha)

        if n_first_update is None:
            lr = self.learning_rate / self.minibatch_size
        else:
            lr = self.learning_rate / n_first_update

        self.W.add_mult(self.dW, alpha=-lr)
        self.U.add_mult(self.dU, alpha=-lr)
        self.c.add_mult(self.dc, alpha=-lr)
        self.b.add_mult(self.db, alpha=-lr)
        self.d.add_mult(self.dd, alpha=-lr)
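
Both copies of rbm_update renormalize the class scores into p(y | h) inside the Gibbs chain by shifting each example's scores before exponentiating; the code shifts by the per-example mean, whereas subtracting the per-example max is the more common choice. Because softmax is invariant to a per-row shift, both give the same probabilities. A small NumPy sketch of that idea (function and variable names are mine, not part of the original code):

import numpy as np

def stable_softmax_rows(scores, shift="max"):
    """Row-wise softmax over class scores of shape (batch, n_classes).

    shift="mean" mimics the mean shift used in the cudamat code above;
    shift="max" is the usual numerically safe choice. Both are equivalent
    up to floating-point error because softmax is shift-invariant per row.
    """
    if shift == "mean":
        s = scores - scores.mean(axis=1, keepdims=True)
    else:
        s = scores - scores.max(axis=1, keepdims=True)
    e = np.exp(s)
    return e / e.sum(axis=1, keepdims=True)

# Example: 2 examples, 3 classes; both shifts give the same result
scores = np.array([[1.0, 2.0, 3.0],
                   [10.0, 10.0, 10.0]])
assert np.allclose(stable_softmax_rows(scores, "mean"),
                   stable_softmax_rows(scores, "max"))
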
Ejemplo n.º 34
0
    def costAndGrad(self,data,labels=None, sentence=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf,_ = self.stack[-2]
            wtb,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf,_ = self.grad[-2]
                dwtb,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop  # TODO: copy to device here
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0,self.maxAct,col=0)
                self.hActsBack.minmax(0.0,self.maxAct,col=T-1)
                for t in xrange(1,T):
                    cm.mvdot_col_slice(wtf,self.hActsFor,t-1,self.hActsFor,t,beta=1.0)
                    self.hActsFor.minmax(0.0,self.maxAct,col=t)
                    cm.mvdot_col_slice(wtb,self.hActsBack,T-t,self.hActsBack,T-t-1,beta=1.0)
                    self.hActsBack.minmax(0.0,self.maxAct,col=T-t-1)
                self.hActsFor.add(self.hActsBack,target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0,self.maxAct,target=self.tmpGradFor)
                self.hActsBack.within(0.0,self.maxAct,target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T-1,self.tmpGradFor,T-1)
                self.deltasBack.mult_slice(0,self.tmpGradBack,0)

                for t in xrange(1,T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,self.deltasFor,T-t,
                                       self.deltasFor,T-t-1,beta=1.0)
                    cm.mvdot_col_slice(wtb.T,self.deltasBack,t-1,
                                       self.deltasBack,t,beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T-t-1,self.tmpGradFor,T-t-1)
                    self.deltasBack.mult_slice(t,self.tmpGradBack,t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1,T),
                        self.hActsFor.get_col_slice(0,T-1).T,target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0,T-1),
                        self.hActsBack.get_col_slice(1,T).T,target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack,target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost,self.grad,skip
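
The forward pass above runs a clipped-ReLU recurrence over the temporal layer in both time directions and sums the two passes (hActsFor + hActsBack) before continuing up the stack; the CTC loss is then computed on the per-frame softmax. Below is a rough NumPy sketch of just the bidirectional recurrence, under my reading of cm.mvdot_col_slice(W, src, s, dst, t, beta=1.0) as "column t of dst += W @ column s of src"; the function and argument names (including the max_act default) are illustrative, not part of the original code.

import numpy as np

def clipped_relu(z, max_act):
    # Hard clipping to [0, max_act], as in the minmax(0.0, maxAct, col=...) calls
    return np.clip(z, 0.0, max_act)

def bidirectional_recurrence(acts, Wf, Wb, max_act=20.0):
    """acts: (n_hidden, T) pre-activations from the layer below.
    Wf, Wb: (n_hidden, n_hidden) forward / backward recurrent weights.
    Returns the sum of the forward and backward passes, like hActsFor + hActsBack.
    """
    n, T = acts.shape
    h_for = acts.copy()
    h_back = acts.copy()
    h_for[:, 0] = clipped_relu(h_for[:, 0], max_act)
    h_back[:, T - 1] = clipped_relu(h_back[:, T - 1], max_act)
    for t in range(1, T):
        h_for[:, t] = clipped_relu(h_for[:, t] + Wf @ h_for[:, t - 1], max_act)
        h_back[:, T - 1 - t] = clipped_relu(h_back[:, T - 1 - t] + Wb @ h_back[:, T - t], max_act)
    return h_for + h_back

# Shape-only usage example
rng = np.random.default_rng(0)
h = bidirectional_recurrence(rng.standard_normal((4, 7)),
                             0.1 * rng.standard_normal((4, 4)),
                             0.1 * rng.standard_normal((4, 4)))
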
Ejemplo n.º 35
0
    def costAndGrad(self, data, labels=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                    cm.mvdot_col_slice(wt,
                                       self.hActs[i],
                                       t - 1,
                                       self.hActs[i],
                                       t,
                                       beta=1.0)
                self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T - 1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,
                                       self.deltaTemp,
                                       t,
                                       deltasOut,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    self.deltaTemp.set_single_col(t - 1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                cm.mvdot_col_slice(wt.T,
                                   self.deltaTemp,
                                   0,
                                   deltasOut,
                                   0,
                                   beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
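
Both costAndGrad variants push deltas back through the nonlinearity with precomputed masks: hActs[i].sign(target=self.tmpGrad) gives the hard-ReLU derivative for ordinary layers (the stored activations are already non-negative, so sign() acts as a 0/1 indicator), and within(0.0, self.maxAct, ...) gives the clipped-ReLU derivative for the temporal layer. A NumPy sketch of the two masks and how they are applied; the exact boundary handling of within() in the cudamat extension used here is an assumption on my part, and the function names are mine.

import numpy as np

def relu_backward_mask(h):
    """0/1 derivative mask of the hard ReLU, given activations h = max(z, 0)."""
    return (h > 0).astype(h.dtype)

def clipped_relu_backward_mask(h, max_act):
    """0/1 derivative mask of the clipped ReLU min(max(z, 0), max_act):
    1 strictly inside the (0, max_act) band, 0 at or beyond the clip points."""
    return ((h > 0) & (h < max_act)).astype(h.dtype)

# Applying the mask, as in deltasOut.mult(self.tmpGrad) above:
rng = np.random.default_rng(0)
h = np.clip(rng.standard_normal((4, 7)), 0.0, None)   # post-ReLU activations
delta = rng.standard_normal((4, 7))                   # incoming deltas
delta *= relu_backward_mask(h)                        # zero out units that did not fire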