    def __init__(self, config, name):
        """Build the auto-encoder layer from its configuration section.

        Reads ``hidden_dimension`` and ``visible_dimension`` through
        ``self.readField`` and allocates biases and weights for the
        dimensions that are non-zero.

        Args:
            config: configuration object, forwarded to the parent class,
                ``readField`` and ``initHyperParam``.
            name: name of this layer/section, forwarded the same way.
        """
        super(AE, self).__init__(config, name)

        # dimension of hidden layer
        self.hDim = int(self.readField(config, name, "hidden_dimension"))

        # dimension of visible layer
        self.vDim = int(self.readField(config, name, "visible_dimension"))

        # bias for hidden layer
        if self.hDim > 0:
            self.b1 = gp.zeros(self.hDim)

        # bias for visible layer
        if self.vDim > 0:
            self.b2 = gp.zeros(self.vDim)

        # init weights with scale r = sqrt(6)/sqrt(v+h+1)
        if self.hDim * self.vDim > 0:
            # `r` used to be referenced here without ever being assigned,
            # which raised NameError; it is the same scale initRandom uses.
            r = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)
            # NOTE(review): the intent reads as "uniform in [-r, r]", but
            # randn draws from a normal distribution; gp.rand would give the
            # uniform range. Left unchanged to stay in step with initRandom.
            self.W1 = gp.randn(self.vDim, self.hDim) * 2 * r - r
            self.W2 = gp.randn(self.hDim, self.vDim) * 2 * r - r
            self.initHyperParam(config, name)
 def initRandom(self):
     """Re-draw W1 and W2 and re-run hyper-parameter initialisation.

     The weights are ``randn * 2 * scale - scale`` with
     ``scale = sqrt(6) / sqrt(hDim + vDim + 1)``; W1 is drawn before W2.
     """
     scale = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)

     def draw(rows, cols):
         # one freshly sampled (rows x cols) weight matrix
         return gp.randn(rows, cols) * 2 * scale - scale

     self.W1 = draw(self.vDim, self.hDim)
     self.W2 = draw(self.hDim, self.vDim)
     self.initHyperParam(self.config, self.name)
    def epoch_update(self, input_batch, output_batch):
        """Apply one AdaGrad-style parameter update for a mini-batch.

        Args:
            input_batch: batch inputs; wrapped in a garray when it is not
                one already. Its first dimension is taken as the batch size.
            output_batch: batch targets; wrapped in a garray the same way.

        Returns:
            ``(error, output_result)`` as produced by ``self.feed_back``.
        """
        n_samples = input_batch.shape[0]

        # make sure both batches are garrays before the backward pass
        input_batch, output_batch = tuple(
            arr if isinstance(arr, gnp.garray) else gnp.garray(arr)
            for arr in (input_batch, output_batch))

        error_residual, output_result, error = self.feed_back(input_batch, output_batch)

        layer_grads = self.gradients(self.state, error_residual)
        for idx, (w_grad, b_grad) in enumerate(layer_grads):
            # batch-averaged weight direction including the l2 term
            w_dir = w_grad/n_samples - self.l2*self.weights[idx]

            # accumulate squared directions, then scale by 1/sqrt(accumulator)
            self.weight_grads_l2_norm[idx] += w_dir ** 2
            self.bias_gradis_l2_norm[idx] += (b_grad/n_samples) ** 2
            w_factor = 1 / gnp.sqrt(self.weight_grads_l2_norm[idx])
            b_factor = 1 / gnp.sqrt(self.bias_gradis_l2_norm[idx])

            self.weight_grads[idx] = self.learning_rate * w_factor * w_dir
            self.bias_grads[idx] = (self.learning_rate*b_factor/n_samples) * b_grad

        # apply the stored steps to every layer
        n_layers = len(self.weights)
        for idx in range(n_layers):
            self.weights[idx] += self.weight_grads[idx]
            self.biases[idx] += self.bias_grads[idx]

        return error, output_result
    def initParams(self):
        """Allocate and randomly initialise the layer stack and work buffers.

        Each layer gets ``[W, b]`` with ``W`` uniform in ``[-s, s]``,
        ``s = sqrt(6) / sqrt(fan_in + fan_out)``, and ``b`` zero. When
        ``self.train`` is set, delta and gradient buffers are allocated, the
        RNG is advanced ``self.seed`` times and a small Gaussian perturbation
        (std 0.01) is added to every weight and bias.
        """
        # crude way of random initialization (random seed) for parameters
        import time
        self.seed = int(time.time()) % 100000

        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        # the closing bracket of this list was missing (syntax error)
        scales = [
            gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])
        ]
        self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                      for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
        self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),
                          gp.empty(b.shape)] for w, b in self.stack]
            # Burn `seed` draws so the perturbation below differs between
            # runs; the loop body had been lost, leaving an empty `for`.
            for tt in range(self.seed):
                gp.rand()

            self.stack = [[
                ws[0] + .01 * gp.randn(ws[0].shape),
                ws[1] + .01 * gp.randn(ws[1].shape)
            ] for ws in self.stack]
 def _updateWeights(self, dEdW):
     """Apply one gradient step to ``self.W``.

     Steps, each skipped when its controlling setting is not positive:
     gradient-norm clipping, momentum update, weight decay, weight-norm
     clipping.

     The GPU and NumPy code paths used to follow each other without an
     ``else``: with ``self.gpu`` set the update ran twice, and without it
     nothing ran at all. The backend is now chosen once and the update is
     applied exactly once.

     Args:
         dEdW: gradient of the error w.r.t. ``self.W``. It is rescaled
             in place when gradient clipping triggers.
     """
     # array module providing sqrt/sum for the active backend
     xp = gpu if self.gpu else np

     if self.gradientClip > 0.0:
         self.dEdWnorm = xp.sqrt(xp.sum(dEdW**2))
         if self.dEdWnorm > self.gradientClip:
             dEdW *= self.gradientClip / self.dEdWnorm
     if self.learningRate > 0.0:
         self.lastdW = -self.learningRate * dEdW + \
                    self.momentum * self.lastdW
         self.W += self.lastdW
     if self.weightRegConst > 0.0:
         # weight decay, applied after the gradient step
         a = self.learningRate * self.weightRegConst
         self.W -= a * self.W
     if self.weightClip > 0.0:
         self.Wnorm = xp.sqrt(xp.sum(self.W**2))
         if self.Wnorm > self.weightClip:
             self.W *= self.weightClip / self.Wnorm
def linear_decoder_run_gpu(data, numInput, numHidden):
    """Train a linear-decoder auto-encoder and return its first-layer weights.

    Random initial weights are flattened into one vector, optimised with
    ``optimize.fmin_l_bfgs_b`` (cost ``costfunc_gpu``, gradient
    ``grad_costfunc_gpu``, 400 iterations), and the learnt first-layer
    weights are written to ``learntFeaturesGPU.mat``.

    Args:
        data: training inputs, passed through to the cost function.
        numInput: number of input units (also the number of output units).
        numHidden: number of hidden units.

    Returns:
        Learnt first-layer weights, shape ``(numHidden, numInput + 1)``.
    """
    print("Starting Feature Abstraction...")

    # fixed hyper-parameters handed to the cost function
    lambda_val = 3e-3
    sparsityParam = 0.035
    beta = 5

    shape_w1 = (numHidden, numInput + 1)
    shape_w2 = (numInput, numHidden + 1)
    count_w1 = (numInput+1)*numHidden
    count_w2 = (numHidden+1)*numInput

    # initial weights in [-r, r]; the encoder is drawn before the decoder
    r = gpu.sqrt(6)/gpu.sqrt(numHidden+numInput+1)
    flat_w1 = ((gpu.rand(shape_w1[0], shape_w1[1]))*2*r-r).reshape(count_w1)
    flat_w2 = ((gpu.rand(shape_w2[0], shape_w2[1]))*2*r-r).reshape(count_w2)
    theta0 = hstack((flat_w1.as_numpy_array(), flat_w2.as_numpy_array()))

    cost_args = (numInput, numHidden, numInput, data, lambda_val, sparsityParam, beta)
    result = optimize.fmin_l_bfgs_b(costfunc_gpu, theta0, fprime=grad_costfunc_gpu, args=cost_args, maxiter=400)
    theta = result[0]

    # unpack the optimised vector back into the two weight matrices
    weights1 = reshape(theta[0:count_w1], shape_w1)
    weights2 = reshape(theta[count_w1:], shape_w2)  # not used further; kept as in the original

    scipy.io.savemat('learntFeaturesGPU.mat', mdict={'learntFeatures': weights1})
    return weights1
    def updateParams(self, scale, update, log=False):
        """Add ``scale * update`` to every weight and bias in ``self.stack``.

        The block previously mixed tabs and spaces for indentation, which is
        rejected by ``python -tt`` and by Python 3; it now uses spaces only.

        Args:
            scale: scalar multiplier applied to each update entry.
            update: per-layer ``[dW, db]`` pairs, aligned with ``self.stack``.
            log: when true, print the RMS of each weight matrix and of its
                scaled update before applying it.
        """
        if log:
            for w, u in zip(self.stack, update):
                wrms = gp.sqrt(gp.mean(w[0]**2))
                urms = gp.sqrt(gp.mean((scale*u[0])**2))
                print("weight rms=%f -- update rms=%f" % (wrms, urms))

        self.stack = [[ws[0] + scale*wsDelta[0], ws[1] + scale*wsDelta[1]]
                      for ws, wsDelta in zip(self.stack, update)]
    def initParams(self):
        """Create the layer stack and the activation/delta/gradient buffers.

        Weights are uniform in ``[-bound, bound]`` with
        ``bound = sqrt(6) / sqrt(fan_in + fan_out)``; biases start at zero.
        Buffers are sized for ``self.mbSize`` columns.
        """
        dims = [self.inputDim]+self.layerSizes+[self.outputDim]
        layer_pairs = list(zip(dims[:-1], dims[1:]))

        bounds = []
        for fan_in, fan_out in layer_pairs:
            bounds.append(gp.sqrt(6)/gp.sqrt(fan_in+fan_out))

        layers = []
        for (fan_in, fan_out), bound in zip(layer_pairs, bounds):
            weights = gp.rand(fan_out, fan_in)*2*bound-bound
            layers.append([weights, gp.zeros((fan_out, 1))])
        self.stack = layers

        self.hActs = [gp.empty((width, self.mbSize)) for width in dims]

        if not self.train:
            return

        # training-only buffers
        self.deltas = [gp.empty((width, self.mbSize)) for width in dims[1:]]
        self.grad = [[gp.empty(layer[0].shape), gp.empty(layer[1].shape)]
                     for layer in self.stack]
def _dist_cosine(X, Y):
    """Pairwise cosine distance between the rows of X and the rows of Y.

    d_ij = 0.5 * (1 - <x_i,y_j> / |x_i| |y_j|)

    This is a variant of the cosine similarity, modified so that it is a
    distance, i.e. the smaller the closer, and the distance is between 0 and 1.

    The docstring had lost its quotes, which made the function a syntax
    error; the computation itself is unchanged.

    Args:
        X: 2-d array exposing ``.sum(axis=1)`` and ``.dot`` (one vector per
            row) -- presumably a garray; confirm against callers.
        Y: same kind of array with the same number of columns.

    Returns:
        Matrix whose entry (i, j) is the distance between X[i] and Y[j].
    """
    # 1e-10 keeps all-zero rows from dividing by zero
    X_norm = gnp.sqrt((X*X).sum(axis=1)) + 1e-10
    Y_norm = gnp.sqrt((Y*Y).sum(axis=1)) + 1e-10

    return (1 - X.dot(Y.T) / (X_norm.reshape(-1,1).dot(Y_norm.reshape(1,-1)))) / 2
def mlpSoftmax_test():
    """Train a two-hidden-layer network with a softmax output and evaluate it.

    Loads data through ``obtain_data()``, minimises ``mlpSoftmax_costfunc``
    with ``optimize.fmin_l_bfgs_b`` (400 iterations), saves the optimised
    parameter vector to ``mlpSoftmax.mat`` and prints the accuracy on the
    test split.

    Returns:
        ``(pred, testLabels)``: predicted labels for the test set and the
        squeezed reference labels.
    """
    # network layout and regularisation strengths (hard-coded)
    numClasses = 10
    inputSize = 28 * 28
    l1Size = 100
    l2Size = 20
    lambda_softmax = 1e-4
    lambda_hidden = 8e-5
    print "Loading data..."
    inputs, labels, testData, testLabels = obtain_data()
    print shape(labels)
    print "Done."
    # cases are counted along axis 1, i.e. one example per column
    numCases = shape(inputs)[1]
    # the "+ 1" accounts for a bias column in each hidden layer; the softmax
    # layer has no bias term
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_softmax = numClasses * l2Size
    # one common range [-r, r] is used for all three weight matrices
    r = gpu.sqrt(6)/gpu.sqrt(inputSize+l1Size+l2Size+1)
    theta_L1 = (gpu.rand(l1Size, inputSize+1))*2*r-r
    theta_L2 = (gpu.rand(l2Size, l1Size+1))*2*r-r
    theta_softmax = (gpu.rand(numClasses, l2Size))*2*r-r
    groundTruth = ground_Truth(labels,numCases)
    # flatten every matrix so the optimiser sees a single parameter vector
    #theta_L1 = reshape(theta_L1, num_weights_L1)
    theta_L1 = theta_L1.reshape(num_weights_L1)
    #theta_L2 = reshape(theta_L2, num_weights_L2)
    theta_L2 = theta_L2.reshape(num_weights_L2)
    #theta_softmax = reshape(theta_softmax, num_weights_softmax)
    theta_softmax = theta_softmax.reshape(num_weights_softmax)
    theta = hstack((theta_L1.as_numpy_array(), theta_L2.as_numpy_array(), theta_softmax.as_numpy_array()))
    args = (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth)
    print "Starting Network Training..."
    opttheta = optimize.fmin_l_bfgs_b(mlpSoftmax_costfunc, theta, fprime=mlpSoftmax_grad, args=args, maxiter=400)
    # element 0 of the optimiser result is the optimised parameter vector
    theta = opttheta[0]
    print "Training finished."
    scipy.io.savemat('mlpSoftmax.mat', mdict={'theta': theta})
    print "Now testing prediction accuracy..."
    # unpack the flat vector back into the three weight matrices
    theta_L1 = reshape(theta[0:num_weights_L1], (l1Size, inputSize + 1))
    theta_L2 = reshape(theta[num_weights_L1:num_weights_L2+num_weights_L1], (l2Size, l1Size + 1))
    theta_softmax = reshape(theta[num_weights_L2+num_weights_L1:shape(theta)[0]], (numClasses, l2Size))
    numCasesPred = shape(testData)[1]
    # forward pass: prepend a row of ones as the bias input of each hidden layer
    testData = concatenate((ones((1,numCasesPred)), testData), axis = 0)
    hidden_sum_L1 = dot(theta_L1, testData)
    hidden_activation_L1 = 1/(1 + exp(-hidden_sum_L1))
    hidden_activation_L1 = concatenate((ones((1,numCasesPred)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = 1/(1 + exp(-hidden_sum_L2))
    hidden_sum_softmax = dot(theta_softmax, hidden_activation_L2)
    # subtract the per-column maximum before exponentiating
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis = 0)
    predictions = exp(hidden_sum_softmax)
    predictions = predictions / predictions.sum(axis = 0)
    # NOTE(review): the "+ 1" presumably maps argmax to 1-based labels --
    # confirm against the label encoding returned by obtain_data()
    pred = predictions.argmax(axis=0) + 1
    testLabels = squeeze(testLabels)
    accuracy = mean(pred == testLabels) * 100
    print "Accuracy: ", accuracy, "%"
    return pred, testLabels
 def apply_grad(self, learn_rate=1e-2):
     """Apply the accumulated gradients with a per-element adaptive step.

     ``self.moms`` keeps a running average (0.95 old / 0.05 new) of the
     squared gradients; each parameter then moves by
     ``learn_rate * grad / (sqrt(mom) + ADA_EPS)``.

     Args:
         learn_rate: global step size.
     """
     param_keys = ('W', 'b')

     # refresh the running second-moment estimates first ...
     for key in param_keys:
         decayed = 0.95 * self.moms[key]
         fresh = 0.05 * self.grads[key]**2.0
         self.moms[key] = decayed + fresh

     # ... then take the normalised steps
     for key in param_keys:
         denom = gp.sqrt(self.moms[key]) + ADA_EPS
         self.params[key] -= learn_rate * (self.grads[key] / denom)

     # Reset gradient accumulators
     # NOTE(review): no reset is performed in the visible code -- confirm
     # whether a reset call belongs here.
    def initParams(self):
        """Initialize parameters using sqrt(6)/sqrt(fanin+fanout).

        Builds ``self.stack`` as per-layer ``[W, b]`` pairs (``W`` uniform in
        ``[-s, s]``, ``b`` zero) and allocates single-column activation,
        delta and gradient buffers. The description line used to be bare
        text, which was a syntax error; it is now a proper docstring.
        """
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        # one column per layer: this variant handles a single example at a time
        self.hActs = [gp.empty((s,1)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s,1)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
def multilayer_feature_learning(data, inputSize, l1Size, l2Size, sparsityParam, lambda_val, beta):
    """Greedily train two stacked feature layers and return the top features.

    Level 1 is trained on ``data`` and level 2 on the level-1 activations,
    both through ``linear_decoder_run_ReLU``. The learnt weight matrices of
    each level are saved to ``HiggsBosonLevel1.mat`` / ``HiggsBosonLevel2.mat``.

    Args:
        data: raw training inputs.
        inputSize: number of input units.
        l1Size: number of hidden units in level 1.
        l2Size: number of hidden units in level 2.
        sparsityParam, lambda_val, beta: accepted for interface
            compatibility; this function does not pass them on.

    Returns:
        Level-2 activations computed by ``feedforward``.
    """
    def initial_theta(n_visible, n_hidden):
        # random encoder/decoder (each with a bias column), flattened into
        # one vector; the encoder is drawn before the decoder
        bound = gpu.sqrt(6)/gpu.sqrt(n_hidden+n_visible+1)
        encoder = (gpu.rand(n_hidden,n_visible+1))*2*bound-bound
        decoder = (gpu.rand(n_visible,n_hidden+1))*2*bound-bound
        encoder = encoder.reshape((n_visible+1)*n_hidden)
        decoder = decoder.reshape((n_hidden+1)*n_visible)
        return hstack((encoder.as_numpy_array(),decoder.as_numpy_array()))

    def unpack_theta(theta, n_visible, n_hidden):
        # inverse of the flattening done by initial_theta
        n_encoder = (n_visible+1)*n_hidden
        encoder = theta[0:n_encoder].reshape((n_hidden,n_visible+1))
        decoder = theta[n_encoder:].reshape((n_visible,n_hidden+1))
        return encoder, decoder

    print("Now starting feature abstraction...")
    inputs = gpu.garray(data)

    # ---- level 1: learn features of the raw data ----
    theta_L1 = initial_theta(inputSize, l1Size)
    print("Level 1 Abstraction Starting....")
    theta_L1 = linear_decoder_run_ReLU(data, theta_L1, inputSize, l1Size)
    enc_L1, dec_L1 = unpack_theta(theta_L1, inputSize, l1Size)
    scipy.io.savemat('HiggsBosonLevel1.mat', mdict={'learntFeaturesL1_1': enc_L1, 'learntFeaturesL1_2': dec_L1})
    L1_activation = feedforward(enc_L1, inputs)
    # drop level-1 weights before allocating level 2
    del theta_L1, enc_L1, dec_L1

    # ---- level 2: learn features of the level-1 activations ----
    theta_L2 = initial_theta(l1Size, l2Size)
    print("Level 2 Abstraction Starting....")
    theta_L2 = linear_decoder_run_ReLU(L1_activation, theta_L2, l1Size, l2Size)
    enc_L2, dec_L2 = unpack_theta(theta_L2, l1Size, l2Size)
    scipy.io.savemat('HiggsBosonLevel2.mat', mdict={'learntFeaturesL2_1': enc_L2,'learntFeaturesL2_2': dec_L2})
    L2_activation = feedforward(enc_L2, L1_activation)
    del theta_L2, enc_L2, dec_L2

    print("Abstraction completed.")
    return L2_activation
def checkGradientGPU():
    num_input = 8 * 8 * 3
    num_hidden = 10
    num_output = num_input
    lambda_val = 0.003
    sparsityParam = 0.035
    beta = 5
    data = scipy.io.loadmat('stlSampledPatches.mat')
    patches = data['patches']
    inputs = patches[:, 0:10]
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden + num_input + 1)
    weights1 = (gpu.rand(num_hidden, num_input + 1)) * 2 * r - r
    weights2 = (gpu.rand(num_output, num_hidden + 1)) * 2 * r - r
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    weights1 = weights1.reshape(num_weights1)
    weights2 = weights2.reshape(num_weights2)
    weights = hstack((weights1.as_numpy_array(), weights2.as_numpy_array()))
    args = (num_input, num_hidden, num_output, inputs, lambda_val,
            sparsityParam, beta)
    numgrad = zeros(size(weights))
    numgrad2 = zeros(size(weights))
    perturb = zeros(size(weights))
    e = 1e-4
    for p in range(size(weights)):
        perturb[p] = e
        minus_weights = weights - perturb
        plus_weights = weights + perturb
        loss1 = costfunc_gpuTRY(minus_weights, *args)
        lossc1 = costfunc(minus_weights, *args)
        loss2 = costfunc_gpu(plus_weights, *args)
        lossc2 = costfunc(plus_weights, *args)
        numgrad[p] = (loss2 - loss1) / (2 * e)
        numgrad2[p] = (lossc2 - lossc1) / (2 * e)
        perturb[p] = 0
    grad = grad_costfunc_gpu(weights, *args)
    grad2 = grad_costfunc(weights, *args)
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    diff2 = linalg.norm(numgrad2 - grad2) / linalg.norm(numgrad2 + grad2)
    diff3 = linalg.norm(numgrad - grad2) / linalg.norm(numgrad + grad2)
    diff4 = linalg.norm(numgrad2 - grad) / linalg.norm(numgrad2 + grad)
    diffnum = linalg.norm(numgrad2 - numgrad) / linalg.norm(numgrad2 + numgrad)
    diffgrad = linalg.norm(grad2 - grad) / linalg.norm(grad2 + grad)
    print "pure GPU difference:", diff
    print "pure CPU difference:", diff2
    print "GPU cost, CPU grad:", diff3
    print "CPU cost, GPU grad:", diff4
    print "CPU cost and GPU cost difference:", diffnum
    print "CPU grad and GPU grad difference:", diffgrad
    return "OK"
def norm_trans(X, mode='ff'):
    """Compute feedforward and backprop for unit-normalization.

    Args:
        X: for ``mode='ff'`` the input matrix, whose rows are scaled to
            (approximately) unit L2 norm. For ``mode='bp'`` a mapping with
            keys ``'X'`` (the feedforward input), ``'A'`` (the feedforward
            output) and ``'dLdA'`` (gradient w.r.t. that output).
        mode: ``'ff'`` or ``'bp'``.

    Returns:
        The normalized rows (``'ff'``) or the gradient w.r.t. the input
        (``'bp'``).

    Raises:
        ValueError: if ``mode`` is neither ``'ff'`` nor ``'bp'``. This used
            to surface as an UnboundLocalError at the ``return``.
    """
    EPS = 0.00000001
    if mode == 'ff':
        # EPS keeps all-zero rows from dividing by zero
        N = gp.sqrt(gp.sum(X**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        F = X / N
    elif mode == 'bp':
        N = gp.sqrt(gp.sum(X['X']**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        # row-wise inner product <dLdA, X>
        V = X['dLdA'] * X['X']
        V = gp.sum(V, axis=1)
        V = V[:, gp.newaxis]
        F = (X['dLdA'] / N) - (X['A'] * (V / (N**2.0)))
    else:
        raise ValueError("mode must be 'ff' or 'bp', got %r" % (mode,))
    return F
def rect_sqrt(x, computeGrad = False):
    """Rectified square-root non-linearity.

    Args:
        x: input values. With ``computeGrad`` set, the value the gradient
            expression is evaluated at -- presumably the activation
            returned by the forward call; confirm against callers.
        computeGrad: return the gradient expression instead of the
            activation.

    Returns:
        ``sqrt(|x| * (x > 0))`` in forward mode, otherwise
        ``1 / (2*x + (x <= 0)) * (x > 0)``.
    """
    if computeGrad:
        # (x <= 0) adds 1 to the denominator exactly where the result is
        # masked out by (x > 0), so no division by zero occurs
        denominator = 2*x + (x<=0)
        return 1 / denominator*(x>0)

    return gp.sqrt(gp.abs(x)* (x>0))
 def constrain_weights(self):
     """Rescale weight columns whose RMS exceeds the configured limit.

     For every layer ``i`` with a truthy ``self.rms_limits[i]``, columns of
     ``self.weights[i]`` with RMS above the limit are scaled down to it;
     the other columns are left unchanged.

     The ``if not rms_limit:`` guard had lost its body (syntax error);
     layers without a limit are now skipped explicitly.
     """
     for i, rms_limit in enumerate(self.rms_limits):
         if not rms_limit:
             continue
         W = self.weights[i]
         rms_scale = rms_limit / gnp.sqrt(gnp.mean(W*W, axis=0))
         # the factor is rms_scale where rms_scale < 1 and exactly 1 elsewhere
         limit_rms = W * (1+(rms_scale < 1) * (rms_scale - 1))
         self.weights[i] = limit_rms
 def clip_params(self, max_norm=10.0):
     """Cap the L2 norm of every row of ``self.params['W']`` at max_norm.

     Rows within the bound are left as they are; longer rows are scaled
     down to (approximately) ``max_norm``.
     """
     weights = self.params['W']
     row_norms = gp.sqrt(gp.sum(weights**2.0,axis=1) + 1e-5)
     ratios = max_norm / row_norms
     # 1.0 where the row is too long, 0.0 otherwise (gnumpy yields floats)
     too_long = (ratios < 1.0)
     ratios = (ratios * too_long) + (1.0 - too_long)
     self.params['W'] = weights * ratios[:,gp.newaxis]
def cosSimilar(a, b):
    """Cosine similarity of two vectors, mapped from [-1, 1] to [0, 1].

    Uses gnumpy when the module-level flag ``gpu`` equals 1 and NumPy
    otherwise. The ``else`` separating the two paths was missing: the NumPy
    code sat inside the GPU branch, so with ``gpu != 1`` the function
    returned 0.5 for every input.

    Args:
        a: first vector (1-d sequence or array).
        b: second vector, same length as ``a``.

    Returns:
        ``0.5 + 0.5 * cos(a, b)``.
    """
    global gpu
    if gpu == 1:
#        print "gpu"
        a = gnumpy.garray(a)
        b = gnumpy.garray(b)
        len_a = gnumpy.sqrt(gnumpy.dot(a,a))
        len_b = gnumpy.sqrt(gnumpy.dot(b,b))
        res = gnumpy.dot(a, b) / (len_a * len_b)
    else:
        a = numpy.array(a)
        b = numpy.array(b)
        len_a = numpy.sqrt(numpy.dot(a,a))
        len_b = numpy.sqrt(numpy.dot(b,b))
        res = numpy.dot(a, b) / (len_a * len_b)
    return 0.5 + 0.5 * res
def limitColumnRMS(W, rmsLim):
    """Limit the RMS of each column of W.

    All columns of W with rms entry above the limit are scaled to equal the limit.
    The limit can either be a row vector or a scalar.
    Apply to 2-d array W.

    The description used to be bare text (a syntax error); it is now a
    docstring. The helper lambda was inlined.

    Returns:
        A rescaled copy of W; columns already within the limit are unchanged.
    """
    rmsScale = rmsLim / gnp.sqrt(gnp.mean(W * W, axis=0))
    # the factor is rmsScale where rmsScale < 1 and exactly 1 elsewhere
    return W * (1 + (rmsScale < 1) * (rmsScale - 1))
 def fit(self, X):
     """Store the index vectors on the GPU for later queries.

     For the ``'angular'`` metric the vectors are normalised to unit length
     and stored in ``self.index``. For ``'euclidean'`` the vectors are
     stored as they are and their squared lengths go to ``self.lengths``.
     Any other metric leaves the object untouched.
     """
     X = gnumpy.garray( X )
     sq_lengths = (X ** 2).sum(-1)  # squared length of every vector

     metric = self._metric
     if metric == 'euclidean':
         self.index = X
         self.lengths = sq_lengths
     elif metric == 'angular':
         norms = gnumpy.sqrt(sq_lengths)
         X /= norms[..., gnumpy.newaxis]  # unit-length index vectors
         self.index = X
def _dist_euclidean(X, Y):
    """Pairwise Euclidean distance between the rows of X and the rows of Y.

    d_ij = sqrt(|x_i - y_j|^2 + 1e-3)

    The squared distance is expanded as ``|x|^2 - 2<x, y> + |y|^2``; the
    constant 1e-3 is added under the square root.

    The formula line used to sit in the body as bare text, where it parsed
    as an assignment and raised NameError on every call.

    Args:
        X: 2-d array-like, one vector per row (converted to a garray).
        Y: 2-d array-like with the same number of columns.

    Returns:
        Matrix whose entry (i, j) is the distance between X[i] and Y[j].
    """
    X = gnp.as_garray(X)
    Y = gnp.as_garray(Y)
    X_diag = (X*X).sum(axis=1)
    Y_diag = (Y*Y).sum(axis=1)

    return gnp.sqrt(-2 * X.dot(Y.T) + X_diag.reshape(-1,1) + Y_diag.reshape(1,-1) + 1e-3)
    def initParams(self):
        """Set up the weight stack, activations and training buffers.

        Weights are uniform in ``[-s, s]`` with
        ``s = sqrt(6) / sqrt(fan_in + fan_out)`` and biases start at zero.
        In training mode the RNG is advanced ``self.seed`` times (a crude,
        clock-derived seed) and Gaussian noise with std 0.01 is added to
        every weight and bias.
        """
        # crude way of random initialization (random seed) for parameters
        import time
        self.seed = int(time.time()) % 100000

        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        fan_pairs = list(zip(sizes[:-1], sizes[1:]))
        scales = [gp.sqrt(6)/gp.sqrt(fan_in+fan_out) for fan_in, fan_out in fan_pairs]

        stack = []
        for (fan_in, fan_out), s in zip(fan_pairs, scales):
            stack.append([gp.rand(fan_out, fan_in)*2*s-s, gp.zeros((fan_out, 1))])
        self.stack = stack
        self.hActs = [gp.empty((width, self.mbSize)) for width in sizes]

        if not self.train:
            return

        self.deltas = [gp.empty((width, self.mbSize)) for width in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]

        # burn `seed` random draws so the noise below differs between runs
        for _ in range(self.seed):
            gp.rand()

        noisy = []
        for w, b in self.stack:
            w_noisy = w+.01 * gp.randn(w.shape)
            b_noisy = b+.01 * gp.randn(b.shape)
            noisy.append([w_noisy, b_noisy])
        self.stack = noisy
    def initParams(self):
        """Initialize parameters using sqrt(6)/sqrt(fanin+fanout).

        Builds ``self.stack`` as per-layer ``[W, b]`` pairs. When
        ``self.temporalLayer > 0`` a square recurrent weight matrix for that
        layer is appended at the end of the stack. In training mode one
        gradient buffer is allocated per stack entry.

        The description line used to be bare text (a syntax error); it is
        now a docstring.
        """
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        if self.temporalLayer > 0:
            rs = sizes[self.temporalLayer]
            s = gp.sqrt(6)/ rs
            # temporal layer stored at end of stack
            # NOTE(review): the bias placeholder is (2,1) rather than (rs,1)
            # -- confirm it is intentionally unused.
            self.stack.append([gp.rand(rs,rs) * 2 * s - s, gp.zeros((2,1))])
        if self.train:
            #TODO why store all deltas?
            #self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            #NOTE if a temporal layer is used it's already added to stack so will have a grad
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
def nrelu(data, wm, bias, sampling=False):
    """A noisy rectified linear unit.

    Two defects were fixed: the docstring was never closed, and the
    ``else:`` before ``sample = None`` was missing, so ``sample`` was reset
    to None when sampling and undefined otherwise.

    Args:
        data: input matrix.
        wm: weight matrix; the pre-activation is ``dot(data, wm) + bias``.
        bias: bias added to the pre-activation.
        sampling: when true, also draw a noisy sample around the
            pre-activation (noise std ``sqrt(logistic(pre-activation))``)
            and rectify it.

    Returns:
        ``(suff, sample)``: the rectified pre-activation and the rectified
        noisy sample, or ``None`` for the latter when ``sampling`` is false.
    """
    suff = gpu.dot(data, wm) + bias
    if sampling:
        sample = suff + (gpu.sqrt(suff.logistic()) * gpu.randn(suff.shape))
        #sample = suff + gpu.randn(suff.shape)
        sample *= (sample > 0)
    else:
        sample = None
    # rectify after sampling: the sample is built from the raw pre-activation
    suff *= (suff > 0)
    return suff, sample
    def forward_prop(self, X, add_noise=False, compute_loss=False):
        """
        Compute the forward propagation step that maps the input data matrix X
        into the output.

        Each column of X is standardised with the mean and standard
        deviation taken over axis 0, then scaled by ``self.params.gamma``
        and shifted by ``self.params.beta``. ``self.mu``, ``self.sigma``,
        ``self.X_hat`` and ``self.Y`` are stored on the instance.

        The description used to be bare text (a syntax error); it is now a
        docstring.

        Args:
            X: input data matrix.
            add_noise: not used by this layer.
            compute_loss: not used by this layer.

        Returns:
            The transformed output ``self.Y``.
        """
        self.mu = X.mean(axis=0)
        self.sigma = gnp.sqrt(((X - self.mu)**2).mean(axis=0))

        # 1e-10 guards against constant columns (sigma == 0)
        self.X_hat = (X - self.mu) / (self.sigma + 1e-10)
        self.Y = self.X_hat * self.params.gamma + self.params.beta

        return self.Y
def calculate_dt(v, delta_v, N_bodies, alpha):
    """Derive a time step from the body with the largest acceleration term.

    For each body ``i`` the quantity ``sum(delta_v[i, :]**2)`` is computed;
    the body with the largest value is selected and the result is
    ``alpha * |v[selected]| / that value``.

    Changes: ``delta_v`` is converted to a garray once instead of on every
    loop iteration, and the case where no body has a positive acceleration
    term now raises a clear error (it used to end in an UnboundLocalError).

    NOTE(review): the denominator is the *squared* magnitude of
    ``delta_v`` -- confirm whether a square root was intended.

    Args:
        v: velocities, one row per body.
        delta_v: velocity changes (accelerations), one row per body.
        N_bodies: number of rows to inspect.
        alpha: scalar factor applied to the result.

    Raises:
        ValueError: if no body has a positive acceleration term.
    """
    delta_v = gpu.garray(delta_v)
    a_max = 0.
    a_max_index = None
    for i in range(N_bodies):
        a = gpu.sum(delta_v[i,:]**2)
        if a > a_max:
            a_max = a
            a_max_index = i
    if a_max_index is None:
        raise ValueError("calculate_dt: no body has a non-zero acceleration")
    v = gpu.garray(v)
    v_mag = gpu.sqrt(gpu.sum(v[a_max_index,:]**2))
    return alpha*v_mag/a_max
def rehu_trans(X, mode='ff'):
    """Compute feedforward and backprop for ReHu nonlinearity.

    Feedforward: 0 for ``X <= 0``, ``X**2`` for ``0 < X <= 0.5`` and
    ``X - 0.25`` for ``X > 0.5``.

    Args:
        X: for ``mode='ff'`` the pre-activations. For ``mode='bp'`` a
            mapping with keys ``'A'`` (the feedforward output) and
            ``'dLdA'`` (gradient w.r.t. that output).
        mode: ``'ff'`` or ``'bp'``.

    Returns:
        Activations (``'ff'``) or the gradient w.r.t. the input (``'bp'``).

    Raises:
        ValueError: if ``mode`` is neither ``'ff'`` nor ``'bp'``. This used
            to surface as an UnboundLocalError at the ``return``.
    """
    if mode == 'ff':
        M_quad = (X > 0.0)
        M_line = (X > 0.5)
        # quadratic region = positive but not yet linear
        M_quad = M_quad - M_line
        F = (M_line * (X - 0.25)) + (M_quad * X**2.0)
    elif mode == 'bp':
        # an output below 0.25 corresponds to the quadratic region, where
        # the slope is 2*x = 2*sqrt(A); elsewhere the slope is taken as 1
        M_quad = (X['A'] < 0.25)
        M_line = 1.0 - M_quad
        F = (2.0 * M_quad * gp.sqrt(X['A'])) + M_line
        F = F * X['dLdA']
    else:
        raise ValueError("mode must be 'ff' or 'bp', got %r" % (mode,))
    return F
    def initParams(self):
        """Initialize parameters using sqrt(6)/sqrt(fanin+fanout).

        Builds ``self.stack`` as per-layer ``[W, b]`` pairs. When
        ``self.temporalLayer > 0`` a square recurrent weight matrix for that
        layer is appended at the end of the stack. In training mode one
        gradient buffer is allocated per stack entry.

        Two syntax errors were fixed: the description line was bare text,
        and the ``scales`` list was missing its closing bracket.
        """
        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        scales = [
            gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])
        ]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        if self.temporalLayer > 0:
            rs = sizes[self.temporalLayer]
            s = gp.sqrt(6) / rs
            # temporal layer stored at end of stack
            # NOTE(review): the bias placeholder is (2, 1) rather than
            # (rs, 1) -- confirm it is intentionally unused.
            self.stack.append([gp.rand(rs, rs) * 2 * s - s, gp.zeros((2, 1))])

        if self.train:
            #TODO why store all deltas?
            #self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            #NOTE if a temporal layer is used it's already added to stack so will have a grad
            self.grad = [[gp.empty(w.shape),
                          gp.empty(b.shape)] for w, b in self.stack]
def rehu_trans(X, mode='ff'):
    """Compute feedforward and backprop for ReHu nonlinearity.

    Feedforward: 0 for ``X <= 0``, ``X**2`` for ``0 < X <= 0.5`` and
    ``X - 0.25`` for ``X > 0.5``.

    Args:
        X: for ``mode='ff'`` the pre-activations. For ``mode='bp'`` a
            mapping with keys ``'A'`` (the feedforward output) and
            ``'dLdA'`` (gradient w.r.t. that output).
        mode: ``'ff'`` or ``'bp'``.

    Returns:
        Activations (``'ff'``) or the gradient w.r.t. the input (``'bp'``).

    Raises:
        ValueError: if ``mode`` is neither ``'ff'`` nor ``'bp'``. This used
            to surface as an UnboundLocalError at the ``return``.
    """
    if mode == 'ff':
        M_quad = (X > 0.0)
        M_line = (X > 0.5)
        # quadratic region = positive but not yet linear
        M_quad = M_quad - M_line
        F = (M_line * (X - 0.25)) + (M_quad * X**2.0)
    elif mode == 'bp':
        # an output below 0.25 corresponds to the quadratic region, where
        # the slope is 2*x = 2*sqrt(A); elsewhere the slope is taken as 1
        M_quad = (X['A'] < 0.25)
        M_line = 1.0 - M_quad
        F = (2.0 * M_quad * gp.sqrt(X['A'])) + M_line
        F = F * X['dLdA']
    else:
        raise ValueError("mode must be 'ff' or 'bp', got %r" % (mode,))
    return F
  def output_and_cost(self, epoch, set_name = 'train'):       
      # Compute the network output for the current batch and, for non-training
      # sets, accumulate predictions / error / ROC inputs in self.results and
      # self.set_error_by_epoch.
      #
      # epoch:    index into self.set_error_by_epoch[set_name].
      # set_name: 'train', 'no_label_test', 'cv_predict' or another set name.
      #
      # NOTE(review): several statement pairs below assign or accumulate the
      # same target twice in a row. This looks like `else:` lines were lost;
      # confirm the intended branch structure before relying on the results.
      self.timer_logger('output_and_cost {0}'.format(set_name), time.time())  
      self.results['current'] = self.output(self.results['current'])        
      if self.problem == 'regression' and self.clip_values == 1:
          # clip values into the [0,1] range
          self.results['current']  = (self.results['current'] *(self.results['current'] >= 0)) 
          self.results['current'] = (((self.results['current'] < 1)*self.results['current']) + (self.results['current'] > 1))        
      if set_name != 'train':       
          if set_name == 'no_label_test':                  
              if 'prediction_test' not in self.results:
                  if self.problem == 'classification':
                      self.results['prediction_test'] = np.argmax(self.results['current'].as_numpy_array(),axis=1)
                      # NOTE(review): overwrites the argmax stored on the previous line
                      self.results['prediction_test'] = self.results['current'].as_numpy_array()    
                  if self.problem == 'classification':          
                      self.results['prediction_test'] = np.hstack([self.results['prediction_test'],np.argmax(self.results['current'].as_numpy_array(),axis=1)])  
                      # NOTE(review): replaces the hstack result from the previous line
                      self.results['prediction_test'] = np.vstack([self.results['prediction_test'],self.results['current'].as_numpy_array()])  
          elif set_name == 'cv_predict':
              if self.create_cv_predictions and set_name == 'cv_predict':
                  if 'prediction_cv'not in self.results:                            
                      self.results['prediction_cv'] = self.results['current'].as_numpy_array()
                      # NOTE(review): stacks the batch onto the copy stored on the previous line
                      self.results['prediction_cv'] = np.vstack([self.results['prediction_cv'],self.results['current'].as_numpy_array()])              
              if self.problem == 'classification':   
                  # count of predictions whose argmax equals self.batch_y.T
                  self.set_error_by_epoch[set_name][epoch] += (np.sum(np.equal(np.argmax(self.results['current'].as_numpy_array(),axis=1),self.batch_y.T)))
                  # NOTE(review): a root-of-squared-error term is added to the same accumulator
                  self.set_error_by_epoch[set_name][epoch] += gpu.sqrt(gpu.sum(((self.results['current']-self.batch_y)**2)*float(self.batch.shape[0]))/float(self.y.shape[1]))                     
              if self.cost == 'auc':
                  if self.problem == 'regression':
                      if set_name + ' roc_auc' not in self.results:
                          self.results[set_name + ' roc_auc'] = ([np.matrix(self.results['current'].as_numpy_array()).T, np.matrix(self.batch_y).T]) 
                          # NOTE(review): immediately extends the pair created on the previous line
                          self.results[set_name + ' roc_auc'] = [np.hstack([self.results[set_name + ' roc_auc'][0],np.matrix(self.results['current'].as_numpy_array()).T]),
                                                             np.hstack([self.results[set_name + ' roc_auc'][1],np.matrix(self.batch_y).T])]
                      if set_name + ' roc_auc' not in self.results:
                          # this variant uses column 1 of the output
                          self.results[set_name + ' roc_auc'] = ([np.matrix(self.results['current'].as_numpy_array()[:,1]).T, np.matrix(self.batch_y)])                             
                          self.results[set_name + ' roc_auc'] = [np.vstack([np.matrix(self.results[set_name + ' roc_auc'][0]),np.matrix(self.results['current'].as_numpy_array()[:,1]).T]),
                                                             np.vstack([self.results[set_name + ' roc_auc'][1],np.matrix(self.batch_y)])]
      self.timer_logger('output_and_cost {0}'.format(set_name), time.time())  
def euclidSimilar(a, b):
    """Similarity in (0, 1] derived from the Euclidean distance of a and b.

    Uses gnumpy when the module-level flag ``gpu`` equals 1 and NumPy
    otherwise. The ``else`` separating the two paths was missing: the NumPy
    code sat inside the GPU branch, so with ``gpu != 1`` the function
    returned 1.0 for every input.

    Args:
        a: first vector (1-d sequence or array).
        b: second vector, same length as ``a``.

    Returns:
        ``1 / (1 + |a - b|)``; 1.0 for identical vectors.
    """
    global gpu
    if gpu == 1:
#        print "gpu"
        a = gnumpy.garray(a)
        b = gnumpy.garray(b)
        c = a - b
        res = gnumpy.sqrt(gnumpy.dot(c, c))
    else:
        a = numpy.array(a)
        b = numpy.array(b)
        c = a - b
        res = numpy.sqrt(numpy.dot(c, c))
    return 1.0 / (1 + res)
def rmssd(z, targets, predict=False, error=False, addon=0):
    """Root mean sum of squares.

    The loss is the mean over rows of ``sqrt(sum((z - targets)**2) + 1e-8)``
    plus ``addon``.

    Two defects were fixed: the description was bare text (a syntax error),
    and the "error only" return sat unreachable after the first ``return``,
    so ``error=False`` returned nothing.

    Args:
        z: network output, a 2-d array with one sample per row.
        targets: target values with the same shape as ``z``.
        predict: when true, return ``z`` unchanged.
        error: when true, also return the derivative of the loss w.r.t. ``z``.
        addon: constant added to the loss.

    Returns:
        ``z`` if ``predict``; ``(loss, derivative)`` if ``error``; otherwise
        the loss alone.
    """
    if predict:
        return z
    n, _ = z.shape
    err = z - targets
    # 1e-8 keeps the derivative finite for perfectly reconstructed rows
    per_sample = gpu.sqrt(gpu.sum(err**2, axis=1) + 1e-8)

    if error:
        # rec. error + first deriv
        return gpu.sum(per_sample) / n + addon, err / (
            n * per_sample[:, gpu.newaxis])
    # only return reconstruction error
    return gpu.sum(per_sample) / n + addon
    def bound_weights(self, Wm, wt_bnd):
        """Bound L2 (row-wise) norm of the weights in Wm by wt_bnd.

        This returns a garray if passed a garray, and performs all ops on the
        GPU if that is the case. Otherwise, it returns a numpy array, or if
        something besides an ndarray/garray was passed, it crashes (probably).

        The docstring was never closed, which swallowed the body; it is now
        terminated.

        Args:
            Wm: 2-d weight matrix, one node per row.
            wt_bnd: upper bound for the L2 norm of each row.

        Returns:
            A rescaled copy of Wm; rows already within the bound are unchanged.
        """
        EPS = 0.00000001
        # Compute L2 norm of weights inbound to each node in this layer
        w_norms = gp.sqrt(gp.sum(Wm**2,axis=1) + EPS)
        # Compute scales based on norms and the upperbound set by wt_bnd
        w_scales = wt_bnd / w_norms
        mask = (w_scales < 1.0)
        # keep the shrink factor where needed, use exactly 1.0 elsewhere
        w_scales = (w_scales * mask) + (1.0 - mask)
        w_scales = w_scales[:,gp.newaxis]
        # Rescale weights to meet the bound set by wt_bnd
        Wm = Wm * w_scales
        return Wm
    def bound_weights(self, Wm, wt_bnd):
        """Bound L2 (row-wise) norm of the weights in Wm by wt_bnd.

        This returns a garray if passed a garray, and performs all ops on the
        GPU if that is the case. Otherwise, it returns a numpy array, or if
        something besides an ndarray/garray was passed, it crashes (probably).

        The docstring was never closed, which swallowed the body; it is now
        terminated.

        Args:
            Wm: 2-d weight matrix, one node per row.
            wt_bnd: upper bound for the L2 norm of each row.

        Returns:
            A rescaled copy of Wm; rows already within the bound are unchanged.
        """
        EPS = 0.00000001
        # Compute L2 norm of weights inbound to each node in this layer
        w_norms = gp.sqrt(gp.sum(Wm**2, axis=1) + EPS)
        # Compute scales based on norms and the upperbound set by wt_bnd
        w_scales = wt_bnd / w_norms
        mask = (w_scales < 1.0)
        # keep the shrink factor where needed, use exactly 1.0 elsewhere
        w_scales = (w_scales * mask) + (1.0 - mask)
        w_scales = w_scales[:, gp.newaxis]
        # Rescale weights to meet the bound set by wt_bnd
        Wm = Wm * w_scales
        return Wm
    def computeStat(self):
        """Compute per-dimension mean and standard deviation over all batches.

        Batches are pulled from ``self.cache.getOneBatch()`` until it returns
        ``None``. The overall variance is the mean of the per-batch variances
        plus the variance of the per-batch means (exact only when all batches
        have the same size -- presumably the case; confirm). Dimensions with
        zero standard deviation get the average standard deviation instead.

        Fixes: the end-of-data check had lost its body (syntax error, now
        ``break``), and it compared with ``== None``, which is not an
        identity test and may be evaluated element-wise on array types.

        Returns:
            ``(mean, std + 1e-10)``.
        """
        print('Computing stats (mean and std)...')
        means = gp.zeros((self.numbatches, self.dim))
        variances = gp.zeros((self.numbatches, self.dim))
        i = 0
        while True:
            batch = self.cache.getOneBatch()
            if batch is None:
                break
            means[i] = batch.mean(axis=0)
            variances[i] = gp.std(batch, axis=0)**2
            i += 1
        assert (i == self.numbatches)
        mean = means.mean(axis=0)
        std = gp.sqrt(variances.mean(axis=0) + gp.std(means, axis=0)**2)
        mean_std = std.mean()
        # replace zero deviations by the average one
        std += (std == 0.0) * mean_std

        print('Finish stats computing')
        return mean, std + 1e-10
def unigram_partition(data_path, num_ensembles, model_name, method = 'none', train = True, random_training_order = False, reverse_order = False):
		# Train `num_ensembles` PTB language models one after another. After each
		# model, per-token boosting weights are updated, averaged per sentence and
		# then per group of sentences sharing one token id; later models are
		# trained on the highest-weighted groups. Per-model probabilities and
		# bookkeeping files are written under 'simple-examples/ckpt/'.
		#
		# NOTE(review): this function does not parse as written. Several compound
		# statements have no body, and several `if` branches are immediately
		# followed by a second assignment that overwrites the first, which looks
		# like dropped `else:` lines. Each spot is flagged inline below; confirm
		# against the original before relying on any of it.
		algo_name = 'unigram' + '_' + 'partition'

		raw_data = reader.ptb_raw_data(data_path)
		train_data, valid_data, test_data, _, word_to_id = raw_data

		# algo_name is used below to build the checkpoint folder names.
		if reverse_order:
				algo_name = 'reverse ' + algo_name

		eos_id = word_to_id['<eos>']
		# Start from uniform per-token (case) weights.
		case_weight_length = len(train_data)-1
		train_case_weights = np.repeat(1.0/case_weight_length, case_weight_length).tolist()

		train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
		if random_training_order:
				#train_sentence_list = reader.get_sentence_list(train_data, eos_id)
				# NOTE(review): `perm` is the identity order; no shuffle is visible
				# here, so train_data is rebuilt in the original sentence order.
				perm = range(len(train_sentence_list))
				train_data = []
				for idx in perm:
						train_data += train_sentence_list[idx]
				train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
		num_sent = len(train_sentence_list)
		# Start from uniform per-sentence weights.
		train_sentence_weights = np.repeat(1.0/num_sent, num_sent).tolist()

		new_train_data = train_data
		FLAGS.model = model_name

		# eval_config evaluates one token at a time (batch_size 1, num_steps 1).
		config = get_config()
		eval_config = get_config()
		eval_config.batch_size = 1
		eval_config.num_steps = 1

		alpha_t_list = []
		full_test_set_logits = []
		# NOTE(review): the loop below has no body (syntax error); the statement
		# that presumably filled full_test_set_logits is missing.
		for i in range(len(test_data)-1):
		sentence_starters = []
		id_to_sentence_num_dict = {}
		for i in range(num_sent):
				# NOTE(review): the second assignment overwrites the first, so as
				# written desired_id is always the first token; presumably an
				# `else:` is missing between them.
				if reverse_order:
						desired_id = train_sentence_list[i][-1]
						desired_id = train_sentence_list[i][0]
				# NOTE(review): as written a key is only (re)set when it already
				# exists, so the dict never gains entries and sentence_starters is
				# never filled; an append/else branch appears to be missing.
				if desired_id in id_to_sentence_num_dict:
						id_to_sentence_num_dict[desired_id] = [i]

		# Every group starts out assigned to model 1.
		id_to_model = {}
		for idx in sentence_starters:
				id_to_model[idx] = [1]

		id_to_weight = {}

		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5)

		for iii in range(num_ensembles):
				# A fresh graph and session per ensemble member.
				with tf.Graph().as_default(), tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as session:
						initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

						with tf.variable_scope("model", reuse=None, initializer=initializer):
								# NOTE(review): `sess` is not used or closed anywhere in
								# the visible code; `session` above is the one in use.
								sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
								m = PTBModel(is_training=True, config=config)

						# Evaluation models share the training model's variables.
						with tf.variable_scope("model", reuse=True, initializer=initializer):
								mvalid = PTBModel(is_training=False, config=config)
								mtest = PTBModel(is_training=False, config=eval_config)
								m2 = PTBModel(is_training=False, config=eval_config)

						saver = tf.train.Saver()
						# At this point checkpoint_dir and test_set_probs_no_alpha still
						# hold the previous iteration's values (both are reassigned
						# further down).
						if iii > 0:
								np.savetxt(checkpoint_dir + 'test_set_probs_no_alpha.out', np.squeeze(test_set_probs_no_alpha), delimiter = ',')

						# NOTE(review): the second assignment overwrites the first;
						# presumably an `else:` is missing.
						if random_training_order:
								new_folder = 'random training order ' + algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
								new_folder = algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
						checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
						# NOTE(review): this `if` has no body (syntax error); presumably
						# the directory was created here.
						if not os.path.exists(checkpoint_dir):

						if iii > 0:
								# Average sentence weight per group.
								for k,v in id_to_sentence_num_dict.items():
										total_wt = 0
										num_sentences = len(v)
										for sent_idx in v:
												total_wt += train_sentence_weights[sent_idx]
										id_to_weight[k] = total_wt/num_sentences

								# Take groups in descending weight order until at least
								# floor(num_sent/2) sentences have been collected.
								sorted_id_to_weight = sorted(id_to_weight.items(), key = operator.itemgetter(1), reverse = True)
								np.savetxt(checkpoint_dir + 'sorted_id_to_weight', sorted_id_to_weight, delimiter = ',')
								new_train_data = []
								i = 0
								sent_included = 0                      
								while sent_included < np.floor(num_sent/2):
										start_key = sorted_id_to_weight[i][0]
										sentence_additions = id_to_sentence_num_dict[start_key]
										for idx in sentence_additions:
												new_train_data += train_sentence_list[idx]
												sent_included += 1
										# Record that this group is handled by model iii + 1.
										id_to_model[start_key].append(iii + 1)
										i += 1


						if train:

								np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter = ',')
								np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter = ',')

								for i in range(config.max_max_epoch):
										# Learning rate decays once i exceeds config.max_epoch.
										lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
										m.assign_lr(session, config.learning_rate * lr_decay)

										print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
										train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
										print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
										valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
										print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
										# Checkpoint every 5 epochs and after the last epoch.
										if (i+1) % 5 == 0 or (i+1) == config.max_max_epoch:
												saver.save(session, checkpoint_dir + 'model.ckpt', global_step = i+1)
								# NOTE(review): as written the restore runs right after
								# training, inside `if train:`; it may originally have been
								# the `else:` branch for train == False. Confirm.
								ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
								if ckpt and ckpt.model_checkpoint_path:
										saver.restore(session, ckpt.model_checkpoint_path)

						if train:

								case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())

								# epsilon_t is half the ratio of the summed case scores to
								# `norm` = len(case_scores) * log(vocab_size); alpha_t is
								# derived from epsilon_t below.
								# NOTE(review): `gpu` is not defined in the visible code.
								score = sum(case_scores)
								norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
								epsilon_t =  (1 - (norm - score) / norm) / 2.0
								alpha_t = 0.5 * gpu.log((1 - epsilon_t)/ epsilon_t)

								# NOTE(review): the paths below always use the
								# 'random training order ' prefix, regardless of the
								# random_training_order argument.
								if iii == 0:
										shutil.rmtree('simple-examples/ckpt/' + 'random training order ' + algo_name + '_' + model_name + '/' + 'alpha_t.out', ignore_errors = True)

								# NOTE(review): this `with` has no body (syntax error);
								# presumably alpha_t was written to the file here.
								with open('simple-examples/ckpt/' + 'random training order ' + algo_name + '_' + model_name + '/' + 'alpha_t.out', 'ab') as f:
								# Re-weight each case by its score, then L1-normalise.
								train_case_weights = gpu.sqrt((1 - epsilon_t)/ epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
								train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1,-1), norm = 'l1'))
								# NOTE(review): the last assignment sits inside the `elif`
								# and overwrites the sqrt_norm result; for any other method
								# (including the default 'none') new_train_case_weights is
								# never assigned in this iteration. Presumably an `else:`
								# is missing.
								if method == 'stddev':
										new_train_case_weights = reject_outliers(train_case_weights)
								elif method == 'sqrt':
										new_train_case_weights = sqrt_norm(train_case_weights)
										new_train_case_weights = train_case_weights
								start_idx = 0

								# Sentence weight = mean of the finite case weights of
								# its tokens.
								for i in range(len(train_sentence_list)):
										this_sentence_length = len(train_sentence_list[i])
										sentence_tokens = [v for v in new_train_case_weights[start_idx:(this_sentence_length+start_idx)] if np.isfinite(v)]
										# NOTE(review): `this_sentence_weights` is not defined
										# in the visible code, and the np.mean below is taken
										# over an empty list in this branch; presumably the
										# target was train_sentence_weights and an `else:` is
										# missing.
										if len(sentence_tokens) == 0:
												this_sentence_weights[i] = 0
												train_sentence_weights[i] = np.mean([v for v in new_train_case_weights[start_idx:(this_sentence_length+start_idx)] if np.isfinite(v)])

										start_idx += this_sentence_length

								train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1,-1), norm = 'l1'))

								# Refresh the average weight per group.
								for k,v in id_to_sentence_num_dict.items():
										total_wt = 0
										num_sentences = len(v)
										for sent_idx in v:
												total_wt += train_sentence_weights[sent_idx]
										id_to_weight[k] = total_wt/num_sentences

						# Save this model's test-set and training-set probabilities.
						test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op(), partition = True)
						test_set_probs_no_alpha = test_set_probs
						np.savetxt(checkpoint_dir + 'test_set_probs_no_alpha.out', np.squeeze(test_set_probs_no_alpha), delimiter = ',')

						train_set_probs = output_test_set_probs(session, m2, train_data, tf.no_op(), partition = True)
						train_set_probs_no_alpha = train_set_probs
						np.savetxt(checkpoint_dir + 'train_set_probs_no_alpha.out', np.squeeze(train_set_probs_no_alpha), delimiter = ',')            
						test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
						print("Test Perplexity: %.3f" % test_perplexity)
		# NOTE(review): the second assignment overwrites the first; presumably an
		# `else:` is missing.
		if random_training_order:
				new_folder = 'random training order ' + algo_name + '_' + model_name
				new_folder = algo_name + '_' + model_name
		checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
		# NOTE(review): ','.join(str(list)) joins the individual characters of the
		# list's repr (brackets, commas and spaces included), not its elements.
		with open(checkpoint_dir + 'id_to_model.out', 'w') as f:
				for k,v in id_to_model.items():
						f.write(str(k) + ',' + ','.join(str(id_to_model[k])) + '\n')

		with open(checkpoint_dir + 'id_to_sent_num.out', 'w') as ff:
				for k,v in id_to_sentence_num_dict.items():
						ff.write(str(k) + ',' + ','.join(str(id_to_sentence_num_dict[k])) + '\n')

		# NOTE(review): `fp` is a hard-coded folder and does not depend on
		# algo_name, model_name or random_training_order.
		print('Test PPL: ' + str(evaluate_unigram_partition(data = test_data, batch_size = 1, num_steps = 1, num_ensembles = num_ensembles, eos_id = eos_id, fp = 'simple-examples/ckpt/random training order unigram_partition_small/', probs_fn = 'test_set_probs_no_alpha.out')))
		print('Train PPL: ' + str(evaluate_unigram_partition(data = train_data, batch_size = 1, num_steps = 1, num_ensembles = num_ensembles, eos_id = eos_id, fp = 'simple-examples/ckpt/random training order unigram_partition_small/', probs_fn = 'train_set_probs_no_alpha.out')))
def columnRMS(W):
    """Return the root-mean-square of each column of W (reduction over axis 0)."""
    squared = W * W
    mean_of_squares = gnp.mean(squared, axis=0)
    return gnp.sqrt(mean_of_squares)
def sqrt(x):
    """Return the square root of x, dispatching on the array backend.

    Inputs for which ``is_np(x)`` is true are handled by ``np.sqrt``; every
    other input is passed to ``gp.sqrt``.
    """
    if is_np(x):
        return np.sqrt(x)
    # Previously this return was nested under the `if` after the numpy return,
    # so it was unreachable and non-numpy inputs yielded None.
    return gp.sqrt(x)
 def task_loss(self, Y, Z, A=None, task_loss_fn=None):
     """Return the task loss between Y and Z.

     When no ``task_loss_fn`` is given, the loss is the root mean square
     error ``sqrt(mean((Y - Z)**2))``. Otherwise the result of
     ``task_loss_fn(Y, Z, A)`` is returned.
     """
     # root mean square error
     if task_loss_fn is None:
         return gnp.sqrt(((Y-Z)**2).mean())
     # Previously this return was nested under the `if` after the RMSE return,
     # so a caller-supplied loss function was never invoked.
     return task_loss_fn(Y, Z, A)
 def sampleStates(self, acts):
     """Activate acts after adding Gaussian noise.

     With ``self.krizNoise`` set, the noise is a plain ``gnp.randn`` draw of
     the same shape as acts. Otherwise the draw is scaled elementwise by
     ``sqrt(sigmoid(acts) + 1e-30)`` before being added.
     """
     noise = gnp.randn(*acts.shape)
     if not self.krizNoise:
         # The tiny offset keeps the argument of sqrt strictly positive.
         tiny = 1e-30
         noise = gnp.sqrt(acts.sigmoid() + tiny) * noise
     return self.activate(acts + noise)
    def compute_kernel_matrix(self, x):
        """Return the Gram matrix of x scaled by pairwise sums of row norms.

        x is converted to a garray if it is not one already. Entry (i, j) of
        the result is ``x[i] . x[j] / (|x[i]| + |x[j]| + 1e-20)``, where
        ``|.|`` is the L2 norm of a row.
        """
        if not isinstance(x, gnp.garray):
            x = gnp.garray(x)
        row_norms = gnp.sqrt((x**2).sum(axis=1))
        # Broadcast a column against a row to get every pairwise sum; the
        # small constant keeps the denominator non-zero.
        denom = row_norms[:, gnp.newaxis] + row_norms[gnp.newaxis, :] + 1e-20
        gram = x.dot(x.T)
        return gram / denom
def ABISS(data_path, num_ensembles, model_name, method = 'stddev', train = True, random_training_order = False):
		# Train `num_ensembles` PTB language models one after another. After each
		# model, per-token boosting weights are updated and averaged per sentence;
		# the next model's training data is drawn with
		# reader.weighted_sentence_selection. Test-set outputs are accumulated
		# weighted by alpha_t and passed to classify_ensemble at the end.
		#
		# NOTE(review): this function does not parse as written. Several compound
		# statements have no body, and some `if` branches are immediately followed
		# by a second assignment that overwrites the first, which looks like
		# dropped `else:` lines. Each spot is flagged inline below; confirm
		# against the original before relying on any of it.
		algo_name = method + ' ' + 'ABISS'

		raw_data = reader.ptb_raw_data(data_path)
		train_data, valid_data, test_data, _, word_to_id = raw_data

		eos_id = word_to_id['<eos>']
		# Start from uniform per-token (case) weights.
		case_weight_length = len(train_data)-1
		train_case_weights = np.repeat(1.0/case_weight_length, case_weight_length).tolist()

		train_sentence_list = reader.get_sentence_list(train_data, eos_id)
		if random_training_order:
				# NOTE(review): `perm` is the identity order; no shuffle is visible
				# here, so train_data is rebuilt in the original sentence order.
				perm = range(len(train_sentence_list))
				train_data = []
				for idx in perm:
						train_data += train_sentence_list[idx]
				train_sentence_list = reader.get_sentence_list(train_data, eos_id)
		num_sent = len(train_sentence_list)
		# Start from uniform per-sentence weights.
		train_sentence_weights = np.repeat(1.0/num_sent, num_sent).tolist()

		new_train_data = reader.weighted_sentence_selection(train_sentence_list, train_sentence_weights, random_training_order)
		FLAGS.model = model_name

		# eval_config evaluates one token at a time (batch_size 1, num_steps 1).
		config = get_config()
		eval_config = get_config()
		eval_config.batch_size = 1
		eval_config.num_steps = 1

		alpha_t_list = []
		full_test_set_logits = []
		# NOTE(review): the loop below has no body (syntax error); the statement
		# that presumably filled full_test_set_logits is missing.
		for i in range(len(test_data)-1):
		gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.33)

		for iii in range(num_ensembles):
				# A fresh graph and session per ensemble member.
				with tf.Graph().as_default(), tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as session:
						initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

						with tf.variable_scope("model", reuse=None, initializer=initializer):
								# NOTE(review): `sess` is not used or closed anywhere in
								# the visible code; `session` above is the one in use.
								sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
								m = PTBModel(is_training=True, config=config)

						# Evaluation models share the training model's variables.
						with tf.variable_scope("model", reuse=True, initializer=initializer):
								mvalid = PTBModel(is_training=False, config=config)
								mtest = PTBModel(is_training=False, config=eval_config)
								#config.batch_size = 1
								m2 = PTBModel(is_training=False, config=eval_config)

						saver = tf.train.Saver()
						# NOTE(review): the second assignment overwrites the first;
						# presumably an `else:` is missing.
						if random_training_order:
								new_folder = 'random training order' + algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
								new_folder = algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
						checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
						# NOTE(review): this `if` has no body (syntax error); presumably
						# the directory was created here.
						if not os.path.exists(checkpoint_dir):

						if train:
								np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter = ',')
								np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter = ',')

								for i in range(config.max_max_epoch):
										# Learning rate decays once i exceeds config.max_epoch.
										lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
										m.assign_lr(session, config.learning_rate * lr_decay)

										print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
										train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
										print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
										valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
										print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
										# Checkpoint every 5 epochs and after the last epoch.
										if (i+1) % 5 == 0 or (i+1) == config.max_max_epoch:
												saver.save(session, checkpoint_dir + 'model.ckpt', global_step = i+1)
								# NOTE(review): as written the restore runs right after
								# training, inside `if train:`; it may originally have been
								# the `else:` branch for train == False. Confirm.
								ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
								if ckpt and ckpt.model_checkpoint_path:
										saver.restore(session, ckpt.model_checkpoint_path)

						case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())

						# epsilon_t is half the ratio of the summed case scores to
						# `norm` = len(case_scores) * log(vocab_size); alpha_t is
						# derived from epsilon_t below.
						# NOTE(review): `gpu` is not defined in the visible code.
						score = sum(case_scores)
						norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
						epsilon_t =  (1 - (norm - score) / norm) / 2.0
						alpha_t = 0.5 * gpu.log((1 - epsilon_t)/ epsilon_t)

						if iii == 0:
								shutil.rmtree('simple-examples/ckpt/' + algo_name + '_' + model_name + '/' + 'alpha_t.out', ignore_errors = True)

						# NOTE(review): this `with` has no body (syntax error);
						# presumably alpha_t was written to the file (and appended to
						# alpha_t_list) here.
						with open('simple-examples/ckpt/' + algo_name + '_' + model_name + '/' + 'alpha_t.out', 'ab') as f:
						# Re-weight each case by its score, then L1-normalise.
						train_case_weights = gpu.sqrt((1 - epsilon_t)/ epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
						train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1,-1), norm = 'l1'))

						# NOTE(review): for any method other than 'stddev' or 'sqrt',
						# new_train_case_weights is not assigned in this iteration.
						if method == 'stddev':
								new_train_case_weights = reject_outliers(train_case_weights)
						elif method == 'sqrt':
								new_train_case_weights = sqrt_norm(train_case_weights)
						start_idx = 0

						# Sentence weight = mean of the finite case weights of its
						# tokens.
						for i in range(len(train_sentence_list)):
								this_sentence_length = len(train_sentence_list[i])
								sentence_tokens = [v for v in new_train_case_weights[start_idx:(this_sentence_length+start_idx)] if np.isfinite(v)]
								# NOTE(review): `this_sentence_weights` is not defined in
								# the visible code, and the np.mean below is taken over an
								# empty list in this branch; presumably the target was
								# train_sentence_weights and an `else:` is missing.
								if len(sentence_tokens) == 0:
										this_sentence_weights[i] = 0
										train_sentence_weights[i] = np.mean([v for v in new_train_case_weights[start_idx:(this_sentence_length+start_idx)] if np.isfinite(v)])

								start_idx += this_sentence_length

						train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1,-1), norm = 'l1'))

						# Draw the next model's training data from the new weights.
						new_train_data = reader.weighted_sentence_selection(train_sentence_list, train_sentence_weights, random_training_order)

						test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op())

						# Accumulate this model's test outputs weighted by alpha_t.
						for i in range(len(test_set_probs)):
								test_set_probs[i] = test_set_probs[i] * alpha_t
								full_test_set_logits[i] += test_set_probs[i]
						test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
						print("Test Perplexity: %.3f" % test_perplexity)
		# Normalise the accumulated outputs by the sum of the alpha_t values.
		# NOTE(review): nothing in the visible code appends to alpha_t_list, so
		# alpha_t_sum would be 0 as written.
		alpha_t_sum = np.sum(alpha_t_list)
		for i in range(len(full_test_set_logits)):
				full_test_set_logits[i] = full_test_set_logits[i] / alpha_t_sum

		# NOTE(review): ensemble_perplexity is neither returned nor printed in the
		# visible code.
		ensemble_perplexity = classify_ensemble(test_data, full_test_set_logits, 1, 1)
    def output_and_cost(self, epoch, set_name='train'):
        # Run self.output on self.results['current'] and, for non-training
        # sets, collect predictions and accumulate an error for `epoch` in
        # self.set_error_by_epoch[set_name].
        #
        # NOTE(review): many expressions below are truncated (unbalanced
        # brackets, missing `else:` lines), so this method does not parse as
        # written; restore the missing lines from the original before editing
        # the logic.

        # timer_logger is called with the same label here and on the last
        # line; presumably the pair starts and stops a timer - confirm.
        self.timer_logger('output_and_cost {0}'.format(set_name), time.time())

        self.results['current'] = self.output(self.results['current'])
        if self.problem == 'regression' and self.clip_values == 1:
            # clip values into the [0,1] range
            # First step: negative values are multiplied by 0.
            self.results['current'] = (self.results['current'] *
                                       (self.results['current'] >= 0))
            # Second step: values below 1 are kept, values above 1 become 1.
            # NOTE(review): a value exactly equal to 1 matches neither
            # `< 1` nor `> 1` and therefore becomes 0.
            self.results['current'] = ((
                (self.results['current'] < 1) * self.results['current']) +
                                       (self.results['current'] > 1))

        if set_name != 'train':
            if set_name == 'no_label_test':
                if 'prediction_test' not in self.results:
                    if self.problem == 'classification':
                        self.results['prediction_test'] = np.argmax(
                            self.results['current'].as_numpy_array(), axis=1)
                        self.results['prediction_test'] = self.results[
                    if self.problem == 'classification':
                        self.results['prediction_test'] = np.hstack([
                        self.results['prediction_test'] = np.vstack([
            elif set_name == 'cv_predict':
                if self.create_cv_predictions and set_name == 'cv_predict':
                    if 'prediction_cv' not in self.results:
                        self.results['prediction_cv'] = self.results[
                        self.results['prediction_cv'] = np.vstack([
                if self.problem == 'classification':
                    self.set_error_by_epoch[set_name][epoch] += (np.sum(
                                      axis=1), self.batch_y.T)))
                    self.set_error_by_epoch[set_name][epoch] += gpu.sqrt(
                        gpu.sum(((self.results['current'] - self.batch_y)**2) *
                                float(self.batch.shape[0])) /

                if self.cost == 'auc':
                    if self.problem == 'regression':
                        if set_name + ' roc_auc' not in self.results:
                            self.results[set_name + ' roc_auc'] = ([
                            self.results[set_name + ' roc_auc'] = [
                                    self.results[set_name + ' roc_auc'][0],
                                    self.results[set_name + ' roc_auc'][1],

                        if set_name + ' roc_auc' not in self.results:
                            self.results[set_name + ' roc_auc'] = ([
                                    [:, 1]).T,
                            self.results[set_name + ' roc_auc'] = [
                                    np.matrix(self.results[set_name +
                                                           ' roc_auc'][0]),
                                              as_numpy_array()[:, 1]).T
                                    self.results[set_name + ' roc_auc'][1],

        self.timer_logger('output_and_cost {0}'.format(set_name), time.time())