Code Example #1
File: activation.py Project: magic2du/dlnn
def maxout_func(n_out, last_start, pool_size, rectify, lin_output):
    # Maxout: element-wise maximum over each group of `pool_size` columns.
    tmp_output = lin_output[:, 0:last_start+1:pool_size]
    for i in range(1, pool_size):
        cur = lin_output[:, i:last_start+i+1:pool_size]
        tmp_output = T.maximum(cur, tmp_output)
    if rectify:
        tmp_output = T.maximum(0, tmp_output)  # optional ReLU on top of the maxout units
    return tmp_output
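A minimal usage sketch (not from the magic2du/dlnn project; shapes and values below are made up) showing what the fixed function computes: with pool_size=2 over six columns, each output column is the maximum of one pair of adjacent input columns.

import numpy as np
import theano
import theano.tensor as T

lin = T.matrix('lin_output')
# pool_size=2 over 6 columns -> 3 maxout units; last_start = 6 - pool_size = 4
out = maxout_func(n_out=3, last_start=4, pool_size=2, rectify=False, lin_output=lin)
f = theano.function([lin], out)
print(f(np.arange(12, dtype=theano.config.floatX).reshape(2, 6)))
# -> [[ 1.  3.  5.]
#     [ 7.  9. 11.]]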
Code Example #2
File: policy_rnn.py Project: kyunghyuncho/gym
    def grad_init(self):
        #self.mov_std = theano.shared(numpy.float32(1.), 'std')

        rewards_ = self.rewards[0]
        mean_rewards = rewards_.mean()
        var_rewards = rewards_.var()

        pp = self.params.values()

        #mean_rewards = (self.mask * self.rewards).sum(-1, keepdims=True) / tensor.maximum(1., self.mask.sum(-1, keepdims=True))
        ##centered_rewards = self.rewards - self.vapprox.v[:,:,0] - mean_rewards
        centered_rewards = rewards_ - mean_rewards - self.vapprox.v[:,0] 
        #mean2_rewards = (self.mask * (self.rewards ** 2)).sum(-1, keepdims=True) / tensor.maximum(1., self.mask.sum(-1, keepdims=True))
        #var_rewards = mean2_rewards - (mean_rewards ** 2)
        scaled_rewards = centered_rewards  / tensor.maximum(1., tensor.sqrt(tensor.maximum(0., var_rewards)))

        logprob = 0.
        reg = 0.
        for oi in xrange(self.n_out):
            labs = self.actions[:,:,oi].flatten()
            labs_idx = tensor.arange(labs.shape[0]) * self.out_dim + labs
            logprob = logprob + ((self.mask * 
                                  tensor.log(self.pi[oi].flatten()+1e-6)[labs_idx]
                                  .reshape([self.actions.shape[0], 
                                            self.actions.shape[1]])).sum(0))
            reg = reg - (self.pi[oi] * tensor.log(self.pi[oi]+1e-6)).sum(-1).sum(0)

        self.cost = -tensor.mean(scaled_rewards * logprob + self.reg_c * reg)
        self.grads = tensor.grad(self.cost, wrt=pp)
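A hedged standalone sketch (not part of the kyunghyuncho/gym code; the reward values are made up) of the scaling pattern above: T.maximum floors the variance at 0 and the normaliser at 1, so a low-variance batch of rewards is only centered, never amplified.

import numpy as np
import theano
import theano.tensor as tensor

r = tensor.vector('rewards')
scaled = (r - r.mean()) / tensor.maximum(1., tensor.sqrt(tensor.maximum(0., r.var())))
f = theano.function([r], scaled)
print(f(np.asarray([0., 1., 2.], dtype=theano.config.floatX)))   # std < 1, so output is just centered: [-1. 0. 1.]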
Code Example #3
    def penalty(self):

        penTerms = []
        objects = self.scene_obj.shapes
        for i in xrange(len(objects)-1):

            obj1 = objects[i]
            for j in xrange(i+1, len(objects)):  # start at i+1 so an object is not paired with itself

                obj2 = objects[j]

                #Distance check between objects
                center1 = obj1.o2w.m[:,3]
                center2 = obj2.o2w.m[:,3]
                radius1 = T.maximum(obj1.o2w.m[0,0], obj1.o2w.m[1,1])
                radius2 = T.maximum(obj2.o2w.m[0,0], obj2.o2w.m[1,1])

                max_rad = T.maximum(radius1, radius2)

                #TODO remake it for batch
                dist = T.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
                penflag = T.switch(dist < max_rad, 1, 0)

                #Computing the overlapping area of two circles (is not working...)
                #R = radius1
                #r = radius2
                #A = r**2 * T.arccos( (dist**2 + r**2 - R**2)/(2*dist*r)) \
                #        + R**2 * T.arccos( (dist**2-R**2 +r**2) / (2*dist*R) )\
                #        - 0.5 * T.sqrt((-dist + r +R)*(dist+r+R)*(dist-r+R)*(dist+r+R))

                # accumulate one penalty term per pair instead of overwriting penTerms
                penTerms.append(penflag * (np.pi * max_rad**2)*2)  # + (1-penflag) * A

        return sum(penTerms)  # total penalty over all object pairs
Code Example #4
File: models.py Project: medusaGit/iaf
 def f_encode_decode(w, train=True):
     
     results = {}
     
     h = x_enc(_x - .5, w)
     
     obj_kl = G.sharedf(0.)
     
     # bottom-up encoders
     for i in range(len(depths)):
         for j in range(depths[i]):
             h = layers[i][j].up(h, w)
     
     # top-level activations
     h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'), (_x.shape[0],1,shape_x[1]/2**len(depths), shape_x[2]/2**len(depths)))
     
     # top-down priors, posteriors and decoders
     for i in list(reversed(range(len(depths)))):
         for j in list(reversed(range(depths[i]))):
             h, kl = layers[i][j].down_q(h, train, w)
             kl_sum = kl.sum(axis=(1,2,3))
             results['cost_z'+str(i).zfill(3)+'_'+str(j).zfill(3)] = kl_sum
             # Constraint: Minimum number of bits per featuremap, averaged across minibatch
             if kl_min > 0:
                 kl = kl.sum(axis=(2,3)).mean(axis=0,dtype=G.floatX)
                 obj_kl += T.maximum(np.asarray(kl_min,G.floatX), kl).sum(dtype=G.floatX)
             else:
                 obj_kl += kl_sum
     
     output = x_dec(x_dec_nl(h, w), w)
     
     # empirical distribution
     if px == 'logistic':
         mean_x = T.clip(output+.5, 0, 1)
         logsd_x = 0*mean_x + w['logsd_x']
         obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/256., _x).logp
         #obj_z = T.printing.Print('obj_z')(obj_z)
         obj = obj_logpx - obj_kl
         # Compute the bits per pixel
         obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')
         
         #if not '__init' in w:
         #    raise Exception()
         
     
     elif px == 'bernoulli':
         prob_x = T.nnet.sigmoid(output)
         prob_x = T.maximum(T.minimum(prob_x, 1-1e-7), 1e-7)
         #prob_x = T.printing.Print('prob_x')(prob_x)
         obj_logpx = N.rand.bernoulli(prob_x, _x).logp
         
         #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
         #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
         #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
         obj = obj_logpx - obj_kl
         #obj = T.printing.Print('obj')(obj)
         
     results['cost_x'] = -obj_logpx
     results['cost'] = -obj
     return results
Code Example #5
File: NetworkTwoDLayer.py Project: rwth-i6/returnn
  def __init__(self, factor=numpy.sqrt(2), decay=1.0, min_factor=None, padding=False, **kwargs):
    super(ConvFMPLayer, self).__init__(**kwargs)
    if min_factor is None:
      min_factor = factor
    factor = T.maximum(factor * (decay ** self.network.epoch), numpy.float32(min_factor))
    sizes_raw = self.source.output_sizes

    # handle size problems
    if not padding:
      padding = T.min(self.source.output_sizes / factor) <= 0
      padding = theano.printing.Print(global_fn=maybe_print_pad_warning)(padding)

    fixed_sizes = T.maximum(sizes_raw, T.cast(T.as_tensor(
      [factor + self.filter_height - 1, factor + self.filter_width - 1]), 'float32'))
    sizes = ifelse(padding, fixed_sizes, sizes_raw)
    X_size = T.cast(T.max(sizes, axis=0), "int32")

    def pad_fn(x_t, s):
      x = T.alloc(numpy.cast["float32"](0), X_size[0], X_size[1], self.X.shape[3])
      x = T.set_subtensor(x[:s[0], :s[1]], x_t[:s[0], :s[1]])
      return x

    fixed_X, _ = theano.scan(pad_fn, [self.X.dimshuffle(2, 0, 1, 3), T.cast(sizes_raw, "int32")])
    fixed_X = fixed_X.dimshuffle(1, 2, 0, 3)
    self.X = ifelse(padding, T.unbroadcast(fixed_X, 3), self.X)

    conv_out = CuDNNConvHWBCOpValidInstance(self.X, self.W, self.b)
    conv_out_sizes = self.conv_output_size_from_input_size(sizes)
    self.output, self.output_sizes = fmp(conv_out, conv_out_sizes, T.cast(factor,'float32'))
Code Example #6
    def lp_norm(self, n, k, r, c, z):
        '''
        Lp pooling: Lp = (1/n * sum(|x_i|^p, i=1..n))^(1/p) where p = 1 + ln(1+e^P),
        so that p is always >= 1.
        :param n: sample (minibatch) index
        :param k: channel (feature map) index
        :param r: row index of the output element
        :param c: column index of the output element
        :param z: output tensor that the pooled value is written into
        :return: `z` with z[n, k, r, c] set to the Lp norm of the pooling window
        '''
        ds0, ds1 = self.pool_size
        st0, st1 = self.stride
        pad_h = self.pad[0]
        pad_w = self.pad[1]

        row_st = r * st0
        row_end = T.minimum(row_st + ds0, self.img_rows)
        row_st = T.maximum(row_st, self.pad[0])
        row_end = T.minimum(row_end, self.x_m2d + pad_h)

        col_st = c * st1
        col_end = T.minimum(col_st + ds1, self.img_cols)
        col_st = T.maximum(col_st, self.pad[1])
        col_end = T.minimum(col_end, self.x_m1d + pad_w)

        Lp = T.pow(
                T.mean(T.pow(
                        T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1)),
                        1 + T.log(1 + T.exp(self.P))
                )),
                1 / (1 + T.log(1 + T.exp(self.P)))
        )

        return T.set_subtensor(z[n, k, r, c], Lp)
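A hedged NumPy illustration (not part of the class; the patch values and P are made up) of the Lp formula from the docstring. The softplus in p = 1 + ln(1 + e^P) keeps p >= 1, so the pooling interpolates between mean-of-absolute-values (P -> -inf) and max pooling (P -> +inf).

import numpy as np

patch = np.array([0.5, -1.0, 2.0], dtype=np.float32)   # one pooling window
P = 0.0
p = 1.0 + np.log1p(np.exp(P))                          # p = 1 + ln(1 + e^0) ~= 1.69
lp = np.mean(np.abs(patch) ** p) ** (1.0 / p)
print(p, lp)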
Code Example #7
File: update_func.py Project: cauchyturing/DeepMONA
 def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
     grads = T.grad(cost, params)
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1e-4
     updates = []
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - learning_rate * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * learning_rate * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     return updates
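A hedged sketch (made-up gradient values, simplified to a single vector) of the norm-capping guard used above: multiplying by scaling_num / scaling_den = rescale / max(rescale, ||g||) leaves small gradients untouched and rescales large ones down to norm `rescale`.

import numpy as np
import theano
import theano.tensor as T

g = T.vector('g')
rescale = 5.
norm = T.sqrt(T.sqr(g).sum())
clipped = g * (rescale / T.maximum(rescale, norm))
f = theano.function([g], clipped)
print(f(np.asarray([30., 40.], dtype=theano.config.floatX)))   # norm 50 -> [3. 4.], norm 5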
Code Example #8
File: tsne.py Project: RuinCakeLie/breze
def build_loss(embeddings):
    """Return a pair (loss, p) given a theano shared variable representing the
    `embeddings`.

    `loss` is a theano variable for the loss. `p` is a symbolic variable
    representing the target neighbour probabilities on which the loss depends.
    """
    # Probability that two points are neighbours in the embedding space.
    emb_dists = distance_matrix(embeddings)
    emb_top = zero_diagonal(1 / (1 + emb_dists))
    emb_bottom = emb_top.sum(axis=0)
    q = emb_top / emb_bottom

    # Incorrect normalization, which does not matter since we normalize p in
    # the same way.
    q /= q.sum()
    q = T.maximum(q, 1E-12)

    p_ji_var = T.matrix('neighbour_probabilities')
    p_ji_var.tag.test_value = np.random.random(
            (10, 10)).astype(theano.config.floatX)
    p_ji_var_floored = T.maximum(p_ji_var, 1E-12)

    # t-distributed stochastic neighbourhood embedding loss.
    loss = (p_ji_var * T.log(p_ji_var_floored / q)).sum()

    return loss, p_ji_var
Code Example #9
File: policy_ff.py Project: kyunghyuncho/gym
    def grad_init(self):
        mask_ = self.mask.flatten()
        rewards_ = self.rewards.flatten()
        actions_ = self.actions.reshape([self.actions.shape[0]*self.actions.shape[1],-1])

        #self.mov_std = theano.shared(numpy.float32(1.), 'std')

        pp = self.params.values()
        mean_rewards = (mask_ * rewards_).sum(-1, keepdims=True) / tensor.maximum(1., mask_.sum(-1, keepdims=True))
        centered_rewards = rewards_ - self.vapprox.v[:,0] - mean_rewards
        mean2_rewards = (mask_ * (rewards_ ** 2)).sum(-1, keepdims=True) / tensor.maximum(1., mask_.sum(-1, keepdims=True))
        var_rewards = mean2_rewards - (mean_rewards ** 2)
        scaled_rewards = centered_rewards  / tensor.maximum(1., tensor.sqrt(tensor.maximum(0., var_rewards)))
        #scaled_rewards = centered_rewards

        logprob = 0.
        reg = 0.
        for oi in xrange(self.n_out):
            labs = actions_[:,oi].flatten()
            labs_idx = tensor.arange(labs.shape[0]) * self.out_dim + labs
            logprob = logprob + (mask_ * tensor.log(self.pi[oi].flatten()+1e-6)[labs_idx])
            reg = reg - (self.pi[oi] * tensor.log(self.pi[oi]+1e-6)).sum(-1).sum(0)

        self.cost = -tensor.mean(scaled_rewards * logprob + self.reg_c * reg)
        self.grads = tensor.grad(self.cost, wrt=pp)
Code Example #10
File: lstmbb.py Project: bloodmage/libref
 def __init__(self, input):
     #A 3in1 maxpooling
     self.output_shape = input.output_shape[0]/2, input.output_shape[1]
     self.origlayer = input
     self.output = input.output[::2]
     self.output = T.set_subtensor(self.output[:input.output.shape[0]/2], T.maximum(self.output[:input.output.shape[0]/2], input.output[1::2]))
     self.output = T.set_subtensor(self.output[1:], T.maximum(self.output[1:], input.output[1:-1:2]))
Code Example #11
File: MLMNN.1.64.py Project: PiscesDream/Lab_MMAPM
    def _local_error(self, targetM, i):
        pull_error = 0.
        ivectors = self._x[:, i, :][self._neighborpairs[:, 0]]
        jvectors = self._x[:, i, :][self._neighborpairs[:, 1]]
        diffv = ivectors - jvectors
        pull_error = linalg.trace(diffv.dot(targetM).dot(diffv.T))

        push_error = 0.0
        ivectors = self._x[:, i, :][self._set[:, 0]]
        jvectors = self._x[:, i, :][self._set[:, 1]]
        lvectors = self._x[:, i, :][self._set[:, 2]]
        diffij = ivectors - jvectors
        diffil = ivectors - lvectors
        lossij = diffij.dot(targetM).dot(diffij.T)
        lossil = diffil.dot(targetM).dot(diffil.T)
        mask = T.neq(self._y[self._set[:, 0]], self._y[self._set[:, 2]])
        push_error = linalg.trace(mask*T.maximum(lossij - lossil + 1, 0))

        self.zerocount = T.eq(linalg.diag(mask*T.maximum(lossij - lossil + 1, 0)), 0).sum()

#       print np.sqrt((i+1.0)/self.M)
#       pull_error = pull_error * np.sqrt((i+1.0)/self.M)
#       push_error = push_error * np.sqrt((i+1.0)/self.M)

        return pull_error, push_error 
Code Example #12
    def penality(self):

        penTerms = []
        objects = self.scene_obj.shapes
        for i in xrange(len(objects)-1):

            obj1 = objects[i]
            for j in xrange(i+1, len(objects)):  # start at i+1 so an object is not paired with itself

                obj2 = objects[j]

                #Distance check between objects
                center1 = obj1.o2w.m[:,3]
                center2 = obj2.o2w.m[:,3]
                radius1 = T.maximum(obj1.o2w.m[0,0], obj1.o2w.m[1,1])
                radius2 = T.maximum(obj2.o2w.m[0,0], obj2.o2w.m[1,1])

                max_rad = T.maximum(radius1, radius2)

                #TODO remake it for batch
                dist = T.sqrt((center1[0] - center2[0])**2 + (center1[1] - center2[1])**2)
                penflag = T.switch(dist < max_rad, 1, 0)

                # accumulate one penalty term per pair instead of overwriting penTerms
                penTerms.append(penflag * (np.pi * max_rad**2)*2)

        return sum(penTerms)
Code Example #13
File: idxs_vals_rnd.py Project: gwtaylor/hyperopt
 def infer_shape(self, node, ishapes):
     mus_shape, prior_mu, prior_sigma = ishapes
     return [
         (tensor.maximum(1, mus_shape[0]),),
         (tensor.maximum(1, mus_shape[0]),),
         (tensor.maximum(1, mus_shape[0]),),
     ]
Code Example #14
File: parse.py Project: amoliu/autosub
    def getTrainingFunc2(self):
        input = T.dmatrix()
        target = T.dvector()
        learning_rate = T.scalar()
        
        y = input
        for i in xrange(0, self.n_layers-1):
            y = T.maximum(0.0, T.dot(y, self.params[i*3]) + self.params[i*3+1] )
            y = y*self.theano_rng.binomial(y.shape, 1, 0.5)
        
        y = T.maximum(0, T.dot(y, self.params[(self.n_layers-1)*3]) + self.params[(self.n_layers-1)*3+1] )
        
        y = T.squeeze(y.T)
        #y = T.dot(y, self.params[-1])
        diff = y - target
        #regulator = theano.printing.Print('norm:')(T.sum(abs(y))*alpha)
        #L = theano.printing.Print('L:')(T.sum(diff*diff) + regulator)
        L = T.sum(diff*diff) #- target*T.log(y) - (1-target)*T.log(1-y)
        
        gparam = T.grad(L, [ self.params[i] for i in xrange(len(self.params)) if i%3 != 2 ])

        updates = {}
        for i,p,g,m in zip(xrange(len(gparam)),[ self.params[i] for i in xrange(len(self.params)) if i%3 != 2 ], gparam, [ self.moments[i] for i in xrange(len(self.moments)) if i%3 != 2 ]):
            if i%2 == 0:
                updates[m] = 0.9*m - learning_rate*0.0005*p - learning_rate*g        
            else:
                updates[m] = 0.9*m - learning_rate*g
            updates[p] = p + m

        train_func = theano.function( inputs = [input, target, learning_rate], outputs=[L,y], updates= updates)
        return train_func
Code Example #15
File: NetworkOutputLayer.py Project: chagge/returnn
  def cost(self):
    known_grads = None
    xd = self.z.reshape((self.z.shape[0]*self.z.shape[1],self.z.shape[2]))
    epsilon = numpy.float32(1e-10)
    # cross-entropy
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=xd[self.i], y_idx=self.y_data_flat[self.i])
    ce = T.sum(nll)
    # entropy
    def entropy(p, axis=None):
      if self.use_max and axis is not None:
        q = p.dimshuffle(axis, *(range(axis) + range(axis+1,p.ndim)))
        #return -T.mean(T.log(T.maximum(T.max(q,axis=0),epsilon)))
        return -T.mean(T.max(q,axis=0)+epsilon) + T.log(T.cast(p.shape[axis],'float32'))
      else:
        return -T.mean(p*T.log(p+epsilon)) + T.log(T.cast(p.shape[axis],'float32'))
    ez = T.exp(self.z) * T.cast(self.index.dimshuffle(0,1,'x').repeat(self.z.shape[2],axis=2), 'float32')
    et = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=0,keepdims=True)),axis=0)
    eb = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=1,keepdims=True)),axis=1)
    ed = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=2,keepdims=True)),axis=2)
    # maximize entropy across T and B and minimize entropy across D
    e = self.e_d * ed - (self.e_t * et + self.e_b * eb) / numpy.float32(self.e_t + self.e_b)

    import theano.ifelse
    if self.train_flag:
      return theano.ifelse.ifelse(T.cast(self.xflag,'int8'),e,ce), known_grads
    else:
      return ce, known_grads
Code Example #16
File: ConvLayer.py Project: shuuki4/2015-2-ML
	def __init__(self, input, input_shape, filter_shape, border_mode="valid") :
		# input : theano symbolic variable of input, 4D tensor
		# input_shape : shape of input / (minibatch size, input channel num, image height, image width)
		# filter_shape : shape of filter / (# of new channels to make, input channel num, filter height, filter width)

		# initialize W (weight) randomly
		rng = np.random.RandomState(int(time.time()))
		w_bound = math.sqrt(filter_shape[1] * filter_shape[2] * filter_shape[3])
		self.W1 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape), dtype=theano.config.floatX), name='W', borrow=True)
		self.W2 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape), dtype=theano.config.floatX), name='W', borrow=True)
		self.W3 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape), dtype=theano.config.floatX), name='W', borrow=True)
		
		# initialize b (bias) with zeros
		self.b1 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)
		self.b2 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)
		self.b3 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)

		# convolution & sigmoid calculation
		#self.conv_out = conv.conv2d(input, self.W, image_shape=input_shape, filter_shape=filter_shape)
		#self.output = 1.7159*T.tanh((self.conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))*(2.0/3.0))

		# maxout : 3
		out1 = conv.conv2d(input, self.W1, image_shape=input_shape, filter_shape=filter_shape, border_mode=border_mode) + self.b1.dimshuffle('x', 0, 'x', 'x')
		out2 = conv.conv2d(input, self.W2, image_shape=input_shape, filter_shape=filter_shape, border_mode=border_mode) + self.b2.dimshuffle('x', 0, 'x', 'x')
		out3 = conv.conv2d(input, self.W3, image_shape=input_shape, filter_shape=filter_shape, border_mode=border_mode) + self.b3.dimshuffle('x', 0, 'x', 'x')

		self.output = T.maximum(out1, T.maximum(out2, out3))

		# save parameters of this layer for back-prop convenience
		self.params = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]
		insize = input_shape[1] * input_shape[2] * input_shape[3]
		self.paramins = [insize, insize, insize, insize, insize, insize]
Code Example #17
def convolutional_model(X, w_1, w_2, w_3, w_4, w_5, w_6, p_1, p_2, p_3, p_4, p_5):
    l1 = dropout(T.tanh( max_pool_2d(T.maximum(conv2d(X, w_1, border_mode='full'),0.), (2, 2),ignore_border=True) + b_1.dimshuffle('x', 0, 'x', 'x') ), p_1)
    l2 = dropout(T.tanh( max_pool_2d(T.maximum(conv2d(l1, w_2), 0.), (2, 2),ignore_border=True) + b_2.dimshuffle('x', 0, 'x', 'x') ), p_2)
    l3 = dropout(T.flatten(T.tanh( max_pool_2d(T.maximum(conv2d(l2, w_3), 0.), (2, 2),ignore_border=True) + b_3.dimshuffle('x', 0, 'x', 'x') ), outdim=2), p_3)# flatten to switch back to 1d layers
    l4 = dropout(T.maximum(T.dot(l3, w_4), 0.), p_4)
    l5 = dropout(T.maximum(T.dot(l4, w_5), 0.), p_5)
    return T.dot(l5, w_6)
Code Example #18
File: objectives.py Project: CVML/CRCN
 def iter_j(in_matrix, j_out_matrix,out_matrix,k_img_matrix):
     j_out_len=j_out_matrix.shape[0]
     jentity=j_out_matrix[j_out_len-2:]
     j_out_matrix=j_out_matrix[:j_out_len-2]
     score_img_seq=T.maximum(0,seq_score(out_matrix,in_matrix,entity)+1- seq_score(out_matrix,k_img_matrix,entity))
     score_sent_seq=T.maximum(0,seq_score(j_out_matrix,k_img_matrix,jentity)+1- seq_score(out_matrix,k_img_matrix,entity))
     return score_img_seq+score_sent_seq
Code Example #19
File: loss.py Project: RuinCakeLie/breze
    def inner(target, embedding):
        """Return a theano expression of a vector containing the sample wise
        loss of drlim.

        The push_margin, pull_margin and contrastive coefficient used
        are %.f, %.f and %.f respectively.

        Parameters
        ----------

        target : array_like
            A vector of length `n`. If 1, sample `2 * n` and sample
            `2 * n + 1` are deemed similar.

        embedding : array_like
            Array containing the embeddings of samples row wise.
        """ % (push_margin, pull_margin, c_contrastive)
        target = target[:, 0]
        n_pair = embedding.shape[0] // 2
        n_feature = embedding.shape[1]

        # Reshape array to get pairs.
        embedding = embedding.reshape((n_pair, n_feature * 2))

        # Calculate distances of pairs.
        diff = (embedding[:, :n_feature] - embedding[:, n_feature:])
        dist = T.sqrt((diff ** 2).sum(axis=1) + 1e-8)

        pull = target * f_pull_loss(T.maximum(0, dist - pull_margin))
        push = (1 - target) * f_push_loss(T.maximum(0, push_margin - dist))

        loss = pull + c_contrastive * push
        return loss.dimshuffle(0, 'x')
Code Example #20
File: main.py Project: ageek/theano-nets
    def _build_activation(self, act=None):
        '''Given an activation description, return a callable that implements it.
        '''
        def compose(a, b):
            c = lambda z: b(a(z))
            c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__, a.__theanets_name__)
            return c
        act = act or self.args.activation.lower()
        if '+' in act:
            return reduce(compose, (self._build_activation(a) for a in act.split('+')))
        options = {
            'tanh': TT.tanh,
            'linear': lambda z: z,
            'logistic': TT.nnet.sigmoid,
            'softplus': TT.nnet.softplus,

            # shorthands
            'relu': lambda z: TT.maximum(0, z),

            # modifiers
            'rect:max': lambda z: TT.minimum(1, z),
            'rect:min': lambda z: TT.maximum(0, z),

            # normalization
            'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
            'norm:max': lambda z: (z.T / TT.maximum(1e-10, abs(z).max(axis=1))).T,
            'norm:std': lambda z: (z.T / TT.maximum(1e-10, TT.std(z, axis=1))).T,
            }
        for k, v in options.iteritems():
            v.__theanets_name__ = k
        try:
            return options[act]
        except:
            raise KeyError('unknown --activation %s' % act)
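A hedged standalone sketch (the two lambdas are copied from the options table above; the chain is what reduce(compose, ...) would build for the activation string 'relu+norm:max'):

import numpy as np
import theano
import theano.tensor as TT

relu = lambda z: TT.maximum(0, z)
norm_max = lambda z: (z.T / TT.maximum(1e-10, abs(z).max(axis=1))).T
composed = lambda z: norm_max(relu(z))   # compose(relu, norm_max)

z = TT.matrix('z')
f = theano.function([z], composed(z))
print(f(np.asarray([[-1., 2., 4.]], dtype=theano.config.floatX)))   # -> [[0.  0.5 1. ]]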
Code Example #21
def discriminator(x, z, params, mb_size, num_hidden, num_latent):

    x_z = T.concatenate([x,z], axis = 1)


    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units = num_hidden, nonlinearity=None, W = params['W_disc_1'])

    h_out_2 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_2'])

    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_3'])

    h_out_4 = DenseLayer((mb_size, 1), num_units = 1, nonlinearity=None, W = params['W_disc_4'], b = params['b_disc_4'])

    h_out_1_value = h_out_1.get_output_for(x_z)

    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis = 0)) / (1.0 + T.std(h_out_1_value, axis = 0)) + params['b_disc_1'])

    h_out_2_value = h_out_2.get_output_for(h_out_1_value)

    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis = 0)) / (1.0 + T.std(h_out_2_value, axis = 0)) + params['b_disc_2'])

    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis = 0)) / (1.0 + T.std(h_out_3_value, axis = 0)) + params['b_disc_3'])

    h_out_4_value = h_out_4.get_output_for(h_out_3_value)

    raw_y = h_out_4_value

    classification = T.nnet.sigmoid(raw_y)

    results = {'c' : classification}

    return results
Code Example #22
def crop_attention_bilinear(bbox, frame):
	att = bbox
	frame_col = img_col
	frame_row = img_row

	_cx = (att[1] + att[3]) / 2; cx = (_cx + 1) / 2. * frame_col
	_cy = (att[0] + att[2]) / 2; cy = (_cy + 1) / 2. * frame_row
	_w = TT.abs_(att[3] - att[1]) / 2; w = _w * frame_col
	_h = TT.abs_(att[2] - att[0]) / 2; h = _h * frame_row

	dx = w / (att_col - 1)
	dy = h / (att_row - 1)

	mx = cx + dx * (TT.arange(att_col, dtype=T.config.floatX) - (att_col - 1) / 2.)
	my = cy + dy * (TT.arange(att_row, dtype=T.config.floatX) - (att_row - 1) / 2.)

	a = TT.arange(frame_col, dtype=T.config.floatX)
	b = TT.arange(frame_row, dtype=T.config.floatX)

	ax = TT.maximum(0, 1 - TT.abs_(a.dimshuffle(0, 'x') - mx.dimshuffle('x', 0)))
	by = TT.maximum(0, 1 - TT.abs_(b.dimshuffle(0, 'x') - my.dimshuffle('x', 0)))

	bilin = TT.dot(by.T, TT.dot(frame, ax))

	return bilin
Code Example #23
    def getOutputs(self, previousMemory, input_layer): 

        print "prev memory dim", previousMemory.ndim
        print "input layer dim", input_layer.ndim
        assert(previousMemory.ndim == input_layer.ndim)

        if previousMemory.ndim == 1: 
            axisConcat = 0
        else:
            axisConcat = 1

        controller_0 = T.maximum(0.0, T.dot(input_layer, self.params["W_controller_0"]) + self.params["b_controller_0"])    

        controller_1 = T.maximum(0.0, T.dot(T.concatenate([controller_0, input_layer], axis = axisConcat), self.params["W_controller_1"]) + self.params["b_controller_1"])

        controller = T.maximum(0.0, T.dot(T.concatenate([controller_1, input_layer], axis = axisConcat), self.params["W_controller"]) + self.params["b_controller"])

        #Have multiple layers in controller?  This determines what gets passed in / out from the network.  

        if self.useReluReadGate: 
            readgate = T.maximum(0.0, (T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readgate"].T) + self.params["b_readgate"]))
        else:
            readgate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readgate"].T) + self.params["b_readgate"])

        readdelta = T.tanh(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readdelta"].T) + self.params["b_readdelta"])

        keepgate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_keepgate"].T) + self.params["b_keepgate"])

        memory = previousMemory * keepgate + readgate * readdelta

        writegate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_writegate"].T) + self.params["b_writegate"])

        output = writegate * T.maximum(0.0, T.dot(T.concatenate([controller, 0.0 * memory, 1.0 * input_layer], axis = axisConcat), self.params["W_output"].T) + self.params["b_output"])

        return memory, output
Code Example #24
 def minimize(self, loss, momentum, rescale):
     super(RMSPropOptimizer, self).minimize(loss)
     grads = self.gradparams
     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
     grad_norm = T.sqrt(grad_norm)
     scaling_num = rescale
     scaling_den = T.maximum(rescale, grad_norm)
     # Magic constants
     combination_coeff = 0.9
     minimum_grad = 1E-4
     updates = []
     params = self.params
     for n, (param, grad) in enumerate(zip(params, grads)):
         grad = T.switch(not_finite, 0.1 * param,
                         grad * (scaling_num / scaling_den))
         old_square = self.running_square_[n]
         new_square = combination_coeff * old_square + (
             1. - combination_coeff) * T.sqr(grad)
         old_avg = self.running_avg_[n]
         new_avg = combination_coeff * old_avg + (
             1. - combination_coeff) * grad
         rms_grad = T.sqrt(new_square - new_avg ** 2)
         rms_grad = T.maximum(rms_grad, minimum_grad)
         memory = self.memory_[n]
         update = momentum * memory - self.lr * grad / rms_grad
         update2 = momentum * momentum * memory - (
             1 + momentum) * self.lr * grad / rms_grad
         updates.append((old_square, new_square))
         updates.append((old_avg, new_avg))
         updates.append((memory, update))
         updates.append((param, param + update2))
     
     return updates
Code Example #25
def advanced_indexing(volume, *indices_list, **kwargs):
    """ Performs advanced indexing on `volume`.

    This function exists because in Theano<=0.9 advanced indexing is
    only supported along the first dimension.

    Notes
    -----
    Assuming `volume` is C contiguous.
    """
    strides = kwargs.get("strides")
    if strides is None:
        shapes = T.cast(volume.shape[:len(indices_list)], dtype=theano.config.floatX)
        strides = T.concatenate([T.ones((1,)), T.cumprod(shapes[::-1])[:-1]], axis=0)[::-1]

    shapes = T.cast(volume.shape, dtype=theano.config.floatX)

    indices = T.maximum(0, T.minimum(indices_list[-1], shapes[len(indices_list)-1]-1))
    for i in range(len(indices_list)-1):
        clipped_idx = T.maximum(0, T.minimum(indices_list[i], shapes[i]-1))
        indices += clipped_idx * strides[i]

    # indices = T.sum(T.stack(indices_list, axis=1)*strides[:len(indices_list)], axis=1)
    indices = T.cast(indices, dtype="int32")
    return volume.reshape((-1, volume.shape[-1]))[indices]
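A hedged usage sketch (made-up shapes and indices) gathering volume[i, j, :] for a batch of (i, j) pairs with the helper above:

import numpy as np
import theano
import theano.tensor as T

vol = T.tensor3('volume')                        # (A, B, C), assumed C contiguous
rows = T.ivector('rows')
cols = T.ivector('cols')
gathered = advanced_indexing(vol, rows, cols)    # shape (len(rows), C)

f = theano.function([vol, rows, cols], gathered)
v = np.arange(2 * 3 * 4, dtype=theano.config.floatX).reshape(2, 3, 4)
print(f(v, np.array([0, 1], dtype='int32'), np.array([2, 0], dtype='int32')))
# rows of the result are v[0, 2, :] and v[1, 0, :]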
Code Example #26
File: slm.py Project: yamins81/thoreano
    def init_lpool(self, x, x_shp,
            ker_shape=(3, 3),
            order=1,
            stride=1,
            mode='valid'):

        if hasattr(order, '__iter__'):
            o1 = (order == 1).all()
            o2 = (order == order.astype(np.int)).all()
        else:
            o1 = order == 1
            o2 = (order == int(order))

        if o1:
            r, r_shp = self.boxconv(x, x_shp, ker_shape)
        elif o2:
            r, r_shp = self.boxconv(x ** order, x_shp, ker_shape)
            r = tensor.maximum(r, 0) ** (1.0 / order)
        else:
            r, r_shp = self.boxconv(abs(x) ** order, x_shp, ker_shape)
            r = tensor.maximum(r, 0) ** (1.0 / order)

        if stride > 1:
            r = r[:, :, ::stride, ::stride]
            # intdiv is tricky... so just use numpy
            r_shp = np.empty(r_shp)[:, :, ::stride, ::stride].shape
        return r, r_shp
Code Example #27
File: irbm.py Project: MarcCote/iRBM
    def get_updates(self, v):
        # Contrastive divergence
        chain_end, updates_CD = self.CD(self, chain_start=v, cdk=self.CDk)

        # [Expected] negative log-likelihood
        cost = T.mean(self.free_energy(v), axis=0) - T.mean(self.free_energy(chain_end), axis=0)

        # L2 Regularization
        if isinstance(self.regularize, L2Regularization):
            cost += self.regularization

        # Gradients (use automatic differentiation)
        # We must not compute the gradient through the gibbs sampling, i.e. use consider_constant
        gparams = T.grad(cost, self.parameters, consider_constant=[chain_end])
        gradients = dict(zip(self.parameters, gparams))

        # Get learning rates for all params given their gradient.
        lr, updates_lr = self.learning_rate(gradients)

        updates = OrderedDict()
        updates.update(updates_CD)  # Add updates from CD
        updates.update(updates_lr)  # Add updates from learning_rate

        # Updates parameters
        for param, gparam in gradients.items():
            updates[param] = param - lr[param] * gradients[param]

        if isinstance(self.regularize, L1Regularization):
            updates[self.b] = T.sgn(updates[self.b]) * T.maximum(abs(updates[self.b]) - lr[self.b]*self.regularize.decay, 0)
            updates[self.W] = T.sgn(updates[self.W]) * T.maximum(abs(updates[self.W]) - lr[self.W]*self.regularize.decay, 0)

        return updates
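A hedged NumPy illustration (not from iRBM; the weights and step are made up) of the L1 soft-thresholding rule in the last two updates: each weight is shrunk toward zero by lr * decay and clamped at zero once it would cross it.

import numpy as np

w = np.array([0.30, -0.05, 0.01], dtype=np.float32)
step = 0.1                                            # stands in for lr[param] * regularize.decay
print(np.sign(w) * np.maximum(np.abs(w) - step, 0))   # -> [ 0.2 -0.   0. ]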
Code Example #28
    def setup(self, bottom, top):
        from caffe_helper.theano_util import init_theano
        init_theano()

        import theano as tn
        import theano.tensor as T
        assert len(bottom) == 2
        assert len(top) == 1
        s_y = T.matrix('y')  # y in [-inf, inf]
        s_t = T.matrix('t')  # t in {-1, 0, 1} where 0 is ignored
        s_dloss = T.scalar('dloss')
        # Forward
        # s_loss = T.mean(abs(s_t) * T.log1p(T.exp(-s_y * s_t)))  # unstable
        s_loss = -T.sum(
            abs(s_t) * (
                s_y * ((s_t >= 0) - (s_y >= 0)) - T.log1p(T.exp(-abs(s_y)))))\
            / T.maximum(T.sum(abs(s_t)), 1)
        # Backward
        s_p = 1 / (1 + T.exp(-s_y))
        s_dy = s_dloss * abs(s_t) * (s_p - (s_t >= 0)) / \
            T.maximum(T.sum(abs(s_t)), 1)

        def _o(s):
            return tn.Out(s, borrow=True)
        self.tn_forward = tn.function([s_y, s_t], s_loss)
        self.tn_backward = tn.function([s_y, s_t, s_dloss], _o(s_dy))
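A hedged NumPy check (made-up scores and targets, not part of the layer) that the stabilised forward expression matches the naive |t| * log(1 + exp(-y*t)) term element-wise:

import numpy as np

y = np.array([3.0, -2.0, 0.5, 40.0])
t = np.array([1.0,  1.0, -1.0, 0.0])     # entries with t == 0 are masked out by the |t| factor
naive = np.abs(t) * np.log1p(np.exp(-y * t))
stable = -np.abs(t) * (y * ((t >= 0).astype(float) - (y >= 0).astype(float))
                       - np.log1p(np.exp(-np.abs(y))))
print(np.allclose(naive, stable))        # -> True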
Code Example #29
 def call(self, X):
     if type(X) is not list or len(X) != 2:
         raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))
         
     frame, position  = X[0], X[1]
     
     # Reshaping the input to exclude the time dimension
     frameShape = K.shape(frame)
     positionShape = K.shape(position)
     (chans, height, width) = frameShape[-3:]
     targetDim = positionShape[-1]
     frame = K.reshape(frame, (-1, chans, height, width))
     position = K.reshape(position, (-1, ) + (targetDim, ))
     
     # Applying the attention
     hw = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
     hh = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0
     position = THT.maximum(THT.set_subtensor(position[:, 0], position[:, 0] - hw), -1.0)
     position = THT.minimum(THT.set_subtensor(position[:, 2], position[:, 2] + hw), 1.0)
     position = THT.maximum(THT.set_subtensor(position[:, 1], position[:, 1] - hh), -1.0)
     position = THT.minimum(THT.set_subtensor(position[:, 3], position[:, 3] + hh), 1.0)
     rX = Data.linspace(-1.0, 1.0, width)
     rY = Data.linspace(-1.0, 1.0, height)
     FX = THT.gt(rX, position[:,0].dimshuffle(0,'x')) * THT.le(rX, position[:,2].dimshuffle(0,'x'))
     FY = THT.gt(rY, position[:,1].dimshuffle(0,'x')) * THT.le(rY, position[:,3].dimshuffle(0,'x'))
     m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
     m = m + self.alpha - THT.gt(m, 0.) * self.alpha
     frame = frame * m.dimshuffle(0, 'x', 1, 2)
     
     # Reshaping the frame to include time dimension
     output = K.reshape(frame, frameShape)
     
     return output
Code Example #30
File: TheanoUtil.py Project: atuxhe/returnn
def tiled_eye(n1, n2, dtype="float32"):
  r1 = T.maximum((n1 - 1) / n2 + 1, 1)
  r2 = T.maximum((n2 - 1) / n1 + 1, 1)
  small_eye = T.eye(T.minimum(n1, n2), dtype=dtype)
  tiled_big = T.tile(small_eye, (r1, r2))
  tiled_part = tiled_big[:n1,:n2]
  return tiled_part
Code Example #31
 def dist_info_sym(self, obs_var, state_info_vars=None):
     mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std],
                                          obs_var)
     if self.min_std is not None:
         log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
     return dict(mean=mean_var, log_std=log_std_var)
Code Example #32
left_features = U.dot(x_u.T)
right_features = V.dot(x_v.T)

prediction = T.diagonal(T.dot(left_features.T, right_features))

# LOSS FUNCTIONS
# squared loss
loss_squared = T.mean((y - prediction)**2)
# logistic loss (0/1 classiifcation)
prob_prediction = T.nnet.sigmoid(prediction)
loss_log = -T.mean(y * T.log(prob_prediction) +
                   (1 - y) * T.log(1 - prob_prediction))
# e-insensitive loss
epsilon = 0.1
loss_e_insens = T.mean(
    T.maximum(prediction - y - epsilon, T.maximum(0,
                                                  y - prediction - epsilon)))
# hinge loss (-1/1 classification)
loss_hinge = T.mean(T.maximum(-prediction * y + epsilon, 0))

# PENALTIES
cost = loss_log + lambda_u * T.mean(U**2) + lambda_v * T.mean(V**2)
gU, gV = T.grad(cost, [U, V])

learning_rate = 1e-7
momentum_factor = 0.9

train = theano.function(inputs=[y, x_u, x_v],
                        outputs=cost,
                        updates=((U, U + delta_U), (V, V + delta_V),
                                 (delta_U, momentum_factor * delta_U -
                                  (1 - momentum_factor) * learning_rate * gU),
                                 (delta_V, momentum_factor * delta_V -
                                  (1 - momentum_factor) * learning_rate * gV)))
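A hedged NumPy check (made-up predictions) of the epsilon-insensitive loss defined above: a sample contributes nothing as long as |prediction - y| <= epsilon.

import numpy as np

y_true = np.array([1.0, 2.0, 3.0])
pred = np.array([1.05, 2.5, 2.0])
eps = 0.1
per_sample = np.maximum(pred - y_true - eps, np.maximum(0, y_true - pred - eps))
print(per_sample, per_sample.mean())     # -> [0.  0.4 0.9] 0.433...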
Code Example #33
File: cifar10_GXNOR.py Project: zdcuob/GXNOR-Net
                discrete=discrete,
                H=H,
		N=N,
                nonlinearity=lasagne.nonlinearities.identity,  #identity
                num_units=10)   
                      
    cnn = lasagne.layers.BatchNormLayer(
            cnn,
            epsilon=epsilon, 
            alpha=alpha)
    
    train_output = lasagne.layers.get_output(cnn, deterministic=False)
    best_params = lasagne.layers.get_all_params(cnn, discrete=True)
    
    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
	
    if discrete:         
        updates = discrete_grads(loss,cnn,LR)
        params = lasagne.layers.get_all_params(cnn, trainable=True, discrete=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())
        
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)


    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
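A hedged NumPy check (made-up outputs) of the squared hinge loss used for both the train and test objectives above, with targets encoded as -1/+1:

import numpy as np

target = np.array([+1., -1., +1.])
output = np.array([2.0, -0.5, 0.2])
print(np.mean(np.square(np.maximum(0., 1. - target * output))))   # mean of [0, 0.25, 0.64] ~= 0.297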
     
Code Example #34
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        npz_path=None,
        freeze_lst=None,
        reinit_lst=None,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        :return:
        """
        Serializable.quick_init(self, locals())
        # reinit_lst = None
        assert isinstance(env_spec.action_space, Box)
        if init_std is None:
            init_std = 1.0
            set_std_params = False
        else:
            set_std_params = True

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        if mean_network is None:
            mean_network = MLP(
                input_shape=(obs_dim, ),
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )
        self._mean_network = mean_network
        self._layers_mean = mean_network.layers

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if std_network is not None:
            l_log_std = std_network.output_layer
        else:
            if adaptive_std:
                std_network = MLP(
                    input_shape=(obs_dim, ),
                    input_layer=mean_network.input_layer,
                    output_dim=action_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=None,
                )
                l_log_std = std_network.output_layer
                self._layers_log_std = std_network.layers
            else:
                l_log_std = ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=lasagne.init.Constant(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )
                self._layers_log_std = [l_log_std]

        self._layers = self._layers_mean + self._layers_log_std

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = dist_cls(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        if npz_path is not None:
            param_dict = dict(
                np.load(os.path.join(config.PROJECT_PATH, npz_path)))
            param_values = param_dict['params']
            # todo: don't forget about this
            if set_std_params:
                self.set_param_values(param_values)
            else:
                self.set_param_values_transfer(param_values)

        if freeze_lst is not None:
            assert len(freeze_lst) == len(self._layers) - 1
            for layer, should_freeze in zip(self._layers[1:], freeze_lst):
                if should_freeze:
                    for param, tags in layer.params.items():
                        tags.remove("trainable")
        if reinit_lst is not None:
            assert len(reinit_lst) == len(
                self._layers) - 1  # since input layer is counted
            for layer, should_reinit in zip(self._layers[1:], reinit_lst):
                if should_reinit:
                    print("reinitialized")
                    for v in layer.params:
                        val = v.get_value()
                        if (len(val.shape) < 2):
                            v.set_value(lasagne.init.Constant(0.0)(val.shape))
                        else:
                            v.set_value(lasagne.init.GlorotUniform()(
                                val.shape))
                else:
                    print("did not reinit")
Code Example #35
File: train.py Project: lpigou/chalearn2014
def std_norm(_x, axis=[-3, -2, -1]):
    return _x / T.maximum(1e-4, T.std(_x, axis=axis, keepdims=True))
Code Example #36
File: train.py Project: lpigou/chalearn2014
def var_norm(_x, imgs=True, axis=[-3, -2, -1]):
    if imgs:
        return (_x - T.mean(_x, axis=axis, keepdims=True)) / T.maximum(
            1e-4, T.std(_x, axis=axis, keepdims=True))
    return (_x - T.mean(_x)) / T.maximum(1e-4, T.std(_x))
Code Example #37
def ReLU(x):
    return T.maximum(0.0, x)
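A hedged usage sketch compiling the one-liner above into a callable:

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
relu = theano.function([x], ReLU(x))
print(relu(np.array([-1.5, 0.0, 2.0], dtype=theano.config.floatX)))   # -> [0. 0. 2.]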
Code Example #38
File: tutorial2.py Project: linyihan2013/DataMining
def rectify(X):
    return T.maximum(X, 0.)
Code Example #39
def run_training(monitor_filename=None,
                 random_seed=config.random_seed,
                 coeff_embed=config.coeff_embed):
    # For multi run seeds, close current log files.
    for tap in [stdout_tap, stderr_tap]:
        if tap.file is not None:
            tap.file.close()
            tap.file = None

    # Sanity check network type.

    if config.network_type not in ['pi', 'tempens']:
        print("Unknown network type '%s'." % config.network_type)
        exit()

    np.random.seed(random_seed)
    # Create the result directory and basic run data.
    run_desc = config.run_desc + (
        '_%s%s_%04d_embed%.1f' %
        (config.dataset_str, config.num_labels_str, random_seed, coeff_embed))

    result_subdir = report.create_result_subdir(config.result_dir, run_desc)
    print("Saving results to", result_subdir)

    # Start dumping stdout and stderr into result directory.

    stdout_tap.set_file(open(os.path.join(result_subdir, 'stdout.txt'), 'wt'))
    stderr_tap.set_file(open(os.path.join(result_subdir, 'stderr.txt'), 'wt'))

    # Set window title if on Windows.

    try:
        import ctypes
        ctypes.windll.kernel32.SetConsoleTitleA(
            '%s - Gpu %d' %
            (os.path.split(result_subdir)[1], config.cuda_device_number))
    except:
        pass

    # Export run information.

    report.export_sources(os.path.join(result_subdir, 'src'))
    report.export_run_details(os.path.join(result_subdir, 'run.txt'))
    report.export_config(os.path.join(result_subdir, 'config.txt'))

    # Load the dataset.

    print("Loading dataset '%s'..." % config.dataset)

    if config.dataset == 'cifar-10':
        X_train, y_train, X_test, y_test = load_cifar_10()
    elif config.dataset == 'cifar-100':
        X_train, y_train, X_test, y_test = load_cifar_100()
    elif config.dataset == 'svhn':
        X_train, y_train, X_test, y_test = load_svhn()
    elif config.dataset == 'mnist':
        X_train, y_train, X_test, y_test = load_mnist_realval()
    else:
        print("Unknown dataset '%s'." % config.dataset)
        exit()

    # Calculate number of classes.

    num_classes = len(set(y_train))
    assert (set(y_train) == set(y_test) == set(range(num_classes))
            )  # Check that all labels are in range [0, num_classes-1]
    print("Found %d classes in training set, %d in test set." %
          (len(set(y_train)), len(set(y_test))))

    # Prepare dataset and print stats.

    X_train, y_train, mask_train, X_test, y_test = prepare_dataset(
        result_subdir, X_train, y_train, X_test, y_test, num_classes)
    print("Got %d training inputs, out of which %d are labeled." %
          (len(X_train), sum(mask_train)))
    print("Got %d test inputs." % len(X_test))
    print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # ----------------------------------------------------------------------------
    # Prepare to train.
    # ----------------------------------------------------------------------------

    print("Network type is '%s'." % config.network_type)

    # Prepare Theano variables for inputs and targets

    input_var = T.tensor4('inputs')
    label_var = T.ivector('labels')
    learning_rate_var = T.scalar('learning_rate')
    adam_beta1_var = T.scalar('adam_beta1')
    input_vars = [input_var]

    scaled_unsup_weight_max = config.unsup_weight_max
    if config.num_labels != 'all':
        scaled_unsup_weight_max *= 1.0 * config.num_labels / X_train.shape[0]

    if config.network_type == 'pi':
        input_b_var = T.tensor4('inputs_b')
        mask_var = T.vector('mask')
        unsup_weight_var = T.scalar('unsup_weight')
        input_vars.append(input_b_var)
    elif config.network_type == 'tempens':
        mask_var = T.vector('mask')
        target_var = T.matrix('targets')
        unsup_weight_var = T.scalar('unsup_weight')

    # Load/create the network.

    if config.load_network_filename is not None:
        net, net_em, input_var = load_network(config.load_network_filename)
        input_vars = [input_var]
        if config.network_type == 'pi':
            input_vars.append(input_b_var)
    else:
        print("Building network and compiling functions...")
        net, net_em = build_network(input_var, X_train.shape[1], num_classes)

    # Export topology report.

    with open(os.path.join(result_subdir, 'network-topology.txt'),
              'wt') as fout:
        for line in report.generate_network_topology_info(net):
            print(line)
            fout.write(line + '\n')

    # Initialization updates and function.

    ll.get_output(net, init=True)
    init_updates = [
        u for l in ll.get_all_layers(net)
        for u in getattr(l, 'init_updates', [])
    ]
    init_fn = theano.function(input_vars, [],
                              updates=init_updates,
                              on_unused_input='ignore')

    # Get training predictions, BN updates.

    train_prediction, train_embedding = ll.get_output([net, net_em])
    # train_embedding = ll.get_output(net_em)
    if config.network_type == 'pi':
        train_prediction_b = ll.get_output(
            net, inputs=input_b_var)  # Second branch.
    bn_updates = [
        u for l in ll.get_all_layers(net)
        for u in getattr(l, 'bn_updates', [])
    ]

    # Training loss.
    train_loss = T.mean(categorical_crossentropy(train_prediction, label_var) *
                        mask_var,
                        dtype=theano.config.floatX,
                        acc_dtype=theano.config.floatX)

    if config.network_type == 'pi':
        if config.consis:
            train_loss += unsup_weight_var * T.mean(
                squared_error(train_prediction, train_prediction_b),
                dtype=theano.config.floatX,
                acc_dtype=theano.config.floatX)
        target_hard = T.argmax(train_prediction_b, axis=1)

    elif config.network_type == 'tempens':
        if config.consis:
            train_loss += unsup_weight_var * T.mean(
                squared_error(train_prediction, target_var),
                dtype=theano.config.floatX,
                acc_dtype=theano.config.floatX)
        target_hard = T.argmax(target_var, axis=1)

    if config.merge is True:
        merged_tar = mask_var * \
                     T.cast(label_var, dtype=theano.config.floatX) \
                     + (1. - mask_var) * \
                     T.cast(target_hard, dtype=theano.config.floatX)
    else:
        merged_tar = target_hard

    emb_eucd2 = T.mean(squared_error(
        train_embedding[:config.minibatch_size // 2],
        train_embedding[config.minibatch_size // 2:]),
                       axis=1)
    neighbor_var = T.eq(merged_tar[:config.minibatch_size // 2],
                        merged_tar[config.minibatch_size // 2:])

    emb_eucd = T.sqrt(emb_eucd2)
    margin = T.constant(config.margin,
                        dtype=theano.config.floatX,
                        name='margin')
    neighbor_var = T.cast(neighbor_var, dtype=theano.config.floatX)
    pos = neighbor_var * emb_eucd2
    neg = (1. - neighbor_var) * T.square(T.maximum(margin - emb_eucd, 0))
    emb_loss = T.mean(pos + neg)
    train_loss += unsup_weight_var * emb_loss * coeff_embed

    # Entropy minimization
    if config.coeff_entropy:
        train_loss += config.coeff_entropy * unsup_weight_var * T.mean(
            lasagne.objectives.categorical_crossentropy(
                train_prediction, train_prediction))

    # ADAM update expressions for training.
    params = ll.get_all_params(net, trainable=True)
    updates = robust_adam(train_loss,
                          params,
                          learning_rate=learning_rate_var,
                          beta1=adam_beta1_var,
                          beta2=config.adam_beta2,
                          epsilon=config.adam_epsilon).items()

    # EMA
    param_avg = [
        theano.shared(np.cast[theano.config.floatX](0. * p.get_value()))
        for p in params
    ]
    avg_updates = [(a, a + config.ema_decay * (p - a))
                   for p, a in zip(params, param_avg)]
    avg_givens = [(p, a) for p, a in zip(params, param_avg)]

    # Training function.

    if config.network_type == 'pi':
        train_fn = theano_utils.function([
            input_var, input_b_var, label_var, mask_var, learning_rate_var,
            adam_beta1_var, unsup_weight_var
        ], [train_loss],
                                         updates=updates + bn_updates +
                                         avg_updates,
                                         on_unused_input='warn')
    elif config.network_type == 'tempens':
        train_fn = theano_utils.function([
            input_var, label_var, mask_var, target_var, learning_rate_var,
            adam_beta1_var, unsup_weight_var
        ], [train_loss, train_prediction],
                                         updates=updates + bn_updates +
                                         avg_updates,
                                         on_unused_input='warn')

    # Validation prediction, loss, and accuracy.

    test_prediction = ll.get_output(net, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, label_var),
                       dtype=theano.config.floatX,
                       acc_dtype=theano.config.floatX)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), label_var),
                      dtype=theano.config.floatX,
                      acc_dtype=theano.config.floatX)

    # EMA output function.
    # ema_fn = theano_utils.function([input_var], [test_prediction], on_unused_input='warn', givens=avg_givens)

    # Validation function.

    val_fn = theano_utils.function([input_var, label_var],
                                   [test_loss, test_acc],
                                   on_unused_input='warn',
                                   givens=avg_givens)

    # ----------------------------------------------------------------------------
    # Start training.
    # ----------------------------------------------------------------------------

    print("Starting training.")

    if config.max_unlabeled_per_epoch is not None:
        print("Limiting number of unlabeled inputs per epoch to %d." %
              config.max_unlabeled_per_epoch)

    training_csv = report.GenericCSV(
        os.path.join(result_subdir, 'training.csv'), 'Epoch', 'EpochTime',
        'TrainLoss', 'TestLoss', 'TestAccuracy', 'LearningRate')

    # Initial training variables for temporal ensembling.

    if config.network_type == 'tempens':
        ensemble_prediction = np.zeros((len(X_train), num_classes))
        training_targets = np.zeros((len(X_train), num_classes))

    # ----------------------------------------------------------------------------
    # Training loop.
    # ----------------------------------------------------------------------------

    for epoch in range(config.start_epoch, config.num_epochs):

        # Export network snapshot every 50 epochs.

        if (epoch % 50) == 0 and epoch != config.start_epoch:
            save_network(
                net,
                os.path.join(result_subdir,
                             'network-snapshot-%03d.pkl' % epoch))

        # Evaluate up/down ramps.

        rampup_value = rampup(epoch)
        rampdown_value = rampdown(epoch)

        # Initialize WN/MOBN layers with a properly augmented minibatch.

        if epoch == 0:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, np.zeros((len(X_train), )),
                    np.zeros((len(X_train), )), config.minibatch_size)
                for (n, indices, inputs_a, inputs_b, labels,
                     mask) in minibatches:
                    init_fn(inputs_a, inputs_b)
                    break
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, np.zeros((len(X_train), )),
                    np.zeros((len(X_train), )), np.zeros((len(X_train), )),
                    config.minibatch_size)
                for (n, indices, inputs, labels, mask, targets) in minibatches:
                    init_fn(inputs)
                    break

        # Initialize epoch predictions for temporal ensembling.

        if config.network_type == 'tempens':
            epoch_predictions = np.zeros((len(X_train), num_classes))
            epoch_execmask = np.zeros(
                len(X_train))  # Which inputs were executed.
            training_targets = floatX(training_targets)

        # Training pass.

        start_time = time.time()
        train_err, train_n = 0., 0.

        learning_rate = rampup_value * rampdown_value * config.learning_rate_max
        adam_beta1 = rampdown_value * config.adam_beta1 + (
            1.0 - rampdown_value) * config.rampdown_beta1_target
        unsup_weight = rampup_value * scaled_unsup_weight_max
        if epoch == config.start_epoch:
            unsup_weight = 0.0

        with thread_utils.ThreadPool(8) as thread_pool:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, y_train, mask_train, config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs_a, inputs_b, labels,
                     mask) in minibatches:
                    (e_train, ) = train_fn(inputs_a, inputs_b, labels, mask,
                                           floatX(learning_rate),
                                           floatX(adam_beta1),
                                           floatX(unsup_weight))
                    train_err += e_train * n
                    train_n += n
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, y_train, mask_train, training_targets,
                    config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs, labels, mask, targets) in minibatches:
                    (e_train, prediction) = train_fn(inputs, labels, mask,
                                                     targets,
                                                     floatX(learning_rate),
                                                     floatX(adam_beta1),
                                                     floatX(unsup_weight))
                    for i, j in enumerate(indices):
                        epoch_predictions[j] = prediction[
                            i]  # Gather epoch predictions.
                        epoch_execmask[j] = 1.0
                    train_err += e_train * n
                    train_n += n

        # Test pass.

        val_err, val_acc, val_n = 0., 0., 0.
        with thread_utils.ThreadPool(8) as thread_pool:
            minibatches = iterate_minibatches(X_test, y_test,
                                              config.minibatch_size)
            minibatches = thread_utils.run_iterator_concurrently(
                minibatches, thread_pool)
            for (n, inputs, labels) in minibatches:
                err, acc = val_fn(inputs, labels)
                val_err += err * n
                val_acc += acc * n
                val_n += n

        if config.network_type == 'tempens':
            if config.max_unlabeled_per_epoch is None:
                # Basic mode.
                ensemble_prediction = (
                    config.prediction_decay * ensemble_prediction
                ) + (1.0 - config.prediction_decay) * epoch_predictions
                training_targets = ensemble_prediction / (
                    1.0 - config.prediction_decay**(
                        (epoch - config.start_epoch) + 1.0))
            else:
                # Sparse updates.
                epoch_execmask = epoch_execmask.reshape(-1, 1)
                ensemble_prediction = epoch_execmask * (
                    config.prediction_decay * ensemble_prediction +
                    (1.0 - config.prediction_decay) * epoch_predictions) + (
                        1.0 - epoch_execmask) * ensemble_prediction
                training_targets = ensemble_prediction / (
                    np.sum(ensemble_prediction, axis=1, keepdims=True) + 1e-8
                )  # Normalize

        # Export stats.

        training_csv.add_data(epoch,
                              time.time() - start_time, train_err / train_n,
                              val_err / val_n, val_acc / val_n * 100.0,
                              learning_rate)

        # Export progress monitor data.

        if monitor_filename is not None:
            with open(monitor_filename, 'wt') as f:
                json.dump(
                    {
                        "loss": 1.0 - val_acc / val_n,
                        "cur_epoch": (epoch + 1),
                        "max_epoch": config.num_epochs
                    }, f)

        # Print stats.

        print(
            "Epoch %3d of %3d took %6.3fs   Loss %.7f, %.7f  Acc=%5.2f  LR=%.7f"
            %
            (epoch, config.num_epochs, time.time() - start_time, train_err /
             train_n, val_err / val_n, val_acc / val_n * 100.0, learning_rate))

    # ----------------------------------------------------------------------------
    # Save and exit.
    # ----------------------------------------------------------------------------

    training_csv.close()
    print("Saving the final network.")
    np.savez(os.path.join(result_subdir, 'network-final.npz'),
             *ll.get_all_param_values(net))
    save_network(net, os.path.join(result_subdir, 'network-final.pkl'))
    print("Done.")
Code example #40
 def rectify(flatten_input_matrix):
     return T.maximum(flatten_input_matrix, 0.)
Code example #41
File: ann.py  Project: kittleik/aiprog
 def rectify(self, X):
     return T.maximum(X, 0.)
Code example #42
    def run_experiment(self, dataset, word_embedding, exp_name):

        # load parameters
        num_maps_word = self.options["num_maps_word"]
        drop_rate_word = self.options["drop_rate_word"]
        drop_rate_sentence = self.options["drop_rate_sentence"]
        word_window = self.options["word_window"]
        word_dim = self.options["word_dim"]
        k_max_word = self.options["k_max_word"]
        k_max_sentence = self.options["k_max_sentence"]
        batch_size = self.options["batch_size"]
        rho = self.options["rho"]
        epsilon = self.options["epsilon"]
        norm_lim = self.options["norm_lim"]
        max_iteration = self.options["max_iteration"]
        k_portion = self.options["k_portion"]
        num_maps_sentence = self.options["num_maps_sentence"]
        sentence_window = self.options["sentence_window"]

        sentence_len = len(dataset[0][0][0][0])
        sentence_num = len(dataset[0][0][0])

        # compute the sentence flags
        train_flags, test_flags = construct_sentence_flag(dataset)
        train_k_value = construct_dynamic_k(train_flags, k_portion)
        test_k_value = construct_dynamic_k(test_flags, k_portion)

        train_flags = theano.shared(value=np.asarray(
            train_flags, dtype=theano.config.floatX),
                                    borrow=True)
        test_flags = theano.shared(value=np.asarray(
            test_flags, dtype=theano.config.floatX),
                                   borrow=True)

        train_k = theano.shared(value=np.asarray(train_k_value,
                                                 dtype=theano.config.floatX),
                                borrow=True)
        test_k = theano.shared(value=np.asarray(test_k_value,
                                                dtype=theano.config.floatX),
                               borrow=True)

        # define the parameters
        x = T.tensor3("x")
        y = T.ivector("y")
        sen_flags = T.matrix("flag")
        sen_k = T.matrix("sen_k")
        rng = np.random.RandomState(1234)

        words = theano.shared(value=np.asarray(word_embedding,
                                               dtype=theano.config.floatX),
                              name="embedding",
                              borrow=True)
        zero_vector_tensor = T.vector()
        zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
        set_zero = theano.function(
            [zero_vector_tensor],
            updates=[(words, T.set_subtensor(words[0, :],
                                             zero_vector_tensor))])

        x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
            (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))

        dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)

        # compute convolution on words layer
        word_filter_shape = (num_maps_word, 1, word_window, word_dim)
        word_pool_size = (sentence_len - word_window + 1, 1)
        dropout_word_conv = nn.ConvPoolLayer(rng,
                                             input=dropout_x_emb,
                                             input_shape=None,
                                             filter_shape=word_filter_shape,
                                             pool_size=word_pool_size,
                                             activation=Tanh,
                                             k=k_max_word)
        sent_vec_dim = num_maps_word * k_max_word
        dropout_sent_vec = dropout_word_conv.output.reshape(
            (x.shape[0], 1, x.shape[1], sent_vec_dim))
        dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec,
                                                 drop_rate_sentence)

        word_conv = nn.ConvPoolLayer(rng,
                                     input=dropout_x_emb *
                                     (1 - drop_rate_word),
                                     input_shape=None,
                                     filter_shape=word_filter_shape,
                                     pool_size=word_pool_size,
                                     activation=Tanh,
                                     k=k_max_word,
                                     W=dropout_word_conv.W,
                                     b=dropout_word_conv.b)
        sent_vec = word_conv.output.reshape(
            (x.shape[0], 1, x.shape[1], sent_vec_dim))
        sent_vec = sent_vec * (1 - drop_rate_sentence)

        # construct doc level context information
        sent_filter_shape = (num_maps_sentence, 1, sentence_window,
                             sent_vec_dim)
        sent_pool_size = (sentence_num - sentence_window + 1, 1)
        dropout_sent_conv = nn.ConvPoolLayer(rng,
                                             input=dropout_sent_vec,
                                             input_shape=None,
                                             filter_shape=sent_filter_shape,
                                             pool_size=sent_pool_size,
                                             activation=Tanh,
                                             k=k_max_sentence)

        sent_conv = nn.ConvPoolLayer(rng,
                                     input=sent_vec,
                                     input_shape=None,
                                     filter_shape=sent_filter_shape,
                                     pool_size=sent_pool_size,
                                     activation=Tanh,
                                     k=k_max_sentence,
                                     W=dropout_sent_conv.W,
                                     b=dropout_sent_conv.b)

        # reshape the sentence vec
        dropout_sent_vec = dropout_sent_vec.reshape(
            (x.shape[0], x.shape[1], sent_vec_dim))
        sent_vec = sent_vec.reshape((x.shape[0], x.shape[1], sent_vec_dim))

        dropout_doc_vec = dropout_sent_conv.output.flatten(2)
        doc_vec = sent_conv.output.flatten(2)
        doc_vec_dim = num_maps_sentence * k_max_sentence

        # concatenate the doc vec along with the sentence vector
        con_dropout_sent_vec = T.concatenate([
            dropout_sent_vec,
            T.tile(dropout_doc_vec, [1, x.shape[1]]).reshape(
                (x.shape[0], x.shape[1], doc_vec_dim))
        ],
                                             axis=2).reshape(
                                                 (x.shape[0] * x.shape[1],
                                                  sent_vec_dim + doc_vec_dim))

        con_sent_vec = T.concatenate([
            sent_vec,
            T.tile(doc_vec, [1, x.shape[1]]).reshape(
                (x.shape[0], x.shape[1], doc_vec_dim))
        ],
                                     axis=2).reshape(
                                         (x.shape[0] * x.shape[1],
                                          sent_vec_dim + doc_vec_dim))

        # construct sentence level classifier
        n_in = sent_vec_dim + doc_vec_dim
        n_out = 1
        sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX)
        sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W")
        sen_b_value = nn.as_floatX(0.0)
        sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b")

        drop_sent_prob = T.nnet.sigmoid(
            T.dot(con_dropout_sent_vec, sen_W) + sen_b)
        sent_prob = T.nnet.sigmoid(T.dot(con_sent_vec, sen_W) + sen_b)

        # reform the sent vec to doc level
        drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1]))
        sent_prob = sent_prob.reshape((x.shape[0], x.shape[1]))

        # using the dynamic top k max probability as bag level probability
        # compute the dynamic K for each documents
        drop_doc_prob = T.sum(T.sort(drop_sent_prob, axis=1) * sen_k,
                              axis=1) / T.sum(sen_k, axis=1)
        doc_prob = T.sum(T.sort(sent_prob, axis=1) * sen_k, axis=1) / T.sum(
            sen_k, axis=1)

        drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7),
                               nn.as_floatX(1 - 1e-7))
        doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))

        doc_preds = doc_prob > 0.5

        # instance level cost
        drop_sent_cost = T.sum(
            T.maximum(
                0.0,
                nn.as_floatX(.5) - T.sgn(
                    drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) -
                    nn.as_floatX(0.6)) * T.dot(con_dropout_sent_vec, sen_W)) *
            sen_flags.reshape(
                (x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)

        # we need the most positive instance to be at least 0.7 in positive bags
        # and at most 0.1 in negative bags
        # we want the number of positive instances to be at least ...
        # and none of the positive instances in the negative bags

        # compute the number of positive instances
        positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
        pos_cost = T.maximum(nn.as_floatX(0.0),
                             positive_count - T.sum(sen_k, axis=1))
        neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
        penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))

        # add the sentence similarity constrains
        sen_sen = T.dot(con_dropout_sent_vec, con_dropout_sent_vec.T)
        sen_sqr = T.sum(con_dropout_sent_vec**2, axis=1)
        sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
        sen_sqr_right = sen_sqr.dimshuffle('x', 0)
        sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
        sen_sim_matrix = T.exp(-1 * sen_sim_matrix)

        sen_sim_prob = drop_sent_prob.reshape(
            (x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten()
        sen_sim_prob = sen_sim_prob**2

        sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                             sen_flags.reshape((1, x.shape[0] * x.shape[1])))

        sen_sim_cost = T.sum(
            sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)

        # bag level cost
        drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
                               (1 - y) * T.log(1 - drop_doc_prob) *
                               nn.as_floatX(0.4))
        drop_cost = drop_bag_cost * nn.as_floatX(0.6) + \
            drop_sent_cost * nn.as_floatX(0.1) + \
            penal_cost * nn.as_floatX(0.5) + \
            sen_sim_cost * nn.as_floatX(0.0001)

        # collect parameters
        self.params.append(words)
        self.params += dropout_word_conv.params
        self.params += dropout_sent_conv.params
        self.params.append(sen_W)
        self.params.append(sen_b)

        grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho,
                                               epsilon, norm_lim)

        # construct the dataset (batch order is shuffled each epoch below)
        train_x, train_y = nn.shared_dataset(dataset[0])
        test_x, test_y = nn.shared_dataset(dataset[1])
        test_cpu_y = dataset[1][1]

        n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
        n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

        # construct the model
        index = T.iscalar()
        train_func = theano.function(
            [index], [
                drop_cost, drop_bag_cost, drop_sent_cost, penal_cost,
                sen_sim_cost
            ],
            updates=grad_updates,
            givens={
                x: train_x[index * batch_size:(index + 1) * batch_size],
                y: train_y[index * batch_size:(index + 1) * batch_size],
                sen_flags:
                train_flags[index * batch_size:(index + 1) * batch_size],
                sen_k: train_k[index * batch_size:(index + 1) * batch_size]
            })

        test_func = theano.function(
            [index],
            doc_preds,
            givens={
                x: test_x[index * batch_size:(index + 1) * batch_size],
                sen_k: test_k[index * batch_size:(index + 1) * batch_size]
            })

        get_train_sent_prob = theano.function(
            [index],
            sent_prob,
            givens={x: train_x[index * batch_size:(index + 1) * batch_size]})

        get_test_sent_prob = theano.function(
            [index],
            sent_prob,
            givens={x: test_x[index * batch_size:(index + 1) * batch_size]})

        epoch = 0
        best_score = 0

        log_file = open("./log/%s.log" % exp_name, 'w')

        while epoch <= max_iteration:
            start_time = timeit.default_timer()
            epoch += 1
            costs = []

            for mini_index in np.random.permutation(range(n_train_batches)):
                cost_epoch = train_func(mini_index)
                costs.append(cost_epoch)
                set_zero(zero_vec)

            total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(
                *costs)
            print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim cost %f\n" % (
                epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
                np.mean(train_sent_cost), np.mean(train_penal_cost),
                np.mean(train_sim_cost))

            if epoch % 1 == 0:
                test_preds = []
                for i in xrange(n_test_batches):
                    test_y_pred = test_func(i)
                    test_preds.append(test_y_pred)
                test_preds = np.concatenate(test_preds)
                test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))

                precision, recall, beta, support = precision_recall_fscore_support(
                    test_cpu_y, test_preds, pos_label=1)

                if beta[1] > best_score or epoch % 5 == 0:
                    best_score = beta[1]
                    # save the sentence vectors
                    train_sens = [
                        get_train_sent_prob(i) for i in range(n_train_batches)
                    ]
                    test_sens = [
                        get_test_sent_prob(i) for i in range(n_test_batches)
                    ]

                    train_sens = np.concatenate(train_sens, axis=0)
                    test_sens = np.concatenate(test_sens, axis=0)

                    out_train_sent_file = "./results/%s_train_sent_%d.vec" % (
                        exp_name, epoch)
                    out_test_sent_file = "./results/%s_test_sent_%d.vec" % (
                        exp_name, epoch)

                    with open(out_test_sent_file,
                              'w') as test_f, open(out_train_sent_file,
                                                   'w') as train_f:
                        cPickle.dump(train_sens, train_f)
                        cPickle.dump(test_sens, test_f)
                    print "Get best performace at %d iteration %f" % (
                        epoch, test_score)
                    log_file.write(
                        "Get best performance at %d iteration %f \n" %
                        (epoch, test_score))

                end_time = timeit.default_timer()
                print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
                log_file.write(
                    "Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n"
                    % (epoch, precision[0], precision[1], recall[0], recall[1],
                       beta[0], beta[1], np.mean(total_train_cost),
                       np.mean(train_bag_cost), np.mean(train_sent_cost),
                       np.mean(train_penal_cost)))
                print "Using time %f m" % ((end_time - start_time) / 60.)
                log_file.write("Uing time %f m\n" %
                               ((end_time - start_time) / 60.))
            end_time = timeit.default_timer()
            print "Iteration %d Using time %f m" % (epoch,
                                                    (end_time - start_time) /
                                                    60.)
            log_file.write("Uing time %f m\n" %
                           ((end_time - start_time) / 60.))
            log_file.flush()

        log_file.close()
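The bag-level probability in the example above is a weighted average over the largest sentence probabilities: scores are sorted in ascending order and then multiplied by a per-document weight vector sen_k, which (as construct_dynamic_k appears to produce) puts its mass on the last, i.e. top-k, positions. A small NumPy sketch of that aggregation with made-up numbers:

import numpy as np

sent_prob = np.array([[0.1, 0.9, 0.4, 0.7]])   # one document, four sentence scores
sen_k = np.array([[0.0, 0.0, 1.0, 1.0]])       # dynamic k = 2: weight the two largest

# Sort ascending, then the k-mask keeps only the top-k scores.
doc_prob = np.sum(np.sort(sent_prob, axis=1) * sen_k, axis=1) / np.sum(sen_k, axis=1)
print(doc_prob)                                 # [0.8] = mean of 0.9 and 0.7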
Code example #43
def adamax(loss_or_grads=None,
           params=None,
           learning_rate=0.002,
           beta1=0.9,
           beta2=0.999,
           epsilon=1e-8):
    """Adamax updates

    Adamax updates implemented as in [1]_. This is a variant of the Adam
    algorithm based on the infinity norm.

    Parameters
    ----------
    loss_or_grads: symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params: list of shared variables
        The variables to generate update expressions for
    learning_rate: float
        Learning rate
    beta1: float
        Exponential decay rate for the first moment estimates.
    beta2: float
        Exponential decay rate for the weighted infinity norm estimates.
    epsilon: float
        Constant for numerical stability.

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    Optimizer can be called without both loss_or_grads and params
    in that case partial function is returned

    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.

    Examples
    --------
    >>> a = theano.shared(1.)
    >>> b = a*2
    >>> updates = adamax(b, [a], learning_rate=.01)
    >>> isinstance(updates, dict)
    True
    >>> optimizer = adamax(learning_rate=.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
    >>> isinstance(updates, dict)
    True
    """
    if loss_or_grads is None and params is None:
        return partial(adamax, **_get_call_kwargs(locals()))
    elif loss_or_grads is None or params is None:
        raise ValueError(
            "Please provide both `loss_or_grads` and `params` to get updates")
    all_grads = get_or_compute_grads(loss_or_grads, params)
    t_prev = theano.shared(pm.theanof.floatX(0.0))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = tt.constant(1)

    t = t_prev + 1
    a_t = learning_rate / (one - beta1**t)

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        u_t = tt.maximum(beta2 * u_prev, abs(g_t))
        step = a_t * m_t / (u_t + epsilon)

        updates[m_prev] = m_t
        updates[u_prev] = u_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
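The loop above applies the Adamax rule per parameter: a first-moment estimate m_t, an exponentially weighted infinity norm u_t, and a bias-corrected step size a_t. A minimal NumPy sketch of one update step mirroring those symbolic expressions, with made-up gradient values:

import numpy as np

def adamax_step(param, grad, m, u, t, lr=0.002, beta1=0.9, beta2=0.999, eps=1e-8):
    # One concrete step of the updates built symbolically above.
    t = t + 1
    a_t = lr / (1.0 - beta1 ** t)              # bias-corrected step size
    m = beta1 * m + (1.0 - beta1) * grad       # first-moment estimate
    u = np.maximum(beta2 * u, np.abs(grad))    # weighted infinity norm
    param = param - a_t * m / (u + eps)
    return param, m, u, t

p, m, u, t = np.ones(3), np.zeros(3), np.zeros(3), 0
p, m, u, t = adamax_step(p, np.array([0.1, -0.2, 0.3]), m, u, t)
print(p)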
Code example #44
File: gru4rec.py  Project: yueyedeai/GRU4Rec
 def relu(self,X):
     return T.maximum(X, 0)
Code example #45
    def __init__(self,
                 numpy_rng=None,
                 theano_rng=None,
                 n_h=99,
                 n_s=99,
                 n_v=100,
                 init_from=None,
                 sparse_hmask=None,
                 neg_sample_steps=1,
                 lr_spec=None,
                 lr_timestamp=None,
                 lr_mults={},
                 iscales={},
                 clip_min={},
                 clip_max={},
                 truncation_bound={},
                 l1={},
                 l2={},
                 sp_weight={},
                 sp_targ={},
                 batch_size=13,
                 compile=True,
                 debug=False,
                 seed=1241234,
                 my_save_path=None,
                 save_at=None,
                 save_every=None,
                 flags={},
                 max_updates=5e5):
        """
        :param n_h: number of h-hidden units
        :param n_v: number of visible units
        :param iscales: optional dictionary containing initialization scale for each parameter
        :param neg_sample_steps: number of sampling updates to perform in negative phase.
        :param l1: hyper-parameter controlling amount of L1 regularization
        :param l2: hyper-parameter controlling amount of L2 regularization
        :param batch_size: size of positive and negative phase minibatch
        :param compile: compile sampling and learning functions
        :param seed: seed used to initialize numpy and theano RNGs.
        """
        Model.__init__(self)
        Block.__init__(self)
        assert lr_spec is not None
        for k in ['h']:
            assert k in sp_weight.keys()
        for k in ['h']:
            assert k in sp_targ.keys()
        self.validate_flags(flags)

        self.jobman_channel = None
        self.jobman_state = {}
        self.register_names_to_del(['jobman_channel'])

        ### make sure all parameters are floatX ###
        for (k, v) in l1.iteritems():
            l1[k] = npy_floatX(v)
        for (k, v) in l2.iteritems():
            l2[k] = npy_floatX(v)
        for (k, v) in sp_weight.iteritems():
            sp_weight[k] = npy_floatX(v)
        for (k, v) in sp_targ.iteritems():
            sp_targ[k] = npy_floatX(v)
        for (k, v) in clip_min.iteritems():
            clip_min[k] = npy_floatX(v)
        for (k, v) in clip_max.iteritems():
            clip_max[k] = npy_floatX(v)

        # dump initialization parameters to object
        for (k, v) in locals().iteritems():
            if k != 'self': setattr(self, k, v)

        # allocate random number generators
        self.rng = numpy.random.RandomState(
            seed) if numpy_rng is None else numpy_rng
        self.theano_rng = RandomStreams(self.rng.randint(
            2**30)) if theano_rng is None else theano_rng

        ############### ALLOCATE PARAMETERS #################
        # allocate symbolic variable for input
        self.input = T.matrix('input')
        self.init_parameters()
        self.init_chains()

        # learning rate, with deferred 1./t annealing
        self.iter = sharedX(0.0, name='iter')

        if lr_spec['type'] == 'anneal':
            num = lr_spec['init'] * lr_spec['start']
            denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
            self.lr = T.maximum(lr_spec['floor'], num / denum)
        elif lr_spec['type'] == 'linear':
            lr_start = npy_floatX(lr_spec['start'])
            lr_end = npy_floatX(lr_spec['end'])
            self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(
                self.max_updates)
        else:
            raise ValueError('Incorrect value for lr_spec[type]')

        # configure input-space (new pylearn2 feature?)
        self.input_space = VectorSpace(n_v)
        self.output_space = VectorSpace(n_h)

        self.batches_seen = 0  # incremented on every batch
        self.examples_seen = 0  # incremented on every training example
        self.force_batch_size = batch_size  # force minibatch size

        self.error_record = []

        if compile: self.do_theano()

        #### load layer 1 parameters from file ####
        if init_from:
            self.load_params(init_from)
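The 'anneal' branch above holds the learning rate at lr_spec['init'] until slope * iter exceeds start, then decays it roughly as 1/iter, never dropping below the floor. A plain-Python sketch of that schedule, with hypothetical hyper-parameter values chosen only for illustration:

def annealed_lr(it, init=0.01, start=1000.0, slope=1.0, floor=1e-5):
    # lr = max(floor, init*start / max(start, slope*it)) -- constant phase, then ~1/it decay
    num = init * start
    denum = max(start, slope * it)
    return max(floor, num / denum)

print(annealed_lr(0))      # 0.01  (still in the constant phase)
print(annealed_lr(10000))  # 0.001 (the 1/t decay has kicked in)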
Code example #46
def dprime_loss(outXPos, outYPos, outXNeg, outYNeg, margin=5, alpha=0.5):
    norm2Pos = norm2(outXPos - outYPos)
    norm2Neg = norm2(outXNeg - outYNeg)
    return norm2Pos.std() + norm2Neg.std() + norm2Pos.mean() + T.maximum(
        0.0, margin - norm2Neg.mean())
Code example #47
def siamese_loss(outXPos, outYPos, outXNeg, outYNeg, margin=5, alpha=0.5):
    lossData = (1 - alpha) * norm2Sqr(outXPos - outYPos) + alpha * T.sqr(
        T.maximum(0.0, margin - norm2(outXNeg - outYNeg)))
    return lossData.mean()
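Both losses above use the same hinge idea: negative pairs contribute only while their distance is still inside the margin, via T.maximum(0, margin - d). A NumPy sketch of the siamese term, assuming norm2 is the row-wise Euclidean norm and norm2Sqr its square (those helpers are not shown in this listing):

import numpy as np

def siamese_loss_np(xp, yp, xn, yn, margin=5.0, alpha=0.5):
    d_pos_sqr = np.sum((xp - yp) ** 2, axis=1)        # squared distance, positive pairs
    d_neg = np.sqrt(np.sum((xn - yn) ** 2, axis=1))   # distance, negative pairs
    hinge = np.maximum(0.0, margin - d_neg) ** 2      # active only inside the margin
    return np.mean((1 - alpha) * d_pos_sqr + alpha * hinge)

rng = np.random.default_rng(0)
print(siamese_loss_np(rng.normal(size=(8, 4)), rng.normal(size=(8, 4)),
                      rng.normal(size=(8, 4)), rng.normal(size=(8, 4))))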
Code example #48
File: cifar10_big.py  Project: dapeter/BinaryNet
def run(binary=False, noise=None, nalpha=0, result_path=None):
    # BN parameters
    batch_size = 128
    print("batch_size = " + str(batch_size))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    LR_start = 0.005
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    train_set_size = 40000
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    print('Loading CIFAR-10 dataset...')
    cifar = CifarReader("./data/cifar-10-batches-py/")

    train_X, train_y = cifar.get_train_data(n_samples=train_set_size,
                                            noise=noise,
                                            alpha=nalpha)
    valid_X, valid_y = cifar.get_validation_data()
    test_X, test_y = cifar.get_test_data()
    print("train_set_size = " + str(train_y.shape[0]))
    print("validation_set_size = " + str(valid_y.shape[0]))
    print("test_set_size = " + str(test_y.shape[0]))

    # Log output
    with open(result_path + "params.txt", "a+") as l:
        print("batch_size = " + str(batch_size), file=l)
        print("alpha = " + str(alpha), file=l)
        print("epsilon = " + str(epsilon), file=l)
        print("num_epochs = " + str(num_epochs), file=l)
        print("dropout_in = " + str(dropout_in), file=l)
        print("dropout_hidden = " + str(dropout_hidden), file=l)
        if binary:
            print("activation = binary_net.binary_tanh_unit", file=l)
        else:
            print("activation = lasagne.nonlinearities.tanh", file=l)
        print("binary = " + str(binary), file=l)
        print("stochastic = " + str(stochastic), file=l)
        print("H = " + str(H), file=l)
        print("W_LR_scale = " + str(W_LR_scale), file=l)
        print("LR_start = " + str(LR_start), file=l)
        print("LR_fin = " + str(LR_fin), file=l)
        print("LR_decay = " + str(LR_decay), file=l)
        print("shuffle_parts = " + str(shuffle_parts), file=l)
        print("noise = " + str(noise), file=l)
        print("nalpha = " + str(nalpha), file=l)
        print("train_set_size = " + str(train_y.shape[0]), file=l)
        print("validation_set_size = " + str(valid_y.shape[0]), file=l)
        print("test_set_size = " + str(test_y.shape[0]), file=l)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = np.reshape(np.subtract(np.multiply(2. / 255., train_X), 1.),
                         (-1, 3, 32, 32))
    valid_X = np.reshape(np.subtract(np.multiply(2. / 255., valid_X), 1.),
                         (-1, 3, 32, 32))
    test_X = np.reshape(np.subtract(np.multiply(2. / 255., test_X), 1.),
                        (-1, 3, 32, 32))

    # flatten targets
    train_y = np.hstack(train_y)
    valid_y = np.hstack(valid_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    valid_y = np.float32(np.eye(10)[valid_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss
    train_y = 2 * train_y - 1.
    valid_y = 2 * valid_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_in)

    # 32C3-64C3-P2
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=32,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=64,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 128C3-256C3-P2
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=128,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=256,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)

    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 512FP-10FP
    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=512)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=10)

    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)

    cnn = lasagne.layers.NonlinearityLayer(
        cnn, nonlinearity=lasagne.nonlinearities.softmax)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:

        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)

        # other parameters updates
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
        updates.update(
            lasagne.updates.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR))

    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=params,
                                       learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     train_X,
                     train_y,
                     valid_X,
                     valid_y,
                     test_X,
                     test_y,
                     shuffle_parts=shuffle_parts,
                     result_path=result_path)
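The training objective above is the squared hinge loss with targets encoded as +/-1 (see the 2*y - 1 transformation earlier): each class score is penalized quadratically whenever its margin 1 - t*o is positive. A NumPy sketch with toy numbers, purely for illustration:

import numpy as np

target = np.array([[-1., -1., 1.]])     # one sample, true class = 2, encoded as +/-1
output = np.array([[0.1, -0.8, 0.6]])   # stand-in network scores

# Margins 1 - t*o are 1.1, 0.2 and 0.4 here, so all three terms contribute.
loss = np.mean(np.square(np.maximum(0.0, 1.0 - target * output)))
print(loss)                              # 0.47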
Code example #49
File: network3.py  Project: yinizhizhu/Deep-Learning
def ReLU(z): return T.maximum(0.0, z)
from theano.tensor.nnet import sigmoid
Code example #50
def ReLU(z):
    return T.maximum(0.0, z)
Code example #51
def f(q_i, D_gt_id, tparams, is_train, trng, options):

    # Use search engine again to compute the reward/metrics given a query.
    search = Search(options)

    # append the unknown vector for words whose index = -1.
    W_ = tensor.concatenate([tparams['W'], tparams['UNK']], axis=0)

    q_m = (q_i > -2).astype('float32')

    #get embeddings for the queries
    q_a = W_[q_i.flatten()].reshape(
        (q_i.shape[0], q_i.shape[1], prm.dim_emb)) * q_m[:, :, None]

    if len(prm.filters_query) > 0:
        q_aa = conv_query(q_a, tparams)
    else:
        q_aa = q_a

    q_a_avg = q_a.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True))

    out = []
    for n_iter in range(prm.n_iterations):

        if n_iter == 0 and prm.q_0_fixed_until >= prm.n_iterations:
            prob = tensor.zeros((q_a.shape[0], prm.max_words_input, 2))
            bl = tensor.zeros((q_a.shape[0], ))
            D_m_r = tensor.zeros((q_a.shape[0], prm.max_words_input))
        else:
            if n_iter > 0:
                D_m_ = (D_i_ > -2).astype('float32')
                D_a_ = W_[D_i_.flatten()].reshape(
                    (D_i_.shape[0], D_i_.shape[1], D_i_.shape[2],
                     prm.dim_emb)) * D_m_[:, :, :, None]
            else:
                D_a_ = 1. * q_a[:, None, :, :]
                D_m_ = 1. * q_m[:, None, :]

            if len(prm.filters_cand) > 0:
                D_aa_ = conv_cand(D_a_, tparams, 0)
            else:
                D_aa_ = D_a_

            D_aa_ = tensor.dot(D_aa_, tparams['Ad']) + tparams['bAd']

            if n_iter > 0:
                if prm.q_0_fixed_until < 2:
                    D_a = tensor.concatenate([D_a, D_a_], axis=1)
                    D_aa = tensor.concatenate([D_aa, D_aa_], axis=1)
                    D_m = tensor.concatenate([D_m, D_m_], axis=1)
                else:
                    D_a = D_a_
                    D_aa = D_aa_
                    D_m = D_m_
            else:
                D_a = D_a_
                D_aa = D_aa_
                D_m = D_m_

            D_a_r = D_a.reshape((D_a.shape[0], -1, D_a.shape[3]))
            D_aa_r = D_aa.reshape((D_aa.shape[0], -1, D_aa.shape[3]))

            D_m_r = D_m.reshape((D_m.shape[0], -1))

            q_aa_avg = q_aa.sum(1) / tensor.maximum(1.,
                                                    q_m.sum(1, keepdims=True))
            q_aa_att = q_aa_avg[:, None, :]
            q_aa_att = tensor.dot(q_aa_att, tparams['Aq'])

            z = D_aa_r + q_aa_att

            # estimate reward based on the query.
            bl = theano.gradient.grad_scale(z, 0.1)
            D_m_r_c = theano.gradient.disconnected_grad(D_m_r)
            bl = bl.sum(1) / tensor.maximum(1., D_m_r_c.sum(1))[:, None]
            for i in range(len(prm.n_hidden_critic) + 1):
                if prm.dropout > 0:
                    bl = dropout_layer(bl, is_train, trng)
                bl = tensor.maximum(0., bl)
                bl = tensor.dot(bl,
                                tparams['C' + str(i)]) + tparams['bC' + str(i)]

            bl = tensor.tanh(bl)
            bl = bl.flatten()

            for i in range(len(prm.n_hidden_actor) + 1):
                if prm.dropout > 0:
                    z = dropout_layer(z, is_train, trng)
                z = tensor.maximum(0., z)
                z = tensor.dot(z,
                               tparams['V' + str(i)]) + tparams['bV' + str(i)]

            prob = softmax_mask(z) * D_m_r[:, :, None]

            # if training, sample. Otherwise, pick maximum probability.
            s = trng.multinomial(n=1,
                                 pvals=prob.reshape((-1, 2)),
                                 dtype=prob.dtype)
            s = s.reshape((prob.shape[0], prob.shape[1], prob.shape[2]))

            #if frozen is enabled and this iteration is within its limit, pick maximum probability.
            if prm.frozen_until > 0:
                if n_iter < prm.frozen_until:
                    s = prob

            res = tensor.eq(is_train, 1.) * s + tensor.eq(is_train, 0.) * prob

            # final answer & valid words
            ans = res.argmax(2) * D_m_r

        if n_iter < prm.q_0_fixed_until:
            ones = tensor.ones((q_a.shape[0], prm.max_words_input))
            if n_iter > 0:
                # select everything from the original query in the first iteration.
                ans = tensor.concatenate([ones, ans], axis=1)
            else:
                ans = ones

        metrics, D_i_, D_id_, D_gt_m_ = search(ans, D_gt_id, n_iter, is_train)

        out.append([prob, ans, metrics, bl, D_m_r, D_id_])

    return out
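A pattern worth isolating from the example above: averaging embeddings over a padded word axis while guarding the denominator with maximum(1., mask.sum(...)), so rows that are entirely padding divide by 1 instead of 0. A NumPy sketch with a hypothetical two-document batch:

import numpy as np

emb = np.arange(12, dtype=float).reshape(2, 3, 2)   # (batch=2, words=3, dim=2)
mask = np.array([[1., 1., 0.],                      # two valid words
                 [0., 0., 0.]])                     # fully padded row

masked_sum = (emb * mask[:, :, None]).sum(axis=1)
denom = np.maximum(1.0, mask.sum(axis=1, keepdims=True))  # avoids division by zero
avg = masked_sum / denom
print(avg)   # row 0: mean of the first two word vectors; row 1: zeros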
Code example #52
File: train_SQUAD_conv.py  Project: oneproton/SQUAD
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=10,
                    hidden_size=10,
                    L2_weight=0.0001,
                    para_len_limit=400,
                    q_len_limit=40,
                    max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #     id2word = {y:x for x,y in overall_word2id.iteritems()}
    #     word2vec=load_word2vec()
    #     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    norm_extraF = normalize_matrix(extraF)

    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features
    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras

    load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate(
        [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    #     #LSTM
    #     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    #     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    #     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)

    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    #     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor = questions_model.output_tensor

    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #     #LSTM for questions
    #     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    #     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    #     questions_reps_tensor=questions_model.output_tensor

    #use CNN for question modeling
    #     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
    #     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
    #     Q_conv_para=[conv_W, conv_b]
    #     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
    #             image_shape=(batch_size, 1, emb_size, q_len_limit),
    #             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
    #     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
    #     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
    #     masked_conv_output=conv_output*gru_mask
    #     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))

    #     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
    #     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
    #     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
    #     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
    #     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
    #     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
    #     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix,
                                   q_matrix)  #(para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)  #(len, para_len)

    batch_q_reps, updates = theano.scan(
        fn=example_in_batch,
        outputs_info=None,
        sequences=[para_reps, questions_reps_tensor
                   ])  #batch_q_reps (batch, hidden, para_len)

    #attention distributions

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    transformed_para_reps = T.maximum(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0)  #relu
    transformed_q_reps = T.maximum(
        T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = transformed_para_reps + transformed_q_reps

    #     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
    #     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
    #     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b]
    #     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)
    #     accu_both=accumu_model.output_tensor.transpose((0,2,1))

    prior_att = T.concatenate([add_both, norm_extraF], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    #     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+ConvGRU_1.error#

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))
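    # Hedged illustration (added): the same AdaGrad rule in plain NumPy for a
    # single parameter.  The names theta/g/acc are hypothetical; the symbolic
    # `updates` list above is what the model actually uses.
    def _adagrad_step_numpy(theta, g, acc, lr=0.01, eps=1e-8):
        acc = acc + g ** 2                                # accumulate squared gradients
        theta = theta - lr * g / (numpy.sqrt(acc) + eps)  # scaled gradient step
        return theta, acc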

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many minibatches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray([
                    train_para_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_Q_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_label_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_para_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_feature_matrixlist[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list

                    Q_list_inword = test_Q_list_word[
                        test_para_id:test_para_id + batch_size]
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set = para_gold_ansset_list[q]
                        #                         print test_para_wordlist_list[q]
                        #                         print Q_list_inword[q]
                        #                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match += 1
                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        F1_match += F1


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc = F1_match / q_amount
                exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(
                            rootPath + 'Best_Paras_conv_' + str(max_exact_acc),
                            params)
                        print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
コード例 #53
0
def _magnitude(x, axis=-1):
    return T.sqrt(
        T.maximum(T.sqr(x).sum(axis=axis),
                  numpy.finfo(x.dtype).tiny))
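# Hedged NumPy equivalent (added for illustration): the snippet above is an
# L2 norm along `axis` with a tiny positive floor, so the gradient of sqrt is
# never evaluated at exactly zero.
import numpy as np

def magnitude_np(x, axis=-1):
    return np.sqrt(np.maximum((x ** 2).sum(axis=axis), np.finfo(x.dtype).tiny))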
コード例 #54
0
def spatial_gradient(prediction, target, l=0.1, m=2.):
    # Flatten input to make calc easier
    pred = prediction
    pred_v = pred.flatten(2)
    target_v = target.flatten(2)
    # Compute mask
    mask = T.gt(target_v,0.)
    # Compute n of valid pixels
    n_valid = T.sum(mask, axis=1)
    # Apply mask and log transform
    m_pred = pred_v * mask
    m_t = T.switch(mask, T.log(target_v),0.)
    d = m_pred - m_t

    # Define scale invariant cost
    scale_invariant_cost = (T.sum(n_valid * T.sum(d**2, axis=1)) - l*T.sum(T.sum(d, axis=1)**2))/ T.maximum(T.sum(n_valid**2), 1)

    # Add spatial gradient components from D. Eigen DNL

    # Squeeze in case
    if pred.ndim == 4:
        pred = pred[:,0,:,:]
    if target.ndim == 4:
        target = target[:,0,:,:]
    # Mask in tensor form
    mask_tensor = T.gt(target,0.)
    # Project into log space
    target = T.switch(mask_tensor, T.log(target),0.)
    # Stepsize
    h = 1
    # Compute spatial gradients symbolically
    p_di = (pred[:,h:,:] - pred[:,:-h,:]) * (1 / np.float32(h))
    p_dj = (pred[:,:,h:] - pred[:,:,:-h]) * (1 / np.float32(h))
    t_di = (target[:,h:,:] - target[:,:-h,:]) * (1 / np.float32(h))
    t_dj = (target[:,:,h:] - target[:,:,:-h]) * (1 / np.float32(h))
    m_di = T.and_(mask_tensor[:,h:,:], mask_tensor[:,:-h,:])
    m_dj = T.and_(mask_tensor[:,:,h:], mask_tensor[:,:,:-h])
    # Define spatial grad cost
    grad_cost = T.sum(m_di * (p_di - t_di)**2) / T.sum(m_di) + T.sum(m_dj * (p_dj - t_dj)**2) / T.sum(m_dj)
    # Compute final expression
    return scale_invariant_cost + grad_cost
コード例 #55
0
def scale_invariant_error(predictions, targets):
    """
    Scale invariant error in log space
    :param predictions: Prediction tensor
    :param targets: Target tensor
    :return: theano expression
    """
    _lambda_ = 0.5

    # Flatten input to make calc easier
    pred = predictions.flatten(2)
    target = targets.flatten(2)
    # Compute mask
    mask = T.gt(target, 0)
    # Compute n of valid pixels
    n_valid = T.sum(mask, axis=1)
    # Apply mask and log transform
    m_pred = pred * mask
    m_t = T.switch(mask, T.log(target), 0)
    d = m_pred - m_t

    # Define cost
    return (T.sum(n_valid * T.sum(d ** 2, axis=1)) - _lambda_ * T.sum(T.sum(d, axis=1) ** 2)) / T.maximum(
        T.sum(n_valid ** 2), 1)
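# Hedged reference (added for illustration, not from the original project):
# per image this is the scale-invariant error of Eigen et al.,
#     d_i = log(y_i) - log(y*_i),  D = (1/n) * sum(d_i**2) - (lambda/n**2) * (sum(d_i))**2,
# which the Theano expression above evaluates for a whole masked batch.  The
# NumPy sketch below assumes, as that code does, that the prediction is
# already in log space; all names here are hypothetical.
import numpy as np

def scale_invariant_error_np(log_pred, target, lam=0.5):
    mask = target > 0
    d = np.where(mask, log_pred - np.log(np.where(mask, target, 1.0)), 0.0)
    n = max(mask.sum(), 1)
    return (d ** 2).sum() / n - lam * (d.sum() ** 2) / n ** 2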
コード例 #56
0
    def relu(self, x):
        return T.maximum(x, 0)
コード例 #57
0
    def get_output_for(self, input, training=False, **kwargs):
        if training:
            R = (T.max(input, axis=1) - T.min(input, axis=1)).dimshuffle(
                0, 'x')
            input = self.temp * input / T.maximum(R, 0.1)
        return T.exp(input) / T.sum(T.exp(input), axis=1).dimshuffle(0, 'x')
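# Hedged NumPy sketch (added for illustration): the training-time branch above
# rescales each row of logits by temp / max(row_range, 0.1) before the softmax,
# which bounds the effective spread of every row.  `temp` here is hypothetical.
import numpy as np

def range_scaled_softmax_np(x, temp=1.0):
    r = np.maximum(x.max(axis=1, keepdims=True) - x.min(axis=1, keepdims=True), 0.1)
    z = temp * x / r                       # bounded logits
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)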
コード例 #58
0
def rectifier(x):
    return tensor.maximum(0., x)
コード例 #59
0
ファイル: pool.py プロジェクト: yixiu00001/Theano
    def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
        """Return the shape of the output from this op, for input of given
        shape and flags.

        Parameters
        ----------
        imgshape : tuple of integers or scalar Theano variables
            the shape of a tensor of images. The last two elements are
            interpreted as the number of rows, and the number of cols.
        ds : tuple of two ints
            downsample factor over rows and columns; this parameter
            gives the size of the pooling region.
        st : tuple of two ints
            the stride size, i.e. the distance between pooling
            regions. If it is None, it defaults to ds.
        ignore_border : bool
            if ds doesn't divide imgshape, do we include an extra
            row/col of partial downsampling (False) or ignore it
            (True).
        padding : tuple of two ints
            (pad_h, pad_w), pad zeros to extend beyond four borders of
            the images, pad_h is the size of the top and bottom
            margins, and pad_w is the size of the left and right
            margins.

        Returns
        -------
        list :
            the shape of the output from this op, for input of given
            shape.  This will have the same length as imgshape, but
            with last two elements reduced as per the downsampling &
            ignore_border flags.

        """
        if len(imgshape) < 2:
            raise TypeError('imgshape must have at least two elements '
                            '(rows, cols)')

        if st is None:
            st = ds
        r, c = imgshape[-2:]
        r += padding[0] * 2
        c += padding[1] * 2

        if ignore_border:
            out_r = (r - ds[0]) // st[0] + 1
            out_c = (c - ds[1]) // st[1] + 1
            if isinstance(r, theano.Variable):
                nr = tensor.maximum(out_r, 0)
            else:
                nr = numpy.maximum(out_r, 0)
            if isinstance(c, theano.Variable):
                nc = tensor.maximum(out_c, 0)
            else:
                nc = numpy.maximum(out_c, 0)
        else:
            if isinstance(r, theano.Variable):
                nr = tensor.switch(
                    tensor.ge(st[0], ds[0]), (r - 1) // st[0] + 1,
                    tensor.maximum(0, (r - 1 - ds[0]) // st[0] + 1) + 1)
            elif st[0] >= ds[0]:
                nr = (r - 1) // st[0] + 1
            else:
                nr = max(0, (r - 1 - ds[0]) // st[0] + 1) + 1

            if isinstance(c, theano.Variable):
                nc = tensor.switch(
                    tensor.ge(st[1], ds[1]), (c - 1) // st[1] + 1,
                    tensor.maximum(0, (c - 1 - ds[1]) // st[1] + 1) + 1)
            elif st[1] >= ds[1]:
                nc = (c - 1) // st[1] + 1
            else:
                nc = max(0, (c - 1 - ds[1]) // st[1] + 1) + 1

        rval = list(imgshape[:-2]) + [nr, nc]
        return rval
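# Hedged worked example (added for illustration): the integer branch of the
# arithmetic above, checked on concrete numbers.  For imgshape=(3, 64, 65),
# ds=(2, 2), st=None (so st=ds) and padding=(0, 0):
#   ignore_border=True : out_r = (64 - 2)//2 + 1 = 32, out_c = (65 - 2)//2 + 1 = 32
#   ignore_border=False: st >= ds, so out_r = (64 - 1)//2 + 1 = 32,
#                        out_c = (65 - 1)//2 + 1 = 33
# i.e. the trailing partial column is kept only when ignore_border=False.
def _pool_out_shape_int(r, c, ds, st=None, padding=(0, 0), ignore_border=False):
    st = ds if st is None else st
    r += padding[0] * 2
    c += padding[1] * 2
    if ignore_border:
        return (r - ds[0]) // st[0] + 1, (c - ds[1]) // st[1] + 1
    nr = (r - 1) // st[0] + 1 if st[0] >= ds[0] else max(0, (r - 1 - ds[0]) // st[0] + 1) + 1
    nc = (c - 1) // st[1] + 1 if st[1] >= ds[1] else max(0, (c - 1 - ds[1]) // st[1] + 1) + 1
    return nr, nc

assert _pool_out_shape_int(64, 65, (2, 2), ignore_border=True) == (32, 32)
assert _pool_out_shape_int(64, 65, (2, 2), ignore_border=False) == (32, 33)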
コード例 #60
0
ファイル: blockDropout.py プロジェクト: prithv1/condnet
def build_model(new_model=True):
    momentum_epsilon = 0.9

    block_size = 64
    nblocks = [10, 10]
    rate = [.16, .16]
    L2reg = 0.001

    is_uniform_policy = True

    lambda_b = [40, 20]
    lambda_v = [20, 20]
    learning_rates = [0.01, 0.5]

    print locals()

    hyperparams = locals()

    if new_model:
        expid = str(uuid.uuid4())
        import os
        import os.path
        code = file(os.path.abspath(__file__), 'r').read()
        os.mkdir(expid)
        os.chdir(expid)
        file('code.py', 'w').write(code)

        print expid

        f = file("params.txt", 'w')
        for i in hyperparams:
            f.write("%s:%s\n" % (i, str(hyperparams[i])))
        f.close()

    params = []
    reinforce_params = []
    shared.bind(reinforce_params, "reinforce")
    shared.bind(params)

    rect = lambda x: T.maximum(0, x)
    act = T.tanh

    model = StackModel([
        PolicyDropoutLayer(32 * 32 * 3, block_size * nblocks[0], block_size,
                           act, rate[0]),
        PolicyDropoutLayer(block_size * nblocks[0], block_size * nblocks[1],
                           block_size, act, rate[1]),
        InputSparseHiddenLayer(block_size * nblocks[1],
                               10,
                               T.nnet.softmax,
                               block_size=block_size)
    ])

    x = T.matrix()
    y = T.ivector()
    lr = T.scalar()

    y_hat, = model(x)
    loss = T.nnet.categorical_crossentropy(y_hat, y)
    cost = T.sum(loss)
    l2 = lambda x: sum([T.sum(i**2) for i in x])
    updates = []
    all_probs = []
    for i in []:  #range(len(model.layers)-1):
        probs = model.layers[i].probs
        sample_probs = model.layers[i].sample_probs
        layer_params = [model.layers[i].d.W, model.layers[i].d.b]
        all_probs.append(probs)

        l2_batchwise = lambda_b[i] * T.sum(
            abs(T.mean(probs, axis=0) - rate[i])**2)
        l2_exawise = lambda_b[i] * 0.001 * T.sum(
            abs(T.mean(probs, axis=1) - rate[i])**2)
        batch_var = lambda_v[i] * T.sum(T.var(probs, axis=0))
        batch_var += lambda_v[i] * 0.1 * T.sum(T.var(probs, axis=1))
        regularising_cost = l2_batchwise + l2_exawise - batch_var + L2reg * l2(
            layer_params)
        updates += reinforce_no_baseline(
            layer_params,
            sample_probs,
            loss - loss.min(),  # momentum_epsilon,
            lr * learning_rates[i],
            regularising_cost)

    error = T.sum(T.neq(y_hat.argmax(axis=1), y))
    nn_regularization = L2reg * l2(params)

    grads = T.grad(cost + nn_regularization, params)
    updates += gradient_descent(params, grads, lr)
    print params, reinforce_params

    learn = theano.function([x, y, lr], [cost, error],
                            updates=updates,
                            allow_input_downcast=True)
    test = theano.function([x, y], [cost, error], allow_input_downcast=True)

    return model, learn, test