def maxout_func(n_out, last_start, pool_size, rectify, lin_output):
    """Maxout over groups of `pool_size` consecutive columns of `lin_output`.

    Takes strided column slices so that unit i of the result is the max of
    columns i, i+pool_size, i+2*pool_size, ... of the linear output.

    :param n_out: number of maxout units (kept for interface compatibility;
        the slicing is driven by `last_start`/`pool_size`).
    :param last_start: index of the first column of the last pooling group.
    :param pool_size: number of linear pieces per maxout unit.
    :param rectify: if True, additionally clamp the result at 0 (ReLU).
    :param lin_output: 2-D symbolic matrix (batch, n_out * pool_size).
    :return: symbolic matrix (batch, n_out) of pooled activations.
    """
    tmp_output = lin_output[:, 0:last_start + 1:pool_size]
    for i in range(1, pool_size):
        cur = lin_output[:, i:last_start + i + 1:pool_size]
        tmp_output = T.maximum(cur, tmp_output)
    if rectify:
        # BUG FIX: the original assigned to undefined `self.tmp_output`
        # inside this plain function, raising NameError whenever
        # rectify was True.
        tmp_output = T.maximum(0, tmp_output)
    # BUG FIX: the original never returned the pooled result.
    return tmp_output
def grad_init(self):
    # Build a REINFORCE-style policy-gradient cost with a value-function
    # baseline and an entropy bonus, then its gradients w.r.t. the params.
    # Side effects: sets self.cost and self.grads.
    #self.mov_std = theano.shared(numpy.float32(1.), 'std')
    rewards_ = self.rewards[0]
    mean_rewards = rewards_.mean()
    var_rewards = rewards_.var()
    pp = self.params.values()
    #mean_rewards = (self.mask * self.rewards).sum(-1, keepdims=True) / tensor.maximum(1., self.mask.sum(-1, keepdims=True))
    ##centered_rewards = self.rewards - self.vapprox.v[:,:,0] - mean_rewards
    # Subtract both the batch mean and the learned value baseline.
    centered_rewards = rewards_ - mean_rewards - self.vapprox.v[:,0]
    #mean2_rewards = (self.mask * (self.rewards ** 2)).sum(-1, keepdims=True) / tensor.maximum(1., self.mask.sum(-1, keepdims=True))
    #var_rewards = mean2_rewards - (mean_rewards ** 2)
    # Normalise by the std, clamped to >= 1 so small-variance batches are
    # not amplified (and sqrt of a tiny negative variance is avoided).
    scaled_rewards = centered_rewards / tensor.maximum(1., tensor.sqrt(tensor.maximum(0., var_rewards)))
    logprob = 0.
    reg = 0.
    for oi in xrange(self.n_out):
        labs = self.actions[:,:,oi].flatten()
        # Flat index of each chosen action inside the flattened softmax.
        labs_idx = tensor.arange(labs.shape[0]) * self.out_dim + labs
        logprob = logprob + ((self.mask * tensor.log(self.pi[oi].flatten()+1e-6)[labs_idx]
                              .reshape([self.actions.shape[0], self.actions.shape[1]])).sum(0))
        # Negative entropy of pi[oi]; added to the cost it rewards exploration.
        reg = reg - (self.pi[oi] * tensor.log(self.pi[oi]+1e-6)).sum(-1).sum(0)
    self.cost = -tensor.mean(scaled_rewards * logprob + self.reg_c * reg)
    self.grads = tensor.grad(self.cost, wrt=pp)
def penalty(self):
    """Scalar overlap penalty over all unordered pairs of scene shapes.

    Two shapes are approximated by circles (radius = larger of the x/y
    scale entries of their object-to-world matrices). If the centre
    distance of a pair is below the larger radius, a term proportional
    to that circle's area is added.

    :return: Theano scalar penalty (0. when fewer than two shapes).
    """
    objects = self.scene_obj.shapes
    total = 0.
    for i in xrange(len(objects) - 1):
        obj1 = objects[i]
        # BUG FIX: start at i + 1 — the original `xrange(i, ...)` compared
        # each object with itself (distance 0 < radius), so the penalty
        # always fired.
        for j in xrange(i + 1, len(objects)):
            obj2 = objects[j]
            # Distance check between objects: centres are the translation
            # column of the object-to-world matrices.
            center1 = obj1.o2w.m[:, 3]
            center2 = obj2.o2w.m[:, 3]
            radius1 = T.maximum(obj1.o2w.m[0, 0], obj1.o2w.m[1, 1])
            radius2 = T.maximum(obj2.o2w.m[0, 0], obj2.o2w.m[1, 1])
            max_rad = T.maximum(radius1, radius2)
            # TODO remake it for batch
            dist = T.sqrt((center1[0] - center2[0]) ** 2
                          + (center1[1] - center2[1]) ** 2)
            penflag = T.switch(dist < max_rad, 1, 0)
            # Computing the exact circle-overlap area was attempted before
            # but never worked; the flag * area approximation is used.
            # BUG FIX: accumulate over pairs — the original rebound the
            # accumulator inside the loop, so only the last pair counted.
            total = total + penflag * (np.pi * max_rad ** 2) * 2
    return total
def f_encode_decode(w, train=True):
    # Encode-decode pass of a hierarchical VAE: bottom-up deterministic
    # encoders, then top-down stochastic layers accumulating KL, then a
    # pixel decoder. Returns a dict of per-layer KL costs and total cost.
    # NOTE(review): relies on closure variables (_x, layers, depths, px,
    # kl_min, shape_x, x_enc, x_dec, x_dec_nl, G, N) from enclosing scope.
    results = {}
    h = x_enc(_x - .5, w)
    obj_kl = G.sharedf(0.)
    # bottom-up encoders
    for i in range(len(depths)):
        for j in range(depths[i]):
            h = layers[i][j].up(h, w)
    # top-level activations: learned bias tiled to the top feature-map size
    # (Python-2 integer division by 2**len(depths)).
    h = T.tile(w['h_top'].dimshuffle('x',0,'x','x'),
               (_x.shape[0],1,shape_x[1]/2**len(depths), shape_x[2]/2**len(depths)))
    # top-down priors, posteriors and decoders
    for i in list(reversed(range(len(depths)))):
        for j in list(reversed(range(depths[i]))):
            h, kl = layers[i][j].down_q(h, train, w)
            kl_sum = kl.sum(axis=(1,2,3))
            results['cost_z'+str(i).zfill(3)+'_'+str(j).zfill(3)] = kl_sum
            # Constraint: Minimum number of bits per featuremap, averaged across minibatch
            if kl_min > 0:
                kl = kl.sum(axis=(2,3)).mean(axis=0,dtype=G.floatX)
                obj_kl += T.maximum(np.asarray(kl_min,G.floatX), kl).sum(dtype=G.floatX)
            else:
                obj_kl += kl_sum
    output = x_dec(x_dec_nl(h, w), w)
    # empirical distribution
    if px == 'logistic':
        # Discretised logistic likelihood with a shared learned log-scale.
        mean_x = T.clip(output+.5, 0, 1)
        logsd_x = 0*mean_x + w['logsd_x']
        obj_logpx = N.rand.discretized_logistic(mean_x, logsd_x, 1/256., _x).logp
        #obj_z = T.printing.Print('obj_z')(obj_z)
        obj = obj_logpx - obj_kl
        # Compute the bits per pixel
        obj *= (1./np.prod(shape_x) * 1./np.log(2.)).astype('float32')
        #if not '__init' in w:
        #    raise Exception()
    elif px == 'bernoulli':
        prob_x = T.nnet.sigmoid(output)
        # Clamp away from 0/1 for a finite log-likelihood.
        prob_x = T.maximum(T.minimum(prob_x, 1-1e-7), 1e-7)
        #prob_x = T.printing.Print('prob_x')(prob_x)
        obj_logpx = N.rand.bernoulli(prob_x, _x).logp
        #obj_logqz = T.printing.Print('obj_logqz')(obj_logqz)
        #obj_logpz = T.printing.Print('obj_logpz')(obj_logpz)
        #obj_logpx = T.printing.Print('obj_logpx')(obj_logpx)
        obj = obj_logpx - obj_kl
        #obj = T.printing.Print('obj')(obj)
    results['cost_x'] = -obj_logpx
    results['cost'] = -obj
    return results
def __init__(self, factor=numpy.sqrt(2), decay=1.0, min_factor=None, padding=False, **kwargs):
    # Fractional max-pooling conv layer: decays the pooling factor per
    # epoch (floored at min_factor) and zero-pads inputs that would become
    # too small for the filter.
    super(ConvFMPLayer, self).__init__(**kwargs)
    if min_factor is None:
        min_factor = factor
    # Anneal the factor with the epoch, but never below min_factor.
    factor = T.maximum(factor * (decay ** self.network.epoch), numpy.float32(min_factor))
    sizes_raw = self.source.output_sizes
    # handle size problems
    if not padding:
        padding = T.min(self.source.output_sizes / factor) <= 0
    # Print op fires maybe_print_pad_warning when padding is triggered.
    padding = theano.printing.Print(global_fn=maybe_print_pad_warning)(padding)
    # Smallest sizes that still admit the filter after pooling.
    fixed_sizes = T.maximum(sizes_raw, T.cast(T.as_tensor(
        [factor + self.filter_height - 1, factor + self.filter_width - 1]), 'float32'))
    sizes = ifelse(padding, fixed_sizes, sizes_raw)
    X_size = T.cast(T.max(sizes, axis=0), "int32")
    def pad_fn(x_t, s):
        # Zero-pad one batch entry up to the common (X_size) spatial size.
        x = T.alloc(numpy.cast["float32"](0), X_size[0], X_size[1], self.X.shape[3])
        x = T.set_subtensor(x[:s[0], :s[1]], x_t[:s[0], :s[1]])
        return x
    fixed_X, _ = theano.scan(pad_fn, [self.X.dimshuffle(2, 0, 1, 3), T.cast(sizes_raw, "int32")])
    fixed_X = fixed_X.dimshuffle(1, 2, 0, 3)
    # Only use the padded input when padding was actually required.
    self.X = ifelse(padding, T.unbroadcast(fixed_X, 3), self.X)
    conv_out = CuDNNConvHWBCOpValidInstance(self.X, self.W, self.b)
    conv_out_sizes = self.conv_output_size_from_input_size(sizes)
    self.output, self.output_sizes = fmp(conv_out, conv_out_sizes, T.cast(factor,'float32'))
def lp_norm(self, n, k, r, c, z):
    '''
    Lp = ( 1/n * sum(|x_i|^p, 1..n))^(1/p) where p = 1 + ln(1+e^P)
    :param n:
    :param k:
    :param r:
    :param c:
    :param z:
    :return:
    '''
    # Window geometry.
    pool_h, pool_w = self.pool_size
    stride_h, stride_w = self.stride
    pad_h = self.pad[0]
    pad_w = self.pad[1]
    # Clamp the pooling window rows to the padded image extent.
    row_st = r * stride_h
    row_end = T.minimum(row_st + pool_h, self.img_rows)
    row_st = T.maximum(row_st, pad_h)
    row_end = T.minimum(row_end, self.x_m2d + pad_h)
    # Same clamping for the columns.
    col_st = c * stride_w
    col_end = T.minimum(col_st + pool_w, self.img_cols)
    col_st = T.maximum(col_st, pad_w)
    col_end = T.minimum(col_end, self.x_m1d + pad_w)
    # Learned exponent p = 1 + softplus(P) >= 1 (shared by both powers).
    p = 1 + T.log(1 + T.exp(self.P))
    window = T.abs_(T.flatten(self.y[n, k, row_st:row_end, col_st:col_end], 1))
    lp_value = T.pow(T.mean(T.pow(window, p)), 1 / p)
    return T.set_subtensor(z[n, k, r, c], lp_value)
def updates(self, cost, params, learning_rate = 0.1, momentum= 0.95, rescale=5.):
    """RMSProp with Nesterov momentum and global gradient-norm rescaling.

    :param cost: scalar Theano expression to minimise.
    :param params: list of shared variables to update.
    :param learning_rate: step size.
    :param momentum: Nesterov momentum coefficient.
    :param rescale: gradients are scaled down when their global norm
        exceeds this threshold.
    :return: list of (shared_variable, new_value) update pairs.
    """
    grads = T.grad(cost, params)
    # Global gradient norm = sqrt of the summed squared entries.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    # BUG FIX: removed a second `grad_norm = T.sqrt(grad_norm)` — the norm
    # was square-rooted twice, so rescaling compared against sqrt(norm)
    # and under-clipped large gradients.
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1e-4
    updates = []
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite gradients are replaced by a small decay-like step.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # RMS of the centred gradient, floored to avoid division blow-up.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - learning_rate * grad / rms_grad
        # Nesterov look-ahead step applied directly to the parameter.
        update2 = momentum * momentum * memory - (
            1 + momentum) * learning_rate * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def build_loss(embeddings):
    """Return a pair (loss, p) given a theano shared variable representing
    the `embeddings`.

    `loss` is a theano variable for the t-SNE loss; `p` is a symbolic
    matrix of target neighbour probabilities the loss depends on.
    """
    # Student-t similarities between points in the embedding space,
    # with the diagonal (self-similarity) zeroed out.
    pairwise = distance_matrix(embeddings)
    numerator = zero_diagonal(1 / (1 + pairwise))
    q = numerator / numerator.sum(axis=0)
    # Normalize q globally; p is normalized the same way, so the
    # mismatch in per-column normalization cancels out.
    q /= q.sum()
    q = T.maximum(q, 1E-12)
    p_ji_var = T.matrix('neighbour_probabilities')
    p_ji_var.tag.test_value = np.random.random(
        (10, 10)).astype(theano.config.floatX)
    # Floor p as well so the log stays finite.
    p_ji_var_floored = T.maximum(p_ji_var, 1E-12)
    # KL(p || q): the t-SNE objective.
    loss = (p_ji_var * T.log(p_ji_var_floored / q)).sum()
    return loss, p_ji_var
def grad_init(self):
    # Policy-gradient cost over flattened (time * batch) sequences with a
    # value baseline and entropy bonus. Sets self.cost and self.grads.
    mask_ = self.mask.flatten()
    rewards_ = self.rewards.flatten()
    actions_ = self.actions.reshape([self.actions.shape[0]*self.actions.shape[1],-1])
    #self.mov_std = theano.shared(numpy.float32(1.), 'std')
    pp = self.params.values()
    # Masked mean of the rewards (denominator clamped at 1).
    mean_rewards = (mask_ * rewards_).sum(-1, keepdims=True) / tensor.maximum(1., mask_.sum(-1, keepdims=True))
    # Subtract value baseline and the mean reward.
    centered_rewards = rewards_ - self.vapprox.v[:,0] - mean_rewards
    # Masked second moment -> variance.
    mean2_rewards = (mask_ * (rewards_ ** 2)).sum(-1, keepdims=True) / tensor.maximum(1., mask_.sum(-1, keepdims=True))
    var_rewards = mean2_rewards - (mean_rewards ** 2)
    # Std-normalise; divisor clamped to >= 1 and variance floored at 0.
    scaled_rewards = centered_rewards / tensor.maximum(1., tensor.sqrt(tensor.maximum(0., var_rewards)))
    #scaled_rewards = centered_rewards
    logprob = 0.
    reg = 0.
    for oi in xrange(self.n_out):
        labs = actions_[:,oi].flatten()
        # Flat index of the chosen action inside the flattened softmax.
        labs_idx = tensor.arange(labs.shape[0]) * self.out_dim + labs
        logprob = logprob + (mask_ * tensor.log(self.pi[oi].flatten()+1e-6)[labs_idx])
        # Negative entropy; added to the cost it rewards exploration.
        reg = reg - (self.pi[oi] * tensor.log(self.pi[oi]+1e-6)).sum(-1).sum(0)
    self.cost = -tensor.mean(scaled_rewards * logprob + self.reg_c * reg)
    self.grads = tensor.grad(self.cost, wrt=pp)
def __init__(self, input):
    #A 3in1 maxpooling
    # Halves the first (time) dimension; each output frame is the max over
    # a neighbourhood of input frames (Python-2 integer division).
    self.output_shape = input.output_shape[0]/2, input.output_shape[1]
    self.origlayer = input
    # Start from the even-indexed frames.
    self.output = input.output[::2]
    # Max with the odd-indexed frames.
    self.output = T.set_subtensor(self.output[:input.output.shape[0]/2],
                                  T.maximum(self.output[:input.output.shape[0]/2], input.output[1::2]))
    # Also max each frame (from index 1 on) with the preceding odd frame,
    # giving a 3-wide receptive field.
    # NOTE(review): assumes the slice shapes line up for odd-length inputs
    # — confirm against the callers' sequence lengths.
    self.output = T.set_subtensor(self.output[1:],
                                  T.maximum(self.output[1:], input.output[1:-1:2]))
def _local_error(self, targetM, i):
    """Return the (pull_error, push_error) pair of LMNN-style losses for
    feature slice `i` under metric `targetM`.

    Side effect: sets self.zerocount to the number of inactive (zero)
    hinge terms on the diagonal.
    """
    features = self._x[:, i, :]
    # Pull term: Mahalanobis distance between target-neighbour pairs.
    anchor = features[self._neighborpairs[:, 0]]
    neighbor = features[self._neighborpairs[:, 1]]
    pull_diff = anchor - neighbor
    pull_error = linalg.trace(pull_diff.dot(targetM).dot(pull_diff.T))
    # Push term: hinge on triplets (i, j, l) where l is an impostor.
    anchor = features[self._set[:, 0]]
    positive = features[self._set[:, 1]]
    impostor = features[self._set[:, 2]]
    diffij = anchor - positive
    diffil = anchor - impostor
    lossij = diffij.dot(targetM).dot(diffij.T)
    lossil = diffil.dot(targetM).dot(diffil.T)
    # Only triplets whose anchor and impostor labels differ count.
    mask = T.neq(self._y[self._set[:, 0]], self._y[self._set[:, 2]])
    hinge = mask * T.maximum(lossij - lossil + 1, 0)
    push_error = linalg.trace(hinge)
    self.zerocount = T.eq(linalg.diag(hinge), 0).sum()
    return pull_error, push_error
def penality(self):
    """Scalar overlap penalty over all unordered pairs of scene shapes.

    (Name kept as-is for caller compatibility.) Shapes are approximated
    by circles; a pair whose centre distance is below the larger radius
    contributes a term proportional to that circle's area.

    :return: Theano scalar penalty (0. when fewer than two shapes).
    """
    objects = self.scene_obj.shapes
    total = 0.
    for i in xrange(len(objects) - 1):
        obj1 = objects[i]
        # BUG FIX: the original started this loop at `i`, comparing each
        # object with itself (distance 0), which always fired the penalty.
        for j in xrange(i + 1, len(objects)):
            obj2 = objects[j]
            # Distance check between objects.
            center1 = obj1.o2w.m[:, 3]
            center2 = obj2.o2w.m[:, 3]
            radius1 = T.maximum(obj1.o2w.m[0, 0], obj1.o2w.m[1, 1])
            radius2 = T.maximum(obj2.o2w.m[0, 0], obj2.o2w.m[1, 1])
            max_rad = T.maximum(radius1, radius2)
            # TODO remake it for batch
            dist = T.sqrt((center1[0] - center2[0]) ** 2
                          + (center1[1] - center2[1]) ** 2)
            penflag = T.switch(dist < max_rad, 1, 0)
            # BUG FIX: accumulate over all pairs — the original rebound
            # the accumulator each iteration, keeping only the last pair.
            total = total + penflag * (np.pi * max_rad ** 2) * 2
    return total
def infer_shape(self, node, ishapes):
    """Shape inference: each of the three outputs is a vector whose
    length is max(1, len(mus))."""
    mus_shape, _prior_mu_shape, _prior_sigma_shape = ishapes
    out_shape = (tensor.maximum(1, mus_shape[0]),)
    return [out_shape] * 3
def getTrainingFunc2(self):
    # Build a Theano training function for a ReLU MLP with 0.5 dropout on
    # hidden layers, squared-error loss, momentum SGD and weight decay on
    # even-indexed parameters (weights).
    input = T.dmatrix()
    target = T.dvector()
    learning_rate = T.scalar()
    y = input
    # Hidden layers: params are stored in triples; index i*3 is the weight,
    # i*3+1 the bias (i*3+2 is skipped everywhere below).
    for i in xrange(0, self.n_layers-1):
        y = T.maximum(0.0, T.dot(y, self.params[i*3]) + self.params[i*3+1] )
        # Dropout with keep probability 0.5 (no test-time rescaling here).
        y = y*self.theano_rng.binomial(y.shape, 1, 0.5)
    # Output layer (ReLU, no dropout).
    y = T.maximum(0, T.dot(y, self.params[(self.n_layers-1)*3]) + self.params[(self.n_layers-1)*3+1] )
    y = T.squeeze(y.T)
    #y = T.dot(y, self.params[-1])
    diff = y - target
    #regulator = theano.printing.Print('norm:')(T.sum(abs(y))*alpha)
    #L = theano.printing.Print('L:')(T.sum(diff*diff) + regulator)
    L = T.sum(diff*diff) #- target*T.log(y) - (1-target)*T.log(1-y)
    # Gradients only w.r.t. weights and biases (every 3rd param skipped).
    gparam = T.grad(L, [ self.params[i] for i in xrange(len(self.params)) if i%3 != 2 ])
    updates = {}
    for i,p,g,m in zip(xrange(len(gparam)),
                       [ self.params[i] for i in xrange(len(self.params)) if i%3 != 2 ],
                       gparam,
                       [ self.moments[i] for i in xrange(len(self.moments)) if i%3 != 2 ]):
        if i%2 == 0:
            # Weights: momentum + weight decay (0.0005) + gradient step.
            updates[m] = 0.9*m - learning_rate*0.0005*p - learning_rate*g
        else:
            # Biases: momentum + gradient step, no decay.
            updates[m] = 0.9*m - learning_rate*g
        updates[p] = p + m
    train_func = theano.function( inputs = [input, target, learning_rate], outputs=[L,y], updates= updates)
    return train_func
def cost(self):
    # Training cost: either framewise cross-entropy, or (when xflag is set
    # during training) an entropy-shaping objective that maximises entropy
    # across time/batch and minimises it across the class dimension.
    # Returns (cost, known_grads).
    known_grads = None
    # Flatten (time, batch, dim) -> (time*batch, dim).
    xd = self.z.reshape((self.z.shape[0]*self.z.shape[1],self.z.shape[2]))
    epsilon = numpy.float32(1e-10)
    # cross-entropy
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=xd[self.i], y_idx=self.y_data_flat[self.i])
    ce = T.sum(nll)
    # entropy
    def entropy(p, axis=None):
        if self.use_max and axis is not None:
            # Move `axis` to the front, then use max as a hard proxy.
            q = p.dimshuffle(axis, *(range(axis) + range(axis+1,p.ndim)))
            #return -T.mean(T.log(T.maximum(T.max(q,axis=0),epsilon)))
            return -T.mean(T.max(q,axis=0)+epsilon) + T.log(T.cast(p.shape[axis],'float32'))
        else:
            # NOTE(review): this branch indexes p.shape[axis] even when
            # axis is None — only valid because all callers pass an axis.
            return -T.mean(p*T.log(p+epsilon)) + T.log(T.cast(p.shape[axis],'float32'))
    # Masked exponentiated scores (index mask broadcast over classes).
    ez = T.exp(self.z) * T.cast(self.index.dimshuffle(0,1,'x').repeat(self.z.shape[2],axis=2), 'float32')
    et = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=0,keepdims=True)),axis=0)
    eb = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=1,keepdims=True)),axis=1)
    ed = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=2,keepdims=True)),axis=2)
    # maximize entropy across T and B and minimize entropy across D
    e = self.e_d * ed - (self.e_t * et + self.e_b * eb) / numpy.float32(self.e_t + self.e_b)
    import theano.ifelse
    if self.train_flag:
        # Runtime switch between the entropy objective and cross-entropy.
        return theano.ifelse.ifelse(T.cast(self.xflag,'int8'),e,ce), known_grads
    else:
        return ce, known_grads
def __init__(self, input, input_shape, filter_shape, border_mode="valid"):
    # Convolutional maxout layer with 3 linear pieces: three independent
    # convolutions whose elementwise maximum is the output.
    # input : theano symbolic variable of input, 4D tensor
    # input_shape : shape of input / (minibatch size, input channel num, image height, image width)
    # filter_shape : shape of filter / (# of new channels to make, input channel num, filter height, filter width)
    # initialize W (weight) randomly
    rng = np.random.RandomState(int(time.time()))
    # Fan-in based uniform init bound.
    w_bound = math.sqrt(filter_shape[1] * filter_shape[2] * filter_shape[3])
    self.W1 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape),
                                       dtype=theano.config.floatX), name='W', borrow=True)
    self.W2 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape),
                                       dtype=theano.config.floatX), name='W', borrow=True)
    self.W3 = theano.shared(np.asarray(rng.uniform(low=-1.0/w_bound, high=1.0/w_bound, size=filter_shape),
                                       dtype=theano.config.floatX), name='W', borrow=True)
    # initialize b (bias) with zeros
    self.b1 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)
    self.b2 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)
    self.b3 = theano.shared(np.asarray(np.zeros(filter_shape[0],), dtype=theano.config.floatX), name='b', borrow=True)
    # convolution & sigmoid calculation
    #self.conv_out = conv.conv2d(input, self.W, image_shape=input_shape, filter_shape=filter_shape)
    #self.output = 1.7159*T.tanh((self.conv_out + self.b.dimshuffle('x', 0, 'x', 'x'))*(2.0/3.0))
    # maxout : 3
    out1 = conv.conv2d(input, self.W1, image_shape=input_shape, filter_shape=filter_shape,
                       border_mode=border_mode) + self.b1.dimshuffle('x', 0, 'x', 'x')
    out2 = conv.conv2d(input, self.W2, image_shape=input_shape, filter_shape=filter_shape,
                       border_mode=border_mode) + self.b2.dimshuffle('x', 0, 'x', 'x')
    out3 = conv.conv2d(input, self.W3, image_shape=input_shape, filter_shape=filter_shape,
                       border_mode=border_mode) + self.b3.dimshuffle('x', 0, 'x', 'x')
    self.output = T.maximum(out1, T.maximum(out2, out3))
    # save parameter of this layer for back-prop convinience
    self.params = [self.W1, self.W2, self.W3, self.b1, self.b2, self.b3]
    insize = input_shape[1] * input_shape[2] * input_shape[3]
    self.paramins = [insize, insize, insize, insize, insize, insize]
def convolutional_model(X, w_1, w_2, w_3, w_4, w_5, w_6, p_1, p_2, p_3, p_4, p_5):
    """Three conv -> ReLU -> maxpool -> tanh -> dropout stages, then two
    ReLU dense layers with dropout, then a linear readout.

    NOTE(review): b_1, b_2 and b_3 are module-level shared biases, not
    parameters of this function.
    """
    relu1 = T.maximum(conv2d(X, w_1, border_mode='full'), 0.)
    pooled1 = max_pool_2d(relu1, (2, 2), ignore_border=True)
    l1 = dropout(T.tanh(pooled1 + b_1.dimshuffle('x', 0, 'x', 'x')), p_1)

    relu2 = T.maximum(conv2d(l1, w_2), 0.)
    pooled2 = max_pool_2d(relu2, (2, 2), ignore_border=True)
    l2 = dropout(T.tanh(pooled2 + b_2.dimshuffle('x', 0, 'x', 'x')), p_2)

    relu3 = T.maximum(conv2d(l2, w_3), 0.)
    pooled3 = max_pool_2d(relu3, (2, 2), ignore_border=True)
    # flatten to switch back to 1d layers
    l3 = dropout(T.flatten(T.tanh(pooled3 + b_3.dimshuffle('x', 0, 'x', 'x')), outdim=2), p_3)

    l4 = dropout(T.maximum(T.dot(l3, w_4), 0.), p_4)
    l5 = dropout(T.maximum(T.dot(l4, w_5), 0.), p_5)
    return T.dot(l5, w_6)
def iter_j(in_matrix, j_out_matrix,out_matrix,k_img_matrix):
    # One step of a margin-1 ranking loss over (sentence, image) pairs.
    # The last two rows of j_out_matrix carry its entity embedding.
    # NOTE(review): `entity` below comes from the enclosing scope while
    # `jentity` belongs to the j-th sentence — confirm the asymmetry is
    # intended.
    j_out_len=j_out_matrix.shape[0]
    jentity=j_out_matrix[j_out_len-2:]
    j_out_matrix=j_out_matrix[:j_out_len-2]
    # Hinge between the two sequence/image scores (contrastive image).
    score_img_seq=T.maximum(0,seq_score(out_matrix,in_matrix,entity)+1-
                            seq_score(out_matrix,k_img_matrix,entity))
    # Hinge between the two scores for the contrastive sentence.
    score_sent_seq=T.maximum(0,seq_score(j_out_matrix,k_img_matrix,jentity)+1-
                             seq_score(out_matrix,k_img_matrix,entity))
    return score_img_seq+score_sent_seq
def inner(target, embedding):
    """Return a theano expression of a vector containing the sample wise loss
    of drlim.

    The push_margin, pull_margin and coefficient for the contrastives
    used are %.f, %.f and %.f respectively.

    Parameters
    ----------

    target : array_like
        A vector of length `n`. If 1, sample `2 * n` and sample
        `2 * n + 1` are deemed similar.

    embedding : array_like
        Array containing the embeddings of samples row wise.
    """ % (push_margin, pull_margin, c_contrastive)
    similar = target[:, 0]
    n_pair = embedding.shape[0] // 2
    n_feature = embedding.shape[1]
    # Reshape so each row holds one pair of embeddings side by side.
    pairs = embedding.reshape((n_pair, n_feature * 2))
    # Euclidean distance per pair (small epsilon keeps the sqrt smooth).
    delta = pairs[:, :n_feature] - pairs[:, n_feature:]
    dist = T.sqrt((delta ** 2).sum(axis=1) + 1e-8)
    # Similar pairs are pulled inside pull_margin, dissimilar pairs are
    # pushed beyond push_margin.
    pull = similar * f_pull_loss(T.maximum(0, dist - pull_margin))
    push = (1 - similar) * f_push_loss(T.maximum(0, push_margin - dist))
    loss = pull + c_contrastive * push
    return loss.dimshuffle(0, 'x')
def _build_activation(self, act=None):
    '''Given an activation description, return a callable that implements it.

    `act` may be a single name from the table below or several names
    joined with "+", which are composed left-to-right. Defaults to the
    command-line --activation setting.

    Raises KeyError for an unknown activation name.
    '''
    def compose(a, b):
        # Chain two activations; keep a readable name for introspection.
        c = lambda z: b(a(z))
        c.__theanets_name__ = '%s(%s)' % (b.__theanets_name__, a.__theanets_name__)
        return c
    act = act or self.args.activation.lower()
    if '+' in act:
        return reduce(compose, (self._build_activation(a) for a in act.split('+')))
    options = {
        'tanh': TT.tanh,
        'linear': lambda z: z,
        'logistic': TT.nnet.sigmoid,
        'softplus': TT.nnet.softplus,
        # shorthands
        'relu': lambda z: TT.maximum(0, z),
        # modifiers
        'rect:max': lambda z: TT.minimum(1, z),
        'rect:min': lambda z: TT.maximum(0, z),
        # normalization
        'norm:dc': lambda z: (z.T - z.mean(axis=1)).T,
        'norm:max': lambda z: (z.T / TT.maximum(1e-10, abs(z).max(axis=1))).T,
        'norm:std': lambda z: (z.T / TT.maximum(1e-10, TT.std(z, axis=1))).T,
    }
    for k, v in options.iteritems():
        v.__theanets_name__ = k
    try:
        return options[act]
    except KeyError:
        # BUG FIX: was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit and any unrelated error.
        raise KeyError('unknown --activation %s' % act)
def discriminator(x, z, params, mb_size, num_hidden, num_latent):
    # Discriminator over joint (x, z) pairs (ALI/BiGAN style): a 4-layer
    # MLP whose hidden pre-activations are batch-normalised (mean removed,
    # divided by 1 + std) before bias and ReLU. Returns {'c': sigmoid score}.
    x_z = T.concatenate([x,z], axis = 1)
    # Layers carry weights only; biases are added manually after the
    # normalisation below.
    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units = num_hidden, nonlinearity=None, W = params['W_disc_1'])
    h_out_2 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_2'])
    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_3'])
    h_out_4 = DenseLayer((mb_size, 1), num_units = 1, nonlinearity=None, W = params['W_disc_4'], b = params['b_disc_4'])
    h_out_1_value = h_out_1.get_output_for(x_z)
    # ReLU over the normalised, biased pre-activation.
    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis = 0)) / (1.0 + T.std(h_out_1_value, axis = 0)) + params['b_disc_1'])
    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis = 0)) / (1.0 + T.std(h_out_2_value, axis = 0)) + params['b_disc_2'])
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)
    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis = 0)) / (1.0 + T.std(h_out_3_value, axis = 0)) + params['b_disc_3'])
    h_out_4_value = h_out_4.get_output_for(h_out_3_value)
    raw_y = h_out_4_value
    classification = T.nnet.sigmoid(raw_y)
    results = {'c' : classification}
    return results
def crop_attention_bilinear(bbox, frame):
    # Differentiable bilinear crop: sample an (att_row x att_col) grid
    # from `frame` inside bbox = (y0, x0, y1, x1) with coords in [-1, 1].
    # NOTE(review): img_col/img_row/att_col/att_row come from module scope.
    att = bbox
    frame_col = img_col
    frame_row = img_row
    # Box centre mapped from [-1, 1] to pixel coordinates.
    _cx = (att[1] + att[3]) / 2; cx = (_cx + 1) / 2. * frame_col
    _cy = (att[0] + att[2]) / 2; cy = (_cy + 1) / 2. * frame_row
    # Half width/height in pixels.
    _w = TT.abs_(att[3] - att[1]) / 2; w = _w * frame_col
    _h = TT.abs_(att[2] - att[0]) / 2; h = _h * frame_row
    # Spacing between sample points of the output grid.
    dx = w / (att_col - 1)
    dy = h / (att_row - 1)
    # Sample-point locations along each axis.
    mx = cx + dx * (TT.arange(att_col, dtype=T.config.floatX) - (att_col - 1) / 2.)
    my = cy + dy * (TT.arange(att_row, dtype=T.config.floatX) - (att_row - 1) / 2.)
    a = TT.arange(frame_col, dtype=T.config.floatX)
    b = TT.arange(frame_row, dtype=T.config.floatX)
    # Triangular (tent) interpolation weights per axis.
    ax = TT.maximum(0, 1 - TT.abs_(a.dimshuffle(0, 'x') - mx.dimshuffle('x', 0)))
    by = TT.maximum(0, 1 - TT.abs_(b.dimshuffle(0, 'x') - my.dimshuffle('x', 0)))
    # Separable bilinear sampling: rows then columns.
    bilin = TT.dot(by.T, TT.dot(frame, ax))
    return bilin
def getOutputs(self, previousMemory, input_layer):
    # One step of a gated memory cell: a 3-layer ReLU controller reads the
    # input, then read/keep/write gates drive the memory update and output.
    # Returns (new_memory, output). (Python-2 print statements.)
    print "prev memory dim", previousMemory.ndim
    print "input layer dim", input_layer.ndim
    assert(previousMemory.ndim == input_layer.ndim)
    # Batched inputs concatenate along axis 1, single samples along axis 0.
    if previousMemory.ndim == 1:
        axisConcat = 0
    else:
        axisConcat = 1
    # Controller stack; each layer re-reads the raw input (skip concat).
    controller_0 = T.maximum(0.0, T.dot(input_layer, self.params["W_controller_0"]) + self.params["b_controller_0"])
    controller_1 = T.maximum(0.0, T.dot(T.concatenate([controller_0, input_layer], axis = axisConcat), self.params["W_controller_1"]) + self.params["b_controller_1"])
    controller = T.maximum(0.0, T.dot(T.concatenate([controller_1, input_layer], axis = axisConcat), self.params["W_controller"]) + self.params["b_controller"])
    #Have multiple layers in controller? This determines what gets passed in / out from the network.
    if self.useReluReadGate:
        readgate = T.maximum(0.0, (T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readgate"].T) + self.params["b_readgate"]))
    else:
        readgate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readgate"].T) + self.params["b_readgate"])
    readdelta = T.tanh(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_readdelta"].T) + self.params["b_readdelta"])
    keepgate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_keepgate"].T) + self.params["b_keepgate"])
    # New memory: decayed old memory plus gated read update.
    memory = previousMemory * keepgate + readgate * readdelta
    writegate = T.nnet.sigmoid(T.dot(T.concatenate([controller, input_layer], axis = axisConcat), self.params["W_writegate"].T) + self.params["b_writegate"])
    # NOTE(review): the new memory is multiplied by 0.0 here, so the output
    # head currently ignores it — confirm this is intentional (ablation?).
    output = writegate * T.maximum(0.0, T.dot(T.concatenate([controller, 0.0 * memory, 1.0 * input_layer], axis = axisConcat), self.params["W_output"].T) + self.params["b_output"])
    return memory, output
def minimize(self, loss, momentum, rescale):
    """RMSProp with Nesterov momentum and global gradient-norm rescaling.

    :param loss: scalar expression passed to the base-class minimize.
    :param momentum: Nesterov momentum coefficient.
    :param rescale: gradients are scaled down when their global norm
        exceeds this threshold.
    :return: list of (shared_variable, new_value) update pairs.
    """
    super(RMSPropOptimizer, self).minimize(loss)
    grads = self.gradparams
    # Global gradient norm = sqrt of the summed squared entries.
    grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
    not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
    # BUG FIX: removed a second `grad_norm = T.sqrt(grad_norm)` — the norm
    # was square-rooted twice, so rescaling compared against sqrt(norm)
    # and under-clipped large gradients.
    scaling_num = rescale
    scaling_den = T.maximum(rescale, grad_norm)
    # Magic constants
    combination_coeff = 0.9
    minimum_grad = 1E-4
    updates = []
    params = self.params
    for n, (param, grad) in enumerate(zip(params, grads)):
        # Non-finite gradients are replaced by a small decay-like step.
        grad = T.switch(not_finite, 0.1 * param,
                        grad * (scaling_num / scaling_den))
        old_square = self.running_square_[n]
        new_square = combination_coeff * old_square + (
            1. - combination_coeff) * T.sqr(grad)
        old_avg = self.running_avg_[n]
        new_avg = combination_coeff * old_avg + (
            1. - combination_coeff) * grad
        # RMS of the centred gradient, floored to avoid division blow-up.
        rms_grad = T.sqrt(new_square - new_avg ** 2)
        rms_grad = T.maximum(rms_grad, minimum_grad)
        memory = self.memory_[n]
        update = momentum * memory - self.lr * grad / rms_grad
        # Nesterov look-ahead step applied directly to the parameter.
        update2 = momentum * momentum * memory - (
            1 + momentum) * self.lr * grad / rms_grad
        updates.append((old_square, new_square))
        updates.append((old_avg, new_avg))
        updates.append((memory, update))
        updates.append((param, param + update2))
    return updates
def advanced_indexing(volume, *indices_list, **kwargs):
    """ Performs advanced indexing on `volume`.

    This function exists because in Theano<=0.9 advanced indexing is
    only supported along the first dimension.

    Notes
    -----
    Assuming `volume` is C contiguous.
    """
    strides = kwargs.get("strides")
    if strides is None:
        # Row-major strides (in elements) for the indexed dimensions,
        # derived from the trailing dims' cumulative product.
        shapes = T.cast(volume.shape[:len(indices_list)], dtype=theano.config.floatX)
        strides = T.concatenate([T.ones((1,)), T.cumprod(shapes[::-1])[:-1]], axis=0)[::-1]
    shapes = T.cast(volume.shape, dtype=theano.config.floatX)
    # Clip each index into its valid range, then linearise into a flat
    # row index (the last indexed dim has stride 1).
    indices = T.maximum(0, T.minimum(indices_list[-1], shapes[len(indices_list)-1]-1))
    for i in range(len(indices_list)-1):
        clipped_idx = T.maximum(0, T.minimum(indices_list[i], shapes[i]-1))
        indices += clipped_idx * strides[i]
    # indices = T.sum(T.stack(indices_list, axis=1)*strides[:len(indices_list)], axis=1)
    indices = T.cast(indices, dtype="int32")
    # Index rows of the volume flattened over everything but the last dim.
    return volume.reshape((-1, volume.shape[-1]))[indices]
def init_lpool(self, x, x_shp, ker_shape=(3, 3), order=1, stride=1, mode='valid'):
    """Lp pooling via box convolution: (box_sum(|x|^order))^(1/order),
    with optional striding. `order` may be a scalar or an array.

    Returns (pooled_tensor, pooled_shape).
    """
    # Classify the order: all ones (plain box sum) or integral
    # (skip the abs, since x**order is well-defined).
    if hasattr(order, '__iter__'):
        is_unit = (order == 1).all()
        is_integral = (order == order.astype(np.int)).all()
    else:
        is_unit = order == 1
        is_integral = (order == int(order))
    if is_unit:
        r, r_shp = self.boxconv(x, x_shp, ker_shape)
    else:
        base = x ** order if is_integral else abs(x) ** order
        r, r_shp = self.boxconv(base, x_shp, ker_shape)
        # Clamp at 0 before the fractional root.
        r = tensor.maximum(r, 0) ** (1.0 / order)
    if stride > 1:
        r = r[:, :, ::stride, ::stride]
        # intdiv is tricky... so just use numpy
        r_shp = np.empty(r_shp)[:, :, ::stride, ::stride].shape
    return r, r_shp
def get_updates(self, v):
    # Build parameter updates for one CD-k training step of the RBM,
    # including optional L2 (cost term) and L1 (proximal) regularisation.
    # Contrastive divergence
    chain_end, updates_CD = self.CD(self, chain_start=v, cdk=self.CDk)
    # [Expected] negative log-likelihood
    cost = T.mean(self.free_energy(v), axis=0) - T.mean(self.free_energy(chain_end), axis=0)
    # L2 Regularization
    if isinstance(self.regularize, L2Regularization):
        cost += self.regularization
    # Gradients (use automatic differentiation)
    # We must not compute the gradient through the gibbs sampling, i.e. use consider_constant
    gparams = T.grad(cost, self.parameters, consider_constant=[chain_end])
    gradients = dict(zip(self.parameters, gparams))
    # Get learning rates for all params given their gradient.
    lr, updates_lr = self.learning_rate(gradients)
    updates = OrderedDict()
    updates.update(updates_CD)  # Add updates from CD
    updates.update(updates_lr)  # Add updates from learning_rate
    # Updates parameters
    for param, gparam in gradients.items():
        updates[param] = param - lr[param] * gradients[param]
    # L1: soft-thresholding (proximal) step applied after the gradient
    # update, shrinking b and W towards zero by lr * decay.
    if isinstance(self.regularize, L1Regularization):
        updates[self.b] = T.sgn(updates[self.b]) * T.maximum(abs(updates[self.b]) - lr[self.b]*self.regularize.decay, 0)
        updates[self.W] = T.sgn(updates[self.W]) * T.maximum(abs(updates[self.W]) - lr[self.W]*self.regularize.decay, 0)
    return updates
def setup(self, bottom, top):
    # Caffe Python layer setup: compiles numerically stable sigmoid
    # cross-entropy forward/backward passes as Theano functions.
    from caffe_helper.theano_util import init_theano
    init_theano()
    import theano as tn
    import theano.tensor as T
    assert len(bottom) == 2
    assert len(top) == 1
    s_y = T.matrix('y')  # y in [-inf, inf]
    s_t = T.matrix('t')  # t in {-1, 0, 1} where 0 is ignored
    s_dloss = T.scalar('dloss')  # upstream gradient w.r.t. the loss
    # Forward
    # s_loss = T.mean(abs(s_t) * T.log1p(T.exp(-s_y * s_t)))  # unstable
    # Stable rewrite of log(1 + exp(-y*t)); |t| masks ignored entries and
    # the denominator averages over non-ignored ones (min 1 avoids 0/0).
    s_loss = -T.sum(
        abs(s_t) * (
            s_y * ((s_t >= 0) - (s_y >= 0))
            - T.log1p(T.exp(-abs(s_y)))))\
        / T.maximum(T.sum(abs(s_t)), 1)
    # Backward: d loss / d y = |t| * (sigmoid(y) - [t >= 0]) / count.
    s_p = 1 / (1 + T.exp(-s_y))
    s_dy = s_dloss * abs(s_t) * (s_p - (s_t >= 0)) / \
        T.maximum(T.sum(abs(s_t)), 1)
    def _o(s):
        # Borrowed output avoids an extra copy on the Theano side.
        return tn.Out(s, borrow=True)
    self.tn_forward = tn.function([s_y, s_t], s_loss)
    self.tn_backward = tn.function([s_y, s_t, s_dloss], _o(s_dy))
def call(self, X):
    # Soft "square attention": keeps the region of `frame` selected by a
    # 4-vector box (coords in [-1, 1]) and scales everything outside the
    # (scale-enlarged) box by alpha.
    if type(X) is not list or len(X) != 2:
        raise Exception("SquareAttention must be called on a list of two tensors. Got: " + str(X))
    frame, position = X[0], X[1]
    # Reshaping the input to exclude the time dimension
    frameShape = K.shape(frame)
    positionShape = K.shape(position)
    (chans, height, width) = frameShape[-3:]
    targetDim = positionShape[-1]
    frame = K.reshape(frame, (-1, chans, height, width))
    position = K.reshape(position, (-1, ) + (targetDim, ))
    # Applying the attention: grow the box by scale/2 per side, clamped
    # to the [-1, 1] canvas.
    hw = THT.abs_(position[:, 2] - position[:, 0]) * self.scale / 2.0
    hh = THT.abs_(position[:, 3] - position[:, 1]) * self.scale / 2.0
    position = THT.maximum(THT.set_subtensor(position[:, 0], position[:, 0] - hw), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 2], position[:, 2] + hw), 1.0)
    position = THT.maximum(THT.set_subtensor(position[:, 1], position[:, 1] - hh), -1.0)
    position = THT.minimum(THT.set_subtensor(position[:, 3], position[:, 3] + hh), 1.0)
    # Binary masks over columns (FX) and rows (FY) that lie inside the box.
    rX = Data.linspace(-1.0, 1.0, width)
    rY = Data.linspace(-1.0, 1.0, height)
    FX = THT.gt(rX, position[:,0].dimshuffle(0,'x')) * THT.le(rX, position[:,2].dimshuffle(0,'x'))
    FY = THT.gt(rY, position[:,1].dimshuffle(0,'x')) * THT.le(rY, position[:,3].dimshuffle(0,'x'))
    m = FY.dimshuffle(0, 1, 'x') * FX.dimshuffle(0, 'x', 1)
    # Outside the box the mask becomes alpha instead of 0.
    m = m + self.alpha - THT.gt(m, 0.) * self.alpha
    frame = frame * m.dimshuffle(0, 'x', 1, 2)
    # Reshaping the frame to include time dimension
    output = K.reshape(frame, frameShape)
    return output
def tiled_eye(n1, n2, dtype="float32"):
    """Return an (n1, n2) matrix made of a small identity matrix tiled
    until it covers the requested shape, then cropped.

    n1/n2 may be symbolic; division is Python-2 integer division on the
    symbolic graph (ceil of n/m via (n - 1) / m + 1).
    """
    reps1 = T.maximum((n1 - 1) / n2 + 1, 1)
    reps2 = T.maximum((n2 - 1) / n1 + 1, 1)
    base = T.eye(T.minimum(n1, n2), dtype=dtype)
    tiled = T.tile(base, (reps1, reps2))
    return tiled[:n1, :n2]
def dist_info_sym(self, obs_var, state_info_vars=None):
    """Symbolic distribution parameters for observations `obs_var`.

    Returns a dict with the mean and the log-std of the Gaussian policy,
    the latter floored at log(min_std) when a minimum std is configured.
    """
    outputs = L.get_output([self._l_mean, self._l_log_std], obs_var)
    mean_var, log_std_var = outputs
    min_std = self.min_std
    if min_std is not None:
        # Clamp log-std from below to avoid degenerate (near-zero) stds.
        log_std_var = TT.maximum(log_std_var, np.log(min_std))
    return dict(mean=mean_var, log_std=log_std_var)
left_features = U.dot(x_u.T) right_features = V.dot(x_v.T) prediction = T.diagonal(T.dot(left_features.T, right_features)) # LOSS FUNCTIONS # squared loss loss_squared = T.mean((y - prediction)**2) # logistic loss (0/1 classiifcation) prob_prediction = T.nnet.sigmoid(prediction) loss_log = -T.mean(y * T.log(prob_prediction) + (1 - y) * T.log(1 - prob_prediction)) # e-insensitive loss epsilon = 0.1 loss_e_insens = T.mean( T.maximum(prediction - y - epsilon, T.maximum(0, y - prediction - epsilon))) # hinge loss (-1/1 classification) loss_hinge = T.mean(T.maximum(-prediction * y + epsilon, 0)) # PENALTIES cost = loss_log + lambda_u * T.mean(U**2) + lambda_v * T.mean(V**2) gU, gV = T.grad(cost, [U, V]) learning_rate = 1e-7 momentum_factor = 0.9 train = theano.function(inputs=[y, x_u, x_v], outputs=cost, updates=((U, U + delta_U), (V, V + delta_V), (delta_U, momentum_factor * delta_U - (1 - momentum_factor) * learning_rate * gU),
discrete=discrete, H=H, N=N, nonlinearity=lasagne.nonlinearities.identity, #identity num_units=10) cnn = lasagne.layers.BatchNormLayer( cnn, epsilon=epsilon, alpha=alpha) train_output = lasagne.layers.get_output(cnn, deterministic=False) best_params = lasagne.layers.get_all_params(cnn, discrete=True) # squared hinge loss loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output))) if discrete: updates = discrete_grads(loss,cnn,LR) params = lasagne.layers.get_all_params(cnn, trainable=True, discrete=False) updates = OrderedDict(updates.items() + lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR).items()) else: params = lasagne.layers.get_all_params(cnn, trainable=True) updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR) test_output = lasagne.layers.get_output(cnn, deterministic=True) test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output))) test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        npz_path=None,
        freeze_lst=None,
        reinit_lst=None,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        mean_network=None,
        std_network=None,
        dist_cls=DiagonalGaussian,
):
    """
    :param env_spec:
    :param hidden_sizes: list of sizes for the fully-connected hidden layers
    :param learn_std: Is std trainable
    :param init_std: Initial std; if None, std parameters stored in
        ``npz_path`` are kept as-is instead of being overwritten
    :param adaptive_std:
    :param std_share_network:
    :param std_hidden_sizes: list of sizes for the fully-connected layers
     for std
    :param min_std: whether to make sure that the std is at least some
     threshold value, to avoid numerical issues
    :param npz_path: optional .npz file (relative to the project root)
        holding pretrained parameter values under key 'params'
    :param freeze_lst: optional list of booleans, one per non-input layer;
        True removes the "trainable" tag from that layer's params
    :param reinit_lst: optional list of booleans, one per non-input layer;
        True reinitializes that layer's params (Glorot for matrices,
        zeros for vectors/biases)
    :param std_hidden_nonlinearity:
    :param hidden_nonlinearity: nonlinearity used for each hidden layer
    :param output_nonlinearity: nonlinearity for the output layer
    :param mean_network: custom network for the output mean
    :param std_network: custom network for the output log std
    :return:
    """
    Serializable.quick_init(self, locals())
    assert isinstance(env_spec.action_space, Box)

    if init_std is None:
        init_std = 1.0
        set_std_params = False
    else:
        set_std_params = True

    obs_dim = env_spec.observation_space.flat_dim
    action_dim = env_spec.action_space.flat_dim

    # create network
    if mean_network is None:
        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
    self._mean_network = mean_network
    self._layers_mean = mean_network.layers

    l_mean = mean_network.output_layer
    obs_var = mean_network.input_layer.input_var

    if std_network is not None:
        # NOTE(review): self._layers_log_std is never assigned on this
        # path, so the concatenation below would raise AttributeError —
        # confirm callers never pass std_network, or assign it here.
        l_log_std = std_network.output_layer
    else:
        if adaptive_std:
            # Observation-dependent log-std sharing the mean net's input.
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
            self._layers_log_std = std_network.layers
        else:
            # Constant (optionally learnable) log-std, independent of obs.
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]
    self._layers = self._layers_mean + self._layers_log_std

    self.min_std = min_std

    mean_var, log_std_var = L.get_output([l_mean, l_log_std])

    if self.min_std is not None:
        # Clamp log-std from below for numerical stability.
        log_std_var = TT.maximum(log_std_var, np.log(min_std))

    self._mean_var, self._log_std_var = mean_var, log_std_var

    self._l_mean = l_mean
    self._l_log_std = l_log_std

    self._dist = dist_cls(action_dim)

    LasagnePowered.__init__(self, [l_mean, l_log_std])
    super(GaussianMLPPolicy, self).__init__(env_spec)

    self._f_dist = ext.compile_function(
        inputs=[obs_var],
        outputs=[mean_var, log_std_var],
    )

    # Optionally warm-start the policy from a saved parameter file.
    if npz_path is not None:
        param_dict = dict(
            np.load(os.path.join(config.PROJECT_PATH, npz_path)))
        param_values = param_dict['params']
        # todo: don't forget about this
        if set_std_params:
            self.set_param_values(param_values)
        else:
            self.set_param_values_transfer(param_values)

    # Freeze selected layers (input layer excluded from the count).
    if freeze_lst is not None:
        assert len(freeze_lst) == len(self._layers) - 1
        for layer, should_freeze in zip(self._layers[1:], freeze_lst):
            if should_freeze:
                for param, tags in layer.params.items():
                    tags.remove("trainable")

    # Reinitialize selected layers (input layer excluded from the count).
    if reinit_lst is not None:
        # BUG FIX: this previously asserted on len(freeze_lst), which
        # raises TypeError whenever reinit_lst is supplied without
        # freeze_lst; the length being validated is reinit_lst's.
        assert len(reinit_lst) == len(
            self._layers) - 1  # since input layer is counted
        for layer, should_reinit in zip(self._layers[1:], reinit_lst):
            if should_reinit:
                print("reinitialized")
                for v in layer.params:
                    val = v.get_value()
                    if (len(val.shape) < 2):
                        # Vectors (biases) are reset to zero.
                        v.set_value(lasagne.init.Constant(0.0)(val.shape))
                    else:
                        # Matrices get a fresh Glorot-uniform draw.
                        v.set_value(lasagne.init.GlorotUniform()(
                            val.shape))
            else:
                print("did not reinit")
def std_norm(_x, axis=None):
    """Divide ``_x`` by its standard deviation along ``axis``.

    The std is floored at 1e-4 to avoid division by (near-)zero.

    :param _x: symbolic tensor to normalize
    :param axis: axes to reduce over; defaults to the last three
        (assumed channel/height/width for image batches — TODO confirm)
    """
    # Fix: the default used to be a shared mutable list ([-3, -2, -1]);
    # use a None sentinel and build a fresh list per call instead.
    if axis is None:
        axis = [-3, -2, -1]
    return _x / T.maximum(1e-4, T.std(_x, axis=axis, keepdims=True))
def var_norm(_x, imgs=True, axis=None):
    """Standardize ``_x`` to zero mean and unit standard deviation.

    :param _x: symbolic tensor
    :param imgs: if True, normalize each sample over ``axis``; otherwise
        normalize globally over the whole tensor
    :param axis: axes reduced when ``imgs`` is True; defaults to the last
        three (assumed channel/height/width — TODO confirm)
    """
    # Fix: replace the shared mutable default list with a None sentinel.
    if axis is None:
        axis = [-3, -2, -1]
    if imgs:
        mean = T.mean(_x, axis=axis, keepdims=True)
        # Floor the std at 1e-4 to avoid division by (near-)zero.
        std = T.maximum(1e-4, T.std(_x, axis=axis, keepdims=True))
        return (_x - mean) / std
    return (_x - T.mean(_x)) / T.maximum(1e-4, T.std(_x))
def ReLU(x):
    """Rectified linear unit: clamp negative entries of ``x`` to zero."""
    zero = 0.0
    return T.maximum(zero, x)
def rectify(X):
    """Elementwise ReLU activation: max(X, 0)."""
    floor = 0.
    return T.maximum(X, floor)
def run_training(monitor_filename=None,
                 random_seed=config.random_seed,
                 coeff_embed=config.coeff_embed):
    """Train a 'pi' or 'tempens' semi-supervised model end to end.

    Creates a result directory, loads the configured dataset, builds (or
    loads) the network, runs the epoch loop (training, validation, CSV
    export), and saves the final network.

    :param monitor_filename: optional JSON file rewritten every epoch with
        current loss and epoch counters for an external progress monitor
    :param random_seed: numpy RNG seed (also embedded in the run name)
    :param coeff_embed: weight of the contrastive embedding loss term
    """
    # For multi run seeds, close current log files.
    for tap in [stdout_tap, stderr_tap]:
        if tap.file is not None:
            tap.file.close()
            tap.file = None
    # Sanity check network type.
    if config.network_type not in ['pi', 'tempens']:
        print("Unknown network type '%s'." % config.network_type)
        exit()
    np.random.seed(random_seed)
    # Create the result directory and basic run data.
    run_desc = config.run_desc + (
        '_%s%s_%04d_embed%.1f' %
        (config.dataset_str, config.num_labels_str, random_seed, coeff_embed))
    result_subdir = report.create_result_subdir(config.result_dir, run_desc)
    print("Saving results to", result_subdir)
    # Start dumping stdout and stderr into result directory.
    stdout_tap.set_file(open(os.path.join(result_subdir, 'stdout.txt'), 'wt'))
    stderr_tap.set_file(open(os.path.join(result_subdir, 'stderr.txt'), 'wt'))
    # Set window title if on Windows (best-effort; silently skipped elsewhere).
    try:
        import ctypes
        ctypes.windll.kernel32.SetConsoleTitleA(
            '%s - Gpu %d' %
            (os.path.split(result_subdir)[1], config.cuda_device_number))
    except:
        pass
    # Export run information.
    report.export_sources(os.path.join(result_subdir, 'src'))
    report.export_run_details(os.path.join(result_subdir, 'run.txt'))
    report.export_config(os.path.join(result_subdir, 'config.txt'))
    # Load the dataset.
    print("Loading dataset '%s'..." % config.dataset)
    if config.dataset == 'cifar-10':
        X_train, y_train, X_test, y_test = load_cifar_10()
    elif config.dataset == 'cifar-100':
        X_train, y_train, X_test, y_test = load_cifar_100()
    elif config.dataset == 'svhn':
        X_train, y_train, X_test, y_test = load_svhn()
    elif config.dataset == 'mnist':
        X_train, y_train, X_test, y_test = load_mnist_realval()
    else:
        print("Unknown dataset '%s'." % config.dataset)
        exit()
    # Calculate number of classes.
    num_classes = len(set(y_train))
    assert (set(y_train) == set(y_test) == set(range(num_classes))
            )  # Check that all labels are in range [0, num_classes-1]
    print("Found %d classes in training set, %d in test set." %
          (len(set(y_train)), len(set(y_test))))
    # Prepare dataset and print stats.
    X_train, y_train, mask_train, X_test, y_test = prepare_dataset(
        result_subdir, X_train, y_train, X_test, y_test, num_classes)
    print("Got %d training inputs, out of which %d are labeled." %
          (len(X_train), sum(mask_train)))
    print("Got %d test inputs." % len(X_test))
    print("Shapes:", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    # ----------------------------------------------------------------------------
    # Prepare to train.
    # ----------------------------------------------------------------------------
    print("Network type is '%s'." % config.network_type)

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    label_var = T.ivector('labels')
    learning_rate_var = T.scalar('learning_rate')
    adam_beta1_var = T.scalar('adam_beta1')
    input_vars = [input_var]
    # Scale the max unsupervised weight by the labeled fraction.
    scaled_unsup_weight_max = config.unsup_weight_max
    if config.num_labels != 'all':
        scaled_unsup_weight_max *= 1.0 * config.num_labels / X_train.shape[0]

    if config.network_type == 'pi':
        input_b_var = T.tensor4('inputs_b')
        mask_var = T.vector('mask')
        unsup_weight_var = T.scalar('unsup_weight')
        input_vars.append(input_b_var)
    elif config.network_type == 'tempens':
        mask_var = T.vector('mask')
        target_var = T.matrix('targets')
        unsup_weight_var = T.scalar('unsup_weight')

    # Load/create the network.
    if config.load_network_filename is not None:
        net, net_em, input_var = load_network(config.load_network_filename)
        input_vars = [input_var]
        if config.network_type == 'pi':
            input_vars.append(input_b_var)
    else:
        print("Building network and compiling functions...")
        net, net_em = build_network(input_var, X_train.shape[1], num_classes)

    # Export topology report.
    with open(os.path.join(result_subdir, 'network-topology.txt'),
              'wt') as fout:
        for line in report.generate_network_topology_info(net):
            print(line)
            fout.write(line + '\n')

    # Initialization updates and function.
    ll.get_output(net, init=True)
    init_updates = [
        u for l in ll.get_all_layers(net)
        for u in getattr(l, 'init_updates', [])
    ]
    init_fn = theano.function(input_vars, [],
                              updates=init_updates,
                              on_unused_input='ignore')

    # Get training predictions, BN updates.
    train_prediction, train_embedding = ll.get_output([net, net_em])
    if config.network_type == 'pi':
        train_prediction_b = ll.get_output(
            net, inputs=input_b_var)  # Second branch.
    bn_updates = [
        u for l in ll.get_all_layers(net) for u in getattr(l, 'bn_updates', [])
    ]

    # Training loss: supervised cross-entropy on labeled samples only
    # (mask_var zeroes out unlabeled ones).
    train_loss = T.mean(categorical_crossentropy(train_prediction, label_var) *
                        mask_var,
                        dtype=theano.config.floatX,
                        acc_dtype=theano.config.floatX)
    if config.network_type == 'pi':
        if config.consis:
            # Consistency term between the two augmented branches.
            train_loss += unsup_weight_var * T.mean(
                squared_error(train_prediction, train_prediction_b),
                dtype=theano.config.floatX,
                acc_dtype=theano.config.floatX)
        target_hard = T.argmax(train_prediction_b, axis=1)
    elif config.network_type == 'tempens':
        if config.consis:
            # Consistency with the temporal-ensemble targets.
            train_loss += unsup_weight_var * T.mean(
                squared_error(train_prediction, target_var),
                dtype=theano.config.floatX,
                acc_dtype=theano.config.floatX)
        target_hard = T.argmax(target_var, axis=1)

    # Merged targets: true labels where available, hard pseudo-labels
    # elsewhere (mask_var is 1 for labeled samples).
    if config.merge is True:
        merged_tar = mask_var * \
            T.cast(label_var, dtype=theano.config.floatX) \
            + (1. - mask_var) * \
            T.cast(target_hard, dtype=theano.config.floatX)
    else:
        merged_tar = target_hard
    # Contrastive embedding loss: pairs are formed by splitting the
    # minibatch in half (assumes minibatch_size is even — TODO confirm).
    emb_eucd2 = T.mean(squared_error(
        train_embedding[:config.minibatch_size // 2],
        train_embedding[config.minibatch_size // 2:]),
                       axis=1)
    neighbor_var = T.eq(merged_tar[:config.minibatch_size // 2],
                        merged_tar[config.minibatch_size // 2:])
    emb_eucd = T.sqrt(emb_eucd2)
    margin = T.constant(config.margin,
                        dtype=theano.config.floatX,
                        name='margin')
    neighbor_var = T.cast(neighbor_var, dtype=theano.config.floatX)
    pos = neighbor_var * emb_eucd2
    neg = (1. - neighbor_var) * T.square(T.maximum(margin - emb_eucd, 0))
    emb_loss = T.mean(pos + neg)
    train_loss += unsup_weight_var * emb_loss * coeff_embed

    # Entropy minimization
    if config.coeff_entropy:
        train_loss += config.coeff_entropy * unsup_weight_var * T.mean(
            lasagne.objectives.categorical_crossentropy(
                train_prediction, train_prediction))

    # ADAM update expressions for training.
    params = ll.get_all_params(net, trainable=True)
    updates = robust_adam(train_loss,
                          params,
                          learning_rate=learning_rate_var,
                          beta1=adam_beta1_var,
                          beta2=config.adam_beta2,
                          epsilon=config.adam_epsilon).items()
    # EMA copies of every trainable parameter, used for evaluation.
    param_avg = [
        theano.shared(np.cast[theano.config.floatX](0. * p.get_value()))
        for p in params
    ]
    avg_updates = [(a, a + config.ema_decay * (p - a))
                   for p, a in zip(params, param_avg)]
    avg_givens = [(p, a) for p, a in zip(params, param_avg)]

    # Training function.
    if config.network_type == 'pi':
        train_fn = theano_utils.function([
            input_var, input_b_var, label_var, mask_var, learning_rate_var,
            adam_beta1_var, unsup_weight_var
        ], [train_loss],
                                         updates=updates + bn_updates +
                                         avg_updates,
                                         on_unused_input='warn')
    elif config.network_type == 'tempens':
        train_fn = theano_utils.function([
            input_var, label_var, mask_var, target_var, learning_rate_var,
            adam_beta1_var, unsup_weight_var
        ], [train_loss, train_prediction],
                                         updates=updates + bn_updates +
                                         avg_updates,
                                         on_unused_input='warn')

    # Validation prediction, loss, and accuracy.
    test_prediction = ll.get_output(net, deterministic=True)
    test_loss = T.mean(categorical_crossentropy(test_prediction, label_var),
                       dtype=theano.config.floatX,
                       acc_dtype=theano.config.floatX)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), label_var),
                      dtype=theano.config.floatX,
                      acc_dtype=theano.config.floatX)
    # Validation function (evaluated with the EMA parameters via givens).
    val_fn = theano_utils.function([input_var, label_var],
                                   [test_loss, test_acc],
                                   on_unused_input='warn',
                                   givens=avg_givens)

    # ----------------------------------------------------------------------------
    # Start training.
    # ----------------------------------------------------------------------------
    print("Starting training.")
    if config.max_unlabeled_per_epoch is not None:
        print("Limiting number of unlabeled inputs per epoch to %d." %
              config.max_unlabeled_per_epoch)

    training_csv = report.GenericCSV(
        os.path.join(result_subdir, 'training.csv'), 'Epoch', 'EpochTime',
        'TrainLoss', 'TestLoss', 'TestAccuracy', 'LearningRate')

    # Initial training variables for temporal ensembling.
    if config.network_type == 'tempens':
        ensemble_prediction = np.zeros((len(X_train), num_classes))
        training_targets = np.zeros((len(X_train), num_classes))

    # ----------------------------------------------------------------------------
    # Training loop.
    # ----------------------------------------------------------------------------
    for epoch in range(config.start_epoch, config.num_epochs):
        # Export network snapshot every 50 epochs.
        if (epoch % 50) == 0 and epoch != config.start_epoch:
            save_network(
                net,
                os.path.join(result_subdir,
                             'network-snapshot-%03d.pkl' % epoch))
        # Evaluate up/down ramps.
        rampup_value = rampup(epoch)
        rampdown_value = rampdown(epoch)
        # Initialize WN/MOBN layers with a properly augmented minibatch
        # (only on the very first epoch; a single batch is enough).
        if epoch == 0:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, np.zeros((len(X_train), )),
                    np.zeros((len(X_train), )), config.minibatch_size)
                for (n, indices, inputs_a, inputs_b, labels,
                     mask) in minibatches:
                    init_fn(inputs_a, inputs_b)
                    break
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, np.zeros((len(X_train), )),
                    np.zeros((len(X_train), )), np.zeros((len(X_train), )),
                    config.minibatch_size)
                for (n, indices, inputs, labels, mask,
                     targets) in minibatches:
                    init_fn(inputs)
                    break
        # Initialize epoch predictions for temporal ensembling.
        if config.network_type == 'tempens':
            epoch_predictions = np.zeros((len(X_train), num_classes))
            epoch_execmask = np.zeros(
                len(X_train))  # Which inputs were executed.
            training_targets = floatX(training_targets)
        # Training pass.
        start_time = time.time()
        train_err, train_n = 0., 0.
        # Scheduled hyper-parameters for this epoch.
        learning_rate = rampup_value * rampdown_value * config.learning_rate_max
        adam_beta1 = rampdown_value * config.adam_beta1 + (
            1.0 - rampdown_value) * config.rampdown_beta1_target
        unsup_weight = rampup_value * scaled_unsup_weight_max
        if epoch == config.start_epoch:
            unsup_weight = 0.0
        with thread_utils.ThreadPool(8) as thread_pool:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, y_train, mask_train, config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs_a, inputs_b, labels,
                     mask) in minibatches:
                    (e_train, ) = train_fn(inputs_a, inputs_b, labels, mask,
                                           floatX(learning_rate),
                                           floatX(adam_beta1),
                                           floatX(unsup_weight))
                    train_err += e_train * n
                    train_n += n
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, y_train, mask_train, training_targets,
                    config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs, labels, mask,
                     targets) in minibatches:
                    (e_train,
                     prediction) = train_fn(inputs, labels, mask, targets,
                                            floatX(learning_rate),
                                            floatX(adam_beta1),
                                            floatX(unsup_weight))
                    for i, j in enumerate(indices):
                        epoch_predictions[j] = prediction[
                            i]  # Gather epoch predictions.
                        epoch_execmask[j] = 1.0
                    train_err += e_train * n
                    train_n += n
        # Test pass.
        val_err, val_acc, val_n = 0., 0., 0.
        with thread_utils.ThreadPool(8) as thread_pool:
            minibatches = iterate_minibatches(X_test, y_test,
                                              config.minibatch_size)
            minibatches = thread_utils.run_iterator_concurrently(
                minibatches, thread_pool)
            for (n, inputs, labels) in minibatches:
                err, acc = val_fn(inputs, labels)
                val_err += err * n
                val_acc += acc * n
                val_n += n

        # Update the temporal ensemble and derive next epoch's targets.
        if config.network_type == 'tempens':
            if config.max_unlabeled_per_epoch is None:
                # Basic mode: EMA over predictions with startup bias
                # correction (division by 1 - decay^t).
                ensemble_prediction = (
                    config.prediction_decay * ensemble_prediction
                ) + (1.0 - config.prediction_decay) * epoch_predictions
                training_targets = ensemble_prediction / (
                    1.0 - config.prediction_decay**(
                        (epoch - config.start_epoch) + 1.0))
            else:
                # Sparse updates: only rows actually executed this epoch.
                epoch_execmask = epoch_execmask.reshape(-1, 1)
                ensemble_prediction = epoch_execmask * (
                    config.prediction_decay * ensemble_prediction +
                    (1.0 - config.prediction_decay) * epoch_predictions) + (
                        1.0 - epoch_execmask) * ensemble_prediction
                training_targets = ensemble_prediction / (
                    np.sum(ensemble_prediction, axis=1, keepdims=True) + 1e-8
                )  # Normalize

        # Export stats.
        training_csv.add_data(epoch,
                              time.time() - start_time, train_err / train_n,
                              val_err / val_n, val_acc / val_n * 100.0,
                              learning_rate)
        # Export progress monitor data.
        if monitor_filename is not None:
            with open(monitor_filename, 'wt') as f:
                json.dump(
                    {
                        "loss": 1.0 - val_acc / val_n,
                        "cur_epoch": (epoch + 1),
                        "max_epoch": config.num_epochs
                    }, f)
        # Print stats.
        print(
            "Epoch %3d of %3d took %6.3fs Loss %.7f, %.7f Acc=%5.2f LR=%.7f"
            % (epoch, config.num_epochs, time.time() - start_time,
               train_err / train_n, val_err / val_n, val_acc / val_n * 100.0,
               learning_rate))

    # ----------------------------------------------------------------------------
    # Save and exit.
    # ----------------------------------------------------------------------------
    training_csv.close()
    print("Saving the final network.")
    np.savez(os.path.join(result_subdir, 'network-final.npz'),
             *ll.get_all_param_values(net))
    save_network(net, os.path.join(result_subdir, 'network-final.pkl'))
    print("Done.")
def rectify(flatten_input_matrix):
    """Apply an elementwise ReLU to the flattened input matrix."""
    lower_bound = 0.
    return T.maximum(lower_bound, flatten_input_matrix)
def rectify(self, X):
    """Elementwise ReLU activation: max(X, 0)."""
    threshold = 0.
    return T.maximum(X, threshold)
def run_experiment(self, dataset, word_embedding, exp_name):
    """Train and evaluate the hierarchical CNN multi-instance model.

    Builds a word-level conv layer, a sentence-level conv layer on top,
    a sentence classifier with a dynamic top-k bag probability, then
    runs adadelta training with periodic evaluation/logging.

    :param dataset: ((train_x, train_y), (test_x, test_y)); documents are
        lists of fixed-length sentences of word ids — TODO confirm layout
    :param word_embedding: initial embedding matrix; row 0 is re-zeroed
        after every update (presumably the padding vector — verify)
    :param exp_name: tag used for log and result file names
    """
    # load parameters
    num_maps_word = self.options["num_maps_word"]
    drop_rate_word = self.options["drop_rate_word"]
    drop_rate_sentence = self.options["drop_rate_sentence"]
    word_window = self.options["word_window"]
    word_dim = self.options["word_dim"]
    k_max_word = self.options["k_max_word"]
    k_max_sentence = self.options["k_max_sentence"]
    batch_size = self.options["batch_size"]
    rho = self.options["rho"]
    epsilon = self.options["epsilon"]
    norm_lim = self.options["norm_lim"]
    max_iteration = self.options["max_iteration"]
    k_portion = self.options["k_portion"]
    num_maps_sentence = self.options["num_maps_sentence"]
    sentence_window = self.options["sentence_window"]
    sentence_len = len(dataset[0][0][0][0])
    sentence_num = len(dataset[0][0][0])
    # compute the sentence flags
    train_flags, test_flags = construct_sentence_flag(dataset)
    train_k_value = construct_dynamic_k(train_flags, k_portion)
    test_k_value = construct_dynamic_k(test_flags, k_portion)
    train_flags = theano.shared(value=np.asarray(
        train_flags, dtype=theano.config.floatX),
                                borrow=True)
    test_flags = theano.shared(value=np.asarray(
        test_flags, dtype=theano.config.floatX),
                               borrow=True)
    train_k = theano.shared(value=np.asarray(train_k_value,
                                             dtype=theano.config.floatX),
                            borrow=True)
    test_k = theano.shared(value=np.asarray(test_k_value,
                                            dtype=theano.config.floatX),
                           borrow=True)
    # define the parameters
    x = T.tensor3("x")
    y = T.ivector("y")
    sen_flags = T.matrix("flag")
    sen_k = T.matrix("sen_k")
    rng = np.random.RandomState(1234)
    words = theano.shared(value=np.asarray(word_embedding,
                                           dtype=theano.config.floatX),
                          name="embedding",
                          borrow=True)
    # Function that resets embedding row 0 to the zero vector; called
    # after every minibatch update below.
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(word_dim, dtype=theano.config.floatX)
    set_zero = theano.function(
        [zero_vector_tensor],
        updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))])
    # Look up embeddings: (docs*sents, 1, sent_len, word_dim).
    x_emb = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], words.shape[1]))
    dropout_x_emb = nn.dropout_from_layer(rng, x_emb, drop_rate_word)
    # compute convolution on words layer
    word_filter_shape = (num_maps_word, 1, word_window, word_dim)
    word_pool_size = (sentence_len - word_window + 1, 1)
    dropout_word_conv = nn.ConvPoolLayer(rng,
                                         input=dropout_x_emb,
                                         input_shape=None,
                                         filter_shape=word_filter_shape,
                                         pool_size=word_pool_size,
                                         activation=Tanh,
                                         k=k_max_word)
    sent_vec_dim = num_maps_word * k_max_word
    dropout_sent_vec = dropout_word_conv.output.reshape(
        (x.shape[0], 1, x.shape[1], sent_vec_dim))
    dropout_sent_vec = nn.dropout_from_layer(rng, dropout_sent_vec,
                                             drop_rate_sentence)
    # Inference-path conv sharing weights with the dropout path; inputs
    # are scaled by the keep probability instead of dropped.
    word_conv = nn.ConvPoolLayer(rng,
                                 input=dropout_x_emb * (1 - drop_rate_word),
                                 input_shape=None,
                                 filter_shape=word_filter_shape,
                                 pool_size=word_pool_size,
                                 activation=Tanh,
                                 k=k_max_word,
                                 W=dropout_word_conv.W,
                                 b=dropout_word_conv.b)
    sent_vec = word_conv.output.reshape(
        (x.shape[0], 1, x.shape[1], sent_vec_dim))
    sent_vec = sent_vec * (1 - drop_rate_sentence)
    # construct doc level context information
    sent_filter_shape = (num_maps_sentence, 1, sentence_window, sent_vec_dim)
    sent_pool_size = (sentence_num - sentence_window + 1, 1)
    dropout_sent_conv = nn.ConvPoolLayer(rng,
                                         input=dropout_sent_vec,
                                         input_shape=None,
                                         filter_shape=sent_filter_shape,
                                         pool_size=sent_pool_size,
                                         activation=Tanh,
                                         k=k_max_sentence)
    sent_conv = nn.ConvPoolLayer(rng,
                                 input=sent_vec,
                                 input_shape=None,
                                 filter_shape=sent_filter_shape,
                                 pool_size=sent_pool_size,
                                 activation=Tanh,
                                 k=k_max_sentence,
                                 W=dropout_sent_conv.W,
                                 b=dropout_sent_conv.b)
    # reshape the sentence vec
    dropout_sent_vec = dropout_sent_vec.reshape(
        (x.shape[0], x.shape[1], sent_vec_dim))
    sent_vec = sent_vec.reshape((x.shape[0], x.shape[1], sent_vec_dim))
    dropout_doc_vec = dropout_sent_conv.output.flatten(2)
    doc_vec = sent_conv.output.flatten(2)
    doc_vec_dim = num_maps_sentence * k_max_sentence
    # concatenate the doc vec along with the sentence vector
    con_dropout_sent_vec = T.concatenate([
        dropout_sent_vec,
        T.tile(dropout_doc_vec, [1, x.shape[1]]).reshape(
            (x.shape[0], x.shape[1], doc_vec_dim))
    ],
                                         axis=2).reshape(
                                             (x.shape[0] * x.shape[1],
                                              sent_vec_dim + doc_vec_dim))
    con_sent_vec = T.concatenate([
        sent_vec,
        T.tile(doc_vec, [1, x.shape[1]]).reshape(
            (x.shape[0], x.shape[1], doc_vec_dim))
    ],
                                 axis=2).reshape(
                                     (x.shape[0] * x.shape[1],
                                      sent_vec_dim + doc_vec_dim))
    # construct sentence level classifier
    n_in = sent_vec_dim + doc_vec_dim
    n_out = 1
    sen_W_values = np.zeros((n_in, n_out), dtype=theano.config.floatX)
    sen_W = theano.shared(value=sen_W_values, borrow=True, name="logis_W")
    sen_b_value = nn.as_floatX(0.0)
    sen_b = theano.shared(value=sen_b_value, borrow=True, name="logis_b")
    drop_sent_prob = T.nnet.sigmoid(
        T.dot(con_dropout_sent_vec, sen_W) + sen_b)
    sent_prob = T.nnet.sigmoid(T.dot(con_sent_vec, sen_W) + sen_b)
    # reform the sent vec to doc level
    drop_sent_prob = drop_sent_prob.reshape((x.shape[0], x.shape[1]))
    sent_prob = sent_prob.reshape((x.shape[0], x.shape[1]))
    # using the dynamic top k max probability as bag level probability
    # compute the dynamic K for each documents
    drop_doc_prob = T.sum(T.sort(drop_sent_prob, axis=1) * sen_k,
                          axis=1) / T.sum(sen_k, axis=1)
    doc_prob = T.sum(T.sort(sent_prob, axis=1) * sen_k, axis=1) / T.sum(
        sen_k, axis=1)
    # Clip probabilities away from 0/1 so the log-loss stays finite.
    drop_doc_prob = T.clip(drop_doc_prob, nn.as_floatX(1e-7),
                           nn.as_floatX(1 - 1e-7))
    doc_prob = T.clip(doc_prob, nn.as_floatX(1e-7), nn.as_floatX(1 - 1e-7))
    doc_preds = doc_prob > 0.5
    # instance level cost
    drop_sent_cost = T.sum(
        T.maximum(
            0.0,
            nn.as_floatX(.5) - T.sgn(
                drop_sent_prob.reshape((x.shape[0] * x.shape[1], n_out)) -
                nn.as_floatX(0.6)) * T.dot(con_dropout_sent_vec, sen_W)) *
        sen_flags.reshape(
            (x.shape[0] * x.shape[1], n_out))) / T.sum(sen_flags)
    # we need that the most positive instance at least 0.7 in pos bags
    # and at most 0.1 in neg bags
    # we want the number of positive instance should at least ...
    # and non of the positive instances in the negative bags
    # compute the number of positive instance
    positive_count = T.sum((drop_sent_prob * sen_flags) > 0.5, axis=1)
    pos_cost = T.maximum(nn.as_floatX(0.0),
                         positive_count - T.sum(sen_k, axis=1))
    neg_cost = T.maximum(nn.as_floatX(0.0), positive_count)
    penal_cost = T.mean(pos_cost * y + neg_cost * (nn.as_floatX(1.0) - y))
    # add the sentence similarity constrains (RBF kernel over pairwise
    # squared distances between sentence representations)
    sen_sen = T.dot(con_dropout_sent_vec, con_dropout_sent_vec.T)
    sen_sqr = T.sum(con_dropout_sent_vec**2, axis=1)
    sen_sqr_left = sen_sqr.dimshuffle(0, 'x')
    sen_sqr_right = sen_sqr.dimshuffle('x', 0)
    sen_sim_matrix = sen_sqr_left - 2 * sen_sen + sen_sqr_right
    sen_sim_matrix = T.exp(-1 * sen_sim_matrix)
    sen_sim_prob = drop_sent_prob.reshape(
        (x.shape[0] * x.shape[1], 1)) - drop_sent_prob.flatten()
    sen_sim_prob = sen_sim_prob**2
    sen_sim_flag = T.dot(sen_flags.reshape((x.shape[0] * x.shape[1], 1)),
                         sen_flags.reshape((1, x.shape[0] * x.shape[1])))
    sen_sim_cost = T.sum(
        sen_sim_matrix * sen_sim_prob * sen_sim_flag) / T.sum(sen_sim_flag)
    # bag level cost (asymmetric weights for pos/neg documents)
    drop_bag_cost = T.mean(-y * T.log(drop_doc_prob) * nn.as_floatX(0.6) -
                           (1 - y) * T.log(1 - drop_doc_prob) *
                           nn.as_floatX(0.4))
    drop_cost = drop_bag_cost * nn.as_floatX(0.6) + \
        drop_sent_cost * nn.as_floatX(0.1) + \
        penal_cost * nn.as_floatX(0.5) + \
        sen_sim_cost * nn.as_floatX(0.0001)
    # collect parameters
    self.params.append(words)
    self.params += dropout_word_conv.params
    self.params += dropout_sent_conv.params
    self.params.append(sen_W)
    self.params.append(sen_b)
    grad_updates = nn.sgd_updates_adadelta(self.params, drop_cost, rho,
                                           epsilon, norm_lim)
    # construct the dataset
    # random the
    train_x, train_y = nn.shared_dataset(dataset[0])
    test_x, test_y = nn.shared_dataset(dataset[1])
    test_cpu_y = dataset[1][1]
    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    # construt the model
    index = T.iscalar()
    train_func = theano.function(
        [index], [
            drop_cost, drop_bag_cost, drop_sent_cost, penal_cost,
            sen_sim_cost
        ],
        updates=grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            y: train_y[index * batch_size:(index + 1) * batch_size],
            sen_flags:
            train_flags[index * batch_size:(index + 1) * batch_size],
            sen_k: train_k[index * batch_size:(index + 1) * batch_size]
        })
    test_func = theano.function(
        [index],
        doc_preds,
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            sen_k: test_k[index * batch_size:(index + 1) * batch_size]
        })
    get_train_sent_prob = theano.function(
        [index],
        sent_prob,
        givens={x: train_x[index * batch_size:(index + 1) * batch_size]})
    get_test_sent_prob = theano.function(
        [index],
        sent_prob,
        givens={x: test_x[index * batch_size:(index + 1) * batch_size]})
    epoch = 0
    best_score = 0
    log_file = open("./log/%s.log" % exp_name, 'w')
    while epoch <= max_iteration:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        # One pass over shuffled minibatches.
        for mini_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(mini_index)
            costs.append(cost_epoch)
            # Keep embedding row 0 pinned at the zero vector.
            set_zero(zero_vec)
        total_train_cost, train_bag_cost, train_sent_cost, train_penal_cost, train_sim_cost = zip(
            *costs)
        print "Iteration %d, total_cost %f bag_cost %f sent_cost %f penal_cost %f sim cost %f\n" % (
            epoch, np.mean(total_train_cost), np.mean(train_bag_cost),
            np.mean(train_sent_cost), np.mean(train_penal_cost),
            np.mean(train_sim_cost))
        if epoch % 1 == 0:
            # Evaluate on the test set.
            test_preds = []
            for i in xrange(n_test_batches):
                test_y_pred = test_func(i)
                test_preds.append(test_y_pred)
            test_preds = np.concatenate(test_preds)
            test_score = 1 - np.mean(np.not_equal(test_cpu_y, test_preds))
            precision, recall, beta, support = precision_recall_fscore_support(
                test_cpu_y, test_preds, pos_label=1)
            # Dump sentence probabilities on a new best positive-F1 (and
            # periodically every 5 epochs regardless).
            if beta[1] > best_score or epoch % 5 == 0:
                best_score = beta[1]
                # save the sentence vectors
                train_sens = [
                    get_train_sent_prob(i) for i in range(n_train_batches)
                ]
                test_sens = [
                    get_test_sent_prob(i) for i in range(n_test_batches)
                ]
                train_sens = np.concatenate(train_sens, axis=0)
                test_sens = np.concatenate(test_sens, axis=0)
                out_train_sent_file = "./results/%s_train_sent_%d.vec" % (
                    exp_name, epoch)
                out_test_sent_file = "./results/%s_test_sent_%d.vec" % (
                    exp_name, epoch)
                with open(out_test_sent_file,
                          'w') as test_f, open(out_train_sent_file,
                                               'w') as train_f:
                    cPickle.dump(train_sens, train_f)
                    cPickle.dump(test_sens, test_f)
                print "Get best performace at %d iteration %f" % (
                    epoch, test_score)
                log_file.write(
                    "Get best performance at %d iteration %f \n" %
                    (epoch, test_score))
            end_time = timeit.default_timer()
            print "Iteration %d , precision, recall, f1" % epoch, precision, recall, beta
            log_file.write(
                "Iteration %d, neg precision %f, pos precision %f, neg recall %f pos recall %f , neg f1 %f, pos f1 %f, total_cost %f bag_cost %f sent_cost %f penal_cost %f\n"
                % (epoch, precision[0], precision[1], recall[0], recall[1],
                   beta[0], beta[1], np.mean(total_train_cost),
                   np.mean(train_bag_cost), np.mean(train_sent_cost),
                   np.mean(train_penal_cost)))
            print "Using time %f m" % ((end_time - start_time) / 60.)
            log_file.write("Uing time %f m\n" %
                           ((end_time - start_time) / 60.))
        end_time = timeit.default_timer()
        print "Iteration %d Using time %f m" % (epoch, (end_time -
                                                        start_time) / 60.)
        log_file.write("Uing time %f m\n" % ((end_time - start_time) / 60.))
        log_file.flush()
    log_file.close()
def adamax(loss_or_grads=None,
           params=None,
           learning_rate=0.002,
           beta1=0.9,
           beta2=0.999,
           epsilon=1e-8):
    """Adamax updates

    Adamax updates implemented as in [1]_. This is a variant of the Adam
    algorithm based on the infinity norm.

    Parameters
    ----------
    loss_or_grads: symbolic expression or list of expressions
        A scalar loss expression, or a list of gradient expressions
    params: list of shared variables
        The variables to generate update expressions for
    learning_rate: float
        Learning rate
    beta1: float
        Exponential decay rate for the first moment estimates.
    beta2: float
        Exponential decay rate for the weighted infinity norm estimates.
    epsilon: float
        Constant for numerical stability.

    Returns
    -------
    OrderedDict
        A dictionary mapping each parameter to its update expression

    Notes
    -----
    Optimizer can be called without both loss_or_grads and params
    in that case partial function is returned

    References
    ----------
    .. [1] Kingma, Diederik, and Jimmy Ba (2014):
           Adam: A Method for Stochastic Optimization.
           arXiv preprint arXiv:1412.6980.

    Examples
    --------
    >>> a = theano.shared(1.)
    >>> b = a*2
    >>> updates = adamax(b, [a], learning_rate=.01)
    >>> isinstance(updates, dict)
    True
    >>> optimizer = adamax(learning_rate=.01)
    >>> callable(optimizer)
    True
    >>> updates = optimizer(b, [a])
    >>> isinstance(updates, dict)
    True
    """
    # Called with neither argument: return a preconfigured partial so the
    # optimizer can be applied to (loss, params) later.
    if loss_or_grads is None and params is None:
        return partial(adamax, **_get_call_kwargs(locals()))
    elif loss_or_grads is None or params is None:
        raise ValueError(
            "Please provide both `loss_or_grads` and `params` to get updates")

    all_grads = get_or_compute_grads(loss_or_grads, params)
    # Shared step counter; incremented once per update via updates[t_prev].
    t_prev = theano.shared(pm.theanof.floatX(0.0))
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = tt.constant(1)

    t = t_prev + 1
    # Bias-corrected step size for the first-moment estimate
    # (Algorithm 2 in [1]).
    a_t = learning_rate / (one - beta1**t)

    for param, g_t in zip(params, all_grads):
        value = param.get_value(borrow=True)
        # Per-parameter state: first moment m and infinity-norm estimate u.
        m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)
        u_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                               broadcastable=param.broadcastable)

        m_t = beta1 * m_prev + (one - beta1) * g_t
        # Exponentially weighted infinity norm of past gradients.
        u_t = tt.maximum(beta2 * u_prev, abs(g_t))
        step = a_t * m_t / (u_t + epsilon)

        updates[m_prev] = m_t
        updates[u_prev] = u_t
        updates[param] = param - step

    updates[t_prev] = t
    return updates
def relu(self, X):
    """ReLU activation: elementwise max(X, 0)."""
    zero = 0
    return T.maximum(zero, X)
def __init__(self, numpy_rng=None, theano_rng=None, n_h=99, n_s=99, n_v=100,
             init_from=None, sparse_hmask=None, neg_sample_steps=1,
             lr_spec=None, lr_timestamp=None, lr_mults={}, iscales={},
             clip_min={}, clip_max={}, truncation_bound={}, l1={}, l2={},
             sp_weight={}, sp_targ={}, batch_size=13, compile=True,
             debug=False, seed=1241234, my_save_path=None, save_at=None,
             save_every=None, flags={}, max_updates=5e5):
    """
    :param n_h: number of h-hidden units
    :param n_v: number of visible units
    :param iscales: optional dictionary containing initialization scale for each parameter
    :param neg_sample_steps: number of sampling updates to perform in negative phase.
    :param l1: hyper-parameter controlling amount of L1 regularization
    :param l2: hyper-parameter controlling amount of L2 regularization
    :param batch_size: size of positive and negative phase minibatch
    :param compile: compile sampling and learning functions
    :param seed: seed used to initialize numpy and theano RNGs.
    """
    Model.__init__(self)
    Block.__init__(self)
    assert lr_spec is not None
    # 'h' sparsity settings are mandatory.
    for k in ['h']:
        assert k in sp_weight.keys()
    for k in ['h']:
        assert k in sp_targ.keys()
    self.validate_flags(flags)

    self.jobman_channel = None
    self.jobman_state = {}
    self.register_names_to_del(['jobman_channel'])

    # BUGFIX: these parameters default to mutable {} and were previously
    # cast to floatX *in place*, mutating the shared default object across
    # instances (and silently modifying the caller's dict). Copy first so
    # the casts below only touch private copies.
    l1 = dict(l1)
    l2 = dict(l2)
    sp_weight = dict(sp_weight)
    sp_targ = dict(sp_targ)
    clip_min = dict(clip_min)
    clip_max = dict(clip_max)

    ### make sure all parameters are floatX ###
    for (k, v) in l1.iteritems():
        l1[k] = npy_floatX(v)
    for (k, v) in l2.iteritems():
        l2[k] = npy_floatX(v)
    for (k, v) in sp_weight.iteritems():
        sp_weight[k] = npy_floatX(v)
    for (k, v) in sp_targ.iteritems():
        sp_targ[k] = npy_floatX(v)
    for (k, v) in clip_min.iteritems():
        clip_min[k] = npy_floatX(v)
    for (k, v) in clip_max.iteritems():
        clip_max[k] = npy_floatX(v)

    # dump initialization parameters to object (note: also picks up the
    # loop temporaries k and v, as in the original code)
    for (k, v) in locals().iteritems():
        if k != 'self':
            setattr(self, k, v)

    # allocate random number generators
    self.rng = numpy.random.RandomState(
        seed) if numpy_rng is None else numpy_rng
    self.theano_rng = RandomStreams(self.rng.randint(
        2**30)) if theano_rng is None else theano_rng

    ############### ALLOCATE PARAMETERS #################
    # allocate symbolic variable for input
    self.input = T.matrix('input')
    self.init_parameters()
    self.init_chains()

    # learning rate, with deferred 1./t annealing
    self.iter = sharedX(0.0, name='iter')

    if lr_spec['type'] == 'anneal':
        num = lr_spec['init'] * lr_spec['start']
        denum = T.maximum(lr_spec['start'], lr_spec['slope'] * self.iter)
        self.lr = T.maximum(lr_spec['floor'], num / denum)
    elif lr_spec['type'] == 'linear':
        lr_start = npy_floatX(lr_spec['start'])
        lr_end = npy_floatX(lr_spec['end'])
        self.lr = lr_start + self.iter * (lr_end - lr_start) / npy_floatX(
            self.max_updates)
    else:
        raise ValueError('Incorrect value for lr_spec[type]')

    # configure input-space (new pylearn2 feature?)
    self.input_space = VectorSpace(n_v)
    self.output_space = VectorSpace(n_h)

    self.batches_seen = 0  # incremented on every batch
    self.examples_seen = 0  # incremented on every training example
    self.force_batch_size = batch_size  # force minibatch size
    self.error_record = []

    if compile:
        self.do_theano()

    #### load layer 1 parameters from file ####
    if init_from:
        self.load_params(init_from)
def dprime_loss(outXPos, outYPos, outXNeg, outYNeg, margin=5, alpha=0.5):
    """d'-style embedding loss: penalize the spread (std) of positive and
    negative pair distances, pull positive pairs together (their mean
    distance), and push negative pairs apart up to `margin`.

    Note: `alpha` is accepted for signature compatibility but not used here.
    """
    pos_dist = norm2(outXPos - outYPos)
    neg_dist = norm2(outXNeg - outYNeg)
    spread = pos_dist.std() + neg_dist.std()
    pull = pos_dist.mean()
    push = T.maximum(0.0, margin - neg_dist.mean())
    return spread + pull + push
def siamese_loss(outXPos, outYPos, outXNeg, outYNeg, margin=5, alpha=0.5):
    """Siamese contrastive loss: squared distance for positive pairs plus a
    squared hinge (margin minus distance, clipped at 0) for negative pairs,
    blended by `alpha`, averaged over the batch.
    """
    similar_term = norm2Sqr(outXPos - outYPos)
    hinge = T.maximum(0.0, margin - norm2(outXNeg - outYNeg))
    dissimilar_term = T.sqr(hinge)
    per_pair = (1 - alpha) * similar_term + alpha * dissimilar_term
    return per_pair.mean()
def run(binary=False, noise=None, nalpha=0, result_path=None):
    """Train a CNN on CIFAR-10, optionally with binarized weights/activations.

    Builds a conv net (32C3-64C3-P2 / 128C3-256C3-P2 / 512FP-10FP) with
    batch-norm and dropout, trains it with a squared hinge loss and ADAM
    (with binary_net's clipping/scaling when `binary` is set), and logs
    every hyper-parameter both to stdout and to ``result_path + "params.txt"``.

    :param binary: if True, use binary_net's binarized layers and the
        binary_tanh_unit activation; otherwise ordinary tanh.
    :param noise: label-noise setting forwarded to CifarReader.get_train_data.
    :param nalpha: noise alpha forwarded to CifarReader.get_train_data.
    :param result_path: prefix for the params log and training output
        (concatenated directly, so it should end with a path separator --
        TODO confirm against callers).
    """
    # BN parameters
    batch_size = 128
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR (geometric decay from LR_start down to LR_fin over num_epochs)
    LR_start = 0.005
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might good for the BN moving average...

    train_set_size = 40000
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))
    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    print('Loading CIFAR-10 dataset...')

    cifar = CifarReader("./data/cifar-10-batches-py/")
    train_X, train_y = cifar.get_train_data(n_samples=train_set_size,
                                            noise=noise,
                                            alpha=nalpha)
    valid_X, valid_y = cifar.get_validation_data()
    test_X, test_y = cifar.get_test_data()
    print("train_set_size = " + str(train_y.shape[0]))
    print("validation_set_size = " + str(valid_y.shape[0]))
    print("test_set_size = " + str(test_y.shape[0]))

    # Log output: append every hyper-parameter to params.txt for this run
    with open(result_path + "params.txt", "a+") as l:
        print("batch_size = " + str(batch_size), file=l)
        print("alpha = " + str(alpha), file=l)
        print("epsilon = " + str(epsilon), file=l)
        print("num_epochs = " + str(num_epochs), file=l)
        print("dropout_in = " + str(dropout_in), file=l)
        print("dropout_hidden = " + str(dropout_hidden), file=l)
        if binary:
            print("activation = binary_net.binary_tanh_unit", file=l)
        else:
            print("activation = lasagne.nonlinearities.tanh", file=l)
        print("binary = " + str(binary), file=l)
        print("stochastic = " + str(stochastic), file=l)
        print("H = " + str(H), file=l)
        print("W_LR_scale = " + str(W_LR_scale), file=l)
        print("LR_start = " + str(LR_start), file=l)
        print("LR_fin = " + str(LR_fin), file=l)
        print("LR_decay = " + str(LR_decay), file=l)
        print("shuffle_parts = " + str(shuffle_parts), file=l)
        print("noise = " + str(noise), file=l)
        print("nalpha = " + str(nalpha), file=l)
        print("train_set_size = " + str(train_y.shape[0]), file=l)
        print("validation_set_size = " + str(valid_y.shape[0]), file=l)
        print("test_set_size = " + str(test_y.shape[0]), file=l)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = np.reshape(np.subtract(np.multiply(2. / 255., train_X), 1.),
                         (-1, 3, 32, 32))
    valid_X = np.reshape(np.subtract(np.multiply(2. / 255., valid_X), 1.),
                         (-1, 3, 32, 32))
    test_X = np.reshape(np.subtract(np.multiply(2. / 255., test_X), 1.),
                        (-1, 3, 32, 32))

    # flatten targets
    train_y = np.hstack(train_y)
    valid_y = np.hstack(valid_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    valid_y = np.float32(np.eye(10)[valid_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss: remap one-hot {0,1} targets to {-1,+1}
    train_y = 2 * train_y - 1.
    valid_y = 2 * valid_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_in)

    # 32C3-64C3-P2
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=32,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=64,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 128C3-256C3-P2
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=128,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = binary_net.Conv2DLayer(cnn,
                                 binary=binary,
                                 stochastic=stochastic,
                                 H=H,
                                 W_LR_scale=W_LR_scale,
                                 num_filters=256,
                                 filter_size=(3, 3),
                                 pad=1,
                                 nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 512FP-10FP
    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=512)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)
    cnn = binary_net.DenseLayer(cnn,
                                binary=binary,
                                stochastic=stochastic,
                                H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity,
                                num_units=10)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(
        cnn, nonlinearity=lasagne.nonlinearities.softmax)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:
        # W updates: binary weights get ADAM on binary_net's straight-through
        # gradients, then clipping/scaling per binary_net's scheme
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads,
                                       params=W,
                                       learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)

        # other parameters updates (non-binary trainables use plain ADAM)
        params = lasagne.layers.get_all_params(cnn,
                                               trainable=True,
                                               binary=False)
        updates.update(
            lasagne.updates.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR))
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss,
                                       params=params,
                                       learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    binary_net.train(train_fn,
                     val_fn,
                     cnn,
                     batch_size,
                     LR_start,
                     LR_decay,
                     num_epochs,
                     train_X,
                     train_y,
                     valid_X,
                     valid_y,
                     test_X,
                     test_y,
                     shuffle_parts=shuffle_parts,
                     result_path=result_path)
def ReLU(z):
    """Rectified linear unit: element-wise max(0, z)."""
    rectified = T.maximum(0.0, z)
    return rectified


from theano.tensor.nnet import sigmoid
def ReLU(z):
    """Return z with all negative entries replaced by zero (rectifier)."""
    positive_part = T.maximum(0.0, z)
    return positive_part
def f(q_i, D_gt_id, tparams, is_train, trng, options):
    """Build the symbolic graph for iterative query reformulation.

    For each of prm.n_iterations rounds: embed the current query terms,
    score every candidate word with an actor MLP (per-word keep/drop
    probabilities), sample (training) or take the max (evaluation), run the
    external search engine on the selected words, and feed the retrieved
    documents into the next round. A critic MLP produces a baseline `bl`
    from a gradient-scaled copy of the same features.

    :param q_i: word-index matrix of the input queries; indices <= -2 are
        treated as padding (masked out), index -1 maps to the UNK row.
    :param D_gt_id: ground-truth document ids, passed through to search().
    :param tparams: dict of shared model parameters (W, UNK, Ad, bAd, Aq,
        C*/bC* critic layers, V*/bV* actor layers).
    :param is_train: symbolic flag (1. = sample actions, 0. = argmax).
    :param trng: theano RandomStreams used for multinomial sampling.
    :param options: search-engine options forwarded to Search().
    :return: list with one entry per iteration:
        [prob, ans, metrics, bl, D_m_r, D_id_].
    """
    # Use search engine again to compute the reward/metrics given a query.
    search = Search(options)

    # append the unknown vector for words whose index = -1.
    W_ = tensor.concatenate([tparams['W'], tparams['UNK']], axis=0)

    q_m = (q_i > -2).astype('float32')  # valid-word mask for the query
    #get embeddings for the queries
    q_a = W_[q_i.flatten()].reshape(
        (q_i.shape[0], q_i.shape[1], prm.dim_emb)) * q_m[:, :, None]

    if len(prm.filters_query) > 0:
        q_aa = conv_query(q_a, tparams)
    else:
        q_aa = q_a

    # masked mean of the query embeddings (denominator floored at 1)
    q_a_avg = q_a.sum(1) / tensor.maximum(1., q_m.sum(1, keepdims=True))

    out = []
    for n_iter in range(prm.n_iterations):
        if n_iter == 0 and prm.q_0_fixed_until >= prm.n_iterations:
            # First round with a fully fixed q_0: no actor/critic needed,
            # emit zero placeholders and let the q_0 branch below pick words.
            prob = tensor.zeros((q_a.shape[0], prm.max_words_input, 2))
            bl = tensor.zeros((q_a.shape[0], ))
            D_m_r = tensor.zeros((q_a.shape[0], prm.max_words_input))
        else:
            if n_iter > 0:
                # D_i_ comes from search() at the end of the previous
                # iteration: candidate words from retrieved documents.
                D_m_ = (D_i_ > -2).astype('float32')
                D_a_ = W_[D_i_.flatten()].reshape(
                    (D_i_.shape[0], D_i_.shape[1], D_i_.shape[2],
                     prm.dim_emb)) * D_m_[:, :, :, None]
            else:
                # Round 0: the candidates are the query words themselves.
                D_a_ = 1. * q_a[:, None, :, :]
                D_m_ = 1. * q_m[:, None, :]

            if len(prm.filters_cand) > 0:
                D_aa_ = conv_cand(D_a_, tparams, 0)
            else:
                D_aa_ = D_a_

            D_aa_ = tensor.dot(D_aa_, tparams['Ad']) + tparams['bAd']

            if n_iter > 0:
                if prm.q_0_fixed_until < 2:
                    # accumulate candidates from all rounds so far
                    D_a = tensor.concatenate([D_a, D_a_], axis=1)
                    D_aa = tensor.concatenate([D_aa, D_aa_], axis=1)
                    D_m = tensor.concatenate([D_m, D_m_], axis=1)
                else:
                    D_a = D_a_
                    D_aa = D_aa_
                    D_m = D_m_
            else:
                D_a = D_a_
                D_aa = D_aa_
                D_m = D_m_

            # flatten the document axis into a single candidate-word axis
            D_a_r = D_a.reshape((D_a.shape[0], -1, D_a.shape[3]))
            D_aa_r = D_aa.reshape((D_aa.shape[0], -1, D_aa.shape[3]))
            D_m_r = D_m.reshape((D_m.shape[0], -1))

            q_aa_avg = q_aa.sum(1) / tensor.maximum(1.,
                                                    q_m.sum(1, keepdims=True))
            q_aa_att = q_aa_avg[:, None, :]
            q_aa_att = tensor.dot(q_aa_att, tparams['Aq'])

            # joint candidate + query features for both actor and critic
            z = D_aa_r + q_aa_att

            # estimate reward based on the query.
            # grad_scale damps the critic's gradient into the shared features.
            bl = theano.gradient.grad_scale(z, 0.1)
            D_m_r_c = theano.gradient.disconnected_grad(D_m_r)
            bl = bl.sum(1) / tensor.maximum(1., D_m_r_c.sum(1))[:, None]

            # critic MLP (ReLU hidden layers, optional dropout)
            for i in range(len(prm.n_hidden_critic) + 1):
                if prm.dropout > 0:
                    bl = dropout_layer(bl, is_train, trng)
                bl = tensor.maximum(0., bl)
                bl = tensor.dot(bl,
                                tparams['C' + str(i)]) + tparams['bC' + str(i)]

            bl = tensor.tanh(bl)
            bl = bl.flatten()

            # actor MLP producing per-word logits
            for i in range(len(prm.n_hidden_actor) + 1):
                if prm.dropout > 0:
                    z = dropout_layer(z, is_train, trng)
                z = tensor.maximum(0., z)
                z = tensor.dot(z,
                               tparams['V' + str(i)]) + tparams['bV' + str(i)]

            prob = softmax_mask(z) * D_m_r[:, :, None]

            # if training, sample. Otherwise, pick maximum probability.
            s = trng.multinomial(n=1,
                                 pvals=prob.reshape((-1, 2)),
                                 dtype=prob.dtype)
            s = s.reshape((prob.shape[0], prob.shape[1], prob.shape[2]))

            #if frozen is enabled and this iteration is within its limit, pick maximum probability.
            if prm.frozen_until > 0:
                if n_iter < prm.frozen_until:
                    s = prob

            res = tensor.eq(is_train, 1.) * s + tensor.eq(is_train, 0.) * prob

            # final answer & valid words
            ans = res.argmax(2) * D_m_r

        if n_iter < prm.q_0_fixed_until:
            ones = tensor.ones((q_a.shape[0], prm.max_words_input))
            if n_iter > 0:
                # select everything from the original query in the first iteration.
                ans = tensor.concatenate([ones, ans], axis=1)
            else:
                ans = ones

        # run the (external) search; D_i_ feeds the next iteration
        metrics, D_i_, D_id_, D_gt_m_ = search(ans, D_gt_id, n_iter, is_train)

        out.append([prob, ans, metrics, bl, D_m_r, D_id_])

    return out
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=10,
                    hidden_size=10,
                    L2_weight=0.0001,
                    para_len_limit=400,
                    q_len_limit=40,
                    max_EM=0.217545454546):
    """Train and evaluate a bidirectional-GRU + attention tagger on SQuAD.

    Loads SQuAD train/dev data, builds paragraph and question BiGRU encoders,
    an attention interaction between them, and a per-word logistic regression
    that scores each paragraph token as answer/non-answer. Trains with
    AdaGrad, evaluates every 10 batches with exact-match and macro-F1, and
    stores parameters whenever exact-match exceeds `max_EM`.

    Python 2 code (print statements, integer division for batch counts).

    :param learning_rate: AdaGrad base learning rate.
    :param n_epochs: maximum number of passes over the training set.
    :param batch_size: minibatch size for both training and testing.
    :param emb_size: word-embedding dimensionality.
    :param hidden_size: GRU hidden-state dimensionality.
    :param L2_weight: L2 coefficient (L2_reg is built but not added to the
        cost below -- NOTE(review): appears unused in the final cost).
    :param para_len_limit: maximum paragraph length kept by the loaders.
    :param q_len_limit: maximum question length kept by the loaders.
    :param max_EM: exact-match threshold above which parameters are saved.
    """
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    # Load training data; all lists are parallel (one entry per example).
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    # Randomly initialised embedding table (+1 row, presumably for padding
    # or OOV -- TODO confirm against the loaders).
    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    # id2word = {y:x for x,y in overall_word2id.iteritems()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    # index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    norm_extraF = normalize_matrix(extraF)

    # Forward and backward GRU parameters for the paragraph encoder.
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    # Forward and backward GRU parameters for the question encoder.
    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features

    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras

    # Warm-start from the previously stored best checkpoint.
    load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate(
        [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    # #LSTM
    # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    # para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)

    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor = questions_model.output_tensor
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    # #LSTM for questions
    # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    # questions_reps_tensor=questions_model.output_tensor

    #use CNN for question modeling
    # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
    # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
    # Q_conv_para=[conv_W, conv_b]
    # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
    #         image_shape=(batch_size, 1, emb_size, q_len_limit),
    #         filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
    # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
    # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
    # masked_conv_output=conv_output*gru_mask
    # questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))

    # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
    # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
    # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
    # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
    # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
    # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
    # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    # Per-example attention: soft-align question states to paragraph states.
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix,
                                   q_matrix)  #(para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)  #(len, para_len)

    batch_q_reps, updates = theano.scan(
        fn=example_in_batch, outputs_info=None,
        sequences=[para_reps, questions_reps_tensor
                   ])  #batch_q_reps (batch, hidden, para_len)

    #attention distributions
    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    transformed_para_reps = T.maximum(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0)  #relu
    transformed_q_reps = T.maximum(
        T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = transformed_para_reps + transformed_q_reps

    # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
    # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
    # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b]
    # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)
    # accu_both=accumu_model.output_tensor.transpose((0,2,1))

    prior_att = T.concatenate([add_both, norm_extraF], axis=2)
    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)

    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    # Negative log-likelihood, summed over valid (unmasked) positions only.
    error = -T.sum(
        T.log(layer3.p_y_given_x)
        [valid_indices, labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    # Probability of the "answer" class per paragraph position.
    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    # def AttentionLayer(q_rep, ext_M):
    # theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    # f __name__ == '__main__':
    # prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    # return strength.transpose() #(1, #words)
    # distributions, updates = theano.scan(
    # AttentionLayer,
    # sequences=[questions_reps,extraF] )
    # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    # labels=debug_print(labels, 'labels')
    # label_mask=T.gt(labels,0.0)
    # neg_label_mask=T.lt(labels,0.0)
    # dis_masked=distributions*label_mask
    # remain_dis_masked=distributions*neg_label_mask
    # pos_error=((dis_masked-1)**2).mean()
    # neg_error=((remain_dis_masked-(-1))**2).mean()
    # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+ConvGRU_1.error#

    # AdaGrad accumulators, one per parameter.
    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        # print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i -
                        learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size  # Python-2 integer division
    # remain_train=train_size%batch_size
    # extra final batch covers the tail examples (overlaps the previous one)
    train_batch_start = list(
        numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    # remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            # haha=para_mask[para_id:para_id+batch_size]
            # print haha
            # for i in range(batch_size):
            # print len(haha[i])
            cost_i += train_model(
                np.asarray([
                    train_para_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_Q_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_label_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_para_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_feature_matrixlist[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    # print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
                    # q_size=len(distribution_matrix)
                    q_amount += batch_size
                    # print q_size
                    # print test_para_word_list

                    Q_list_inword = test_Q_list_word[
                        test_para_id:test_para_id + batch_size]
                    for q in range(batch_size):  #for each question
                        # if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        # else:
                        # ss=len(distribution_matrix[q])
                        # combine_list=[]
                        # for ii in range(ss):
                        # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        # print combine_list
                        # exit(0)
                        # print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q],
                            distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set = para_gold_ansset_list[q]
                        # print test_para_wordlist_list[q]
                        # print Q_list_inword[q]
                        # print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match += 1
                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        F1_match += F1

                F1_acc = F1_match / q_amount
                exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    # checkpoint only when the new best beats the threshold
                    if max_exact_acc > max_EM:
                        store_model_to_file(
                            rootPath + 'Best_Paras_conv_' + str(max_exact_acc),
                            params)
                        print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def _magnitude(x, axis=-1):
    """Euclidean norm of `x` along `axis`, with the squared sum clamped to
    the smallest positive value of the dtype so the sqrt gradient stays
    finite at zero vectors.
    """
    squared_sum = T.sqr(x).sum(axis=axis)
    floor = numpy.finfo(x.dtype).tiny
    return T.sqrt(T.maximum(squared_sum, floor))
def spatial_gradient(prediction, target, l=0.1, m=2.):
    """Scale-invariant log-depth loss plus a first-order spatial-gradient loss.

    Combines a scale-invariant MSE in log space over valid pixels (pixels
    with target > 0) with an L2 penalty on the difference between predicted
    and target log-depth gradients in both image directions (Eigen-style
    depth objective).

    :param prediction: predicted depth map, (batch, H, W) or (batch, 1, H, W);
        assumed to already be in log space, since only `target` is
        log-transformed below -- TODO confirm against callers.
    :param target: ground-truth depth map; non-positive pixels are invalid.
    :param l: weight of the squared-mean term in the scale-invariant cost.
    :param m: NOTE(review): unused in this body -- kept for API compatibility.
    """
    # Flatten input to make calc easier
    pred = prediction
    pred_v = pred.flatten(2)
    target_v = target.flatten(2)
    # Compute mask of valid pixels (target > 0)
    mask = T.gt(target_v, 0.)
    # Compute n of valid pixels per sample
    n_valid = T.sum(mask, axis=1)
    # Apply mask and log transform (invalid pixels contribute 0 to d)
    m_pred = pred_v * mask
    m_t = T.switch(mask, T.log(target_v), 0.)
    d = m_pred - m_t
    # Define scale invariant cost (denominator floored at 1 to avoid /0)
    scale_invariant_cost = (T.sum(n_valid * T.sum(d**2, axis=1)) -
                            l*T.sum(T.sum(d, axis=1)**2)) / T.maximum(T.sum(n_valid**2), 1)
    # Add spatial gradient components from D. Eigen DNL
    # Squeeze in case of a singleton channel axis
    if pred.ndim == 4:
        pred = pred[:,0,:,:]
    if target.ndim == 4:
        target = target[:,0,:,:]
    # Mask in tensor form
    mask_tensor = T.gt(target,0.)
    # Project into log space
    target = T.switch(mask_tensor, T.log(target),0.)
    # Stepsize
    h = 1
    # Compute spatial gradients symbolically (forward differences, step h)
    p_di = (pred[:,h:,:] - pred[:,:-h,:]) * (1 / np.float32(h))
    p_dj = (pred[:,:,h:] - pred[:,:,:-h]) * (1 / np.float32(h))
    t_di = (target[:,h:,:] - target[:,:-h,:]) * (1 / np.float32(h))
    t_dj = (target[:,:,h:] - target[:,:,:-h]) * (1 / np.float32(h))
    # A gradient sample is valid only if both of its endpoints are valid
    m_di = T.and_(mask_tensor[:,h:,:], mask_tensor[:,:-h,:])
    m_dj = T.and_(mask_tensor[:,:,h:], mask_tensor[:,:,:-h])
    # Define spatial grad cost
    # NOTE(review): divides by T.sum(m_di)/T.sum(m_dj) without a floor --
    # would produce NaN on a batch with no valid gradient pairs; confirm
    # whether inputs guarantee at least one valid pair.
    grad_cost = T.sum(m_di * (p_di - t_di)**2) / T.sum(m_di) + T.sum(m_dj * (p_dj - t_dj)**2) / T.sum(m_dj)
    # Compute final expression
    return scale_invariant_cost + grad_cost
def scale_invariant_error(predictions, targets):
    """Scale-invariant error in log space.

    Pixels whose target value is <= 0 are treated as invalid and excluded
    via a mask; the denominator is clamped to 1 so an all-invalid batch
    does not divide by zero.

    :param predictions: Prediction tensor
    :param targets: Target tensor
    :return: theano expression
    """
    _lambda_ = 0.5
    # Work on (batch, pixels) views
    pred_flat = predictions.flatten(2)
    target_flat = targets.flatten(2)
    # Validity mask and per-image valid-pixel counts
    valid = T.gt(target_flat, 0)
    valid_count = T.sum(valid, axis=1)
    # Masked difference; target is log-transformed, prediction is not
    masked_pred = pred_flat * valid
    masked_log_target = T.switch(valid, T.log(target_flat), 0)
    diff = masked_pred - masked_log_target
    # Scale-invariant combination of squared error and squared sum
    square_term = T.sum(valid_count * T.sum(diff ** 2, axis=1))
    cross_term = _lambda_ * T.sum(T.sum(diff, axis=1) ** 2)
    denominator = T.maximum(T.sum(valid_count ** 2), 1)
    return (square_term - cross_term) / denominator
def relu(self, x):
    """Rectified linear unit: elementwise max(x, 0)."""
    clipped = T.maximum(x, 0)
    return clipped
def get_output_for(self, input, training=False, **kwargs):
    """Row-wise softmax of `input`.

    During training each row is first rescaled by ``self.temp`` divided by
    the row's value range (max - min), with the range floored at 0.1 to
    keep the scaling bounded.
    """
    if training:
        row_range = T.max(input, axis=1) - T.min(input, axis=1)
        row_range = row_range.dimshuffle(0, 'x')
        input = self.temp * input / T.maximum(row_range, 0.1)
    exponentiated = T.exp(input)
    return exponentiated / T.sum(exponentiated, axis=1).dimshuffle(0, 'x')
def rectifier(x):
    """Rectifier nonlinearity: elementwise max(0, x)."""
    rectified = tensor.maximum(0., x)
    return rectified
def out_shape(imgshape, ds, ignore_border=False, st=None, padding=(0, 0)):
    """Return the output shape of the pooling op for an input shape.

    Parameters
    ----------
    imgshape : tuple of integers or scalar Theano variables
        the shape of a tensor of images. The last two elements are
        interpreted as the number of rows, and the number of cols.
    ds : tuple of two ints
        downsample factor over rows and columns; the size of the
        pooling region
    st : tuple of two ints
        the stride size (distance between pooling regions); defaults
        to `ds` when None
    ignore_border : bool
        if ds doesn't divide imgshape, do we include an extra row/col
        of partial downsampling (False) or ignore it (True)
    padding : tuple of two ints
        (pad_h, pad_w) zero-padding added on both sides of each of the
        last two axes

    Returns
    -------
    list :
        same length as `imgshape`, with the last two elements reduced
        per the downsampling & ignore_border flags

    Raises
    ------
    TypeError
        if `imgshape` has fewer than two elements
    """
    if len(imgshape) < 2:
        raise TypeError('imgshape must have at least two elements '
                        '(rows, cols)')
    if st is None:
        st = ds
    rows, cols = imgshape[-2:]
    rows += padding[0] * 2
    cols += padding[1] * 2

    def _pooled_dim(size, pool, stride):
        # Output extent of a single spatial dimension. `size` may be a
        # plain int or a symbolic Theano scalar; the symbolic branches
        # mirror the Python ones with tensor ops.
        if ignore_border:
            full = (size - pool) // stride + 1
            if isinstance(size, theano.Variable):
                return tensor.maximum(full, 0)
            return numpy.maximum(full, 0)
        if isinstance(size, theano.Variable):
            return tensor.switch(
                tensor.ge(stride, pool),
                (size - 1) // stride + 1,
                tensor.maximum(0, (size - 1 - pool) // stride + 1) + 1)
        if stride >= pool:
            return (size - 1) // stride + 1
        return max(0, (size - 1 - pool) // stride + 1) + 1

    out_rows = _pooled_dim(rows, ds[0], st[0])
    out_cols = _pooled_dim(cols, ds[1], st[1])
    return list(imgshape[:-2]) + [out_rows, out_cols]
def build_model(new_model=True):
    """Build the policy-dropout network and compile its train/test functions.

    Side effects when `new_model` is True: creates a fresh experiment
    directory named by a random UUID, chdir()s into it, and writes a copy
    of this source file (code.py) plus the hyperparameters (params.txt).

    :param new_model: when True, set up a new experiment directory as above
    :return: (model, learn, test) where `learn` is a theano function
        (x, y, lr) -> [cost, error] with parameter updates, and `test`
        is the same without updates
    """
    # Hyperparameters. locals() is captured right after these bindings,
    # so exactly the names defined up to that point land in params.txt —
    # do not move the locals() call.
    momentum_epsilon = 0.9
    block_size = 64
    nblocks = [10, 10]
    rate = [.16, .16]
    L2reg = 0.001
    is_uniform_policy = True
    lambda_b = [40, 20]
    lambda_v = [20, 20]
    learning_rates = [0.01, 0.5]
    print locals()
    hyperparams = locals()
    if new_model:
        # New experiment: unique id, snapshot of this file, param dump.
        expid = str(uuid.uuid4())
        import os
        import os.path
        # NOTE(review): Py2 file() handles are not closed for the reads/
        # writes below (only params.txt is); relies on GC to flush.
        code = file(os.path.abspath(__file__), 'r').read()
        os.mkdir(expid)
        os.chdir(expid)
        file('code.py', 'w').write(code)
        print expid
        f = file("params.txt", 'w')
        for i in hyperparams:
            f.write("%s:%s\n" % (i, str(hyperparams[i])))
        f.close()
    # Parameter lists populated via shared.bind as layers are constructed.
    params = []
    reinforce_params = []
    shared.bind(reinforce_params, "reinforce")
    shared.bind(params)
    rect = lambda x: T.maximum(0, x)
    act = T.tanh
    # Two policy-dropout hidden layers followed by a sparse softmax output;
    # input size 32*32*3 suggests CIFAR-style images — TODO confirm.
    model = StackModel([
        PolicyDropoutLayer(32 * 32 * 3, block_size * nblocks[0], block_size,
                           act, rate[0]),
        PolicyDropoutLayer(block_size * nblocks[0], block_size * nblocks[1],
                           block_size, act, rate[1]),
        InputSparseHiddenLayer(block_size * nblocks[1], 10, T.nnet.softmax,
                               block_size=block_size)
    ])
    x = T.matrix()
    y = T.ivector()
    lr = T.scalar()
    y_hat, = model(x)
    loss = T.nnet.categorical_crossentropy(y_hat, y)
    cost = T.sum(loss)
    l2 = lambda x: sum([T.sum(i**2) for i in x])
    updates = []
    all_probs = []
    # REINFORCE updates for the dropout policies. The loop is currently
    # disabled (empty iterable) — the commented range is the intended body.
    for i in []:  #range(len(model.layers)-1):
        probs = model.layers[i].probs
        sample_probs = model.layers[i].sample_probs
        layer_params = [model.layers[i].d.W, model.layers[i].d.b]
        all_probs.append(probs)
        # Push mean activation probability toward the target rate,
        # batch-wise and (more weakly) example-wise.
        l2_batchwise = lambda_b[i] * T.sum(
            abs(T.mean(probs, axis=0) - rate[i])**2)
        l2_exawise = lambda_b[i] * 0.001 * T.sum(
            abs(T.mean(probs, axis=1) - rate[i])**2)
        # Encourage variance in the policy (subtracted from the cost).
        batch_var = lambda_v[i] * T.sum(T.var(probs, axis=0))
        batch_var += lambda_v[i] * 0.1 * T.sum(T.var(probs, axis=1))
        regularising_cost = l2_batchwise + l2_exawise - batch_var + L2reg * l2(
            layer_params)
        updates += reinforce_no_baseline(
            layer_params, sample_probs, loss - loss.min(),
            # momentum_epsilon,
            lr * learning_rates[i], regularising_cost)
    error = T.sum(T.neq(y_hat.argmax(axis=1), y))
    nn_regularization = L2reg * l2(params)
    grads = T.grad(cost + nn_regularization, params)
    updates += gradient_descent(params, grads, lr)
    print params, reinforce_params
    learn = theano.function([x, y, lr], [cost, error],
                            updates=updates,
                            allow_input_downcast=True)
    test = theano.function([x, y], [cost, error], allow_input_downcast=True)
    return model, learn, test