def construct_network(context, characters, hidden, mult_hidden):
    print("Setting up memory...")
    X = T.bvector('X')
    Y = T.bvector('Y')
    alpha = T.cast(T.fscalar('alpha'), dtype=theano.config.floatX)
    lr = T.cast(T.fscalar('lr'), dtype=theano.config.floatX)

    print("Initialising weights...")
    W_char_hidden = U.create_shared(U.initial_weights(characters, hidden))
    f_char_hidden = U.create_shared(U.initial_weights(characters, mult_hidden))
    b_hidden = U.create_shared(U.initial_weights(hidden))
    Wf_hidden = U.create_shared(U.initial_weights(hidden, mult_hidden))
    fW_hidden = U.create_shared(U.initial_weights(mult_hidden, hidden))
    W_hidden_predict = U.create_shared(U.initial_weights(hidden, characters))
    b_predict = U.create_shared(U.initial_weights(characters))

    print("Constructing graph...")
    hidden = make_hidden(hidden, W_char_hidden[X], f_char_hidden[X],
                         Wf_hidden, fW_hidden, b_hidden)
    predictions = T.nnet.softmax(T.dot(hidden, W_hidden_predict) + b_predict)
    weights = [
        W_char_hidden, f_char_hidden, b_hidden,
        Wf_hidden, fW_hidden, W_hidden_predict, b_predict
    ]
    # Mean negative log-likelihood of the target character under the softmax.
    cost = -T.mean(T.log(predictions)[T.arange(Y.shape[0]), Y])
    gparams = T.grad(cost, weights)
    # Momentum-style update: `deltas` store the previous step for each weight.
    deltas = [U.create_shared(np.zeros(w.get_value().shape)) for w in weights]
    updates = [(param, param - (alpha * delta + gparam * lr))
               for param, delta, gparam in zip(weights, deltas, gparams)] + \
              [(delta, alpha * delta + gparam * lr)
               for delta, gparam in zip(deltas, gparams)]
    return X, Y, alpha, lr, updates, predictions, weights
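# A minimal, hypothetical sketch of how the variables returned by
# construct_network above might be compiled and driven. The sizes, the byte
# encoding of the text, and the use of alpha/lr as direct function inputs are
# assumptions for illustration (the latter relies on floatX == 'float32', so
# the T.cast above is a no-op and alpha/lr remain graph inputs).
import numpy as np
import theano

X, Y, alpha, lr, updates, predictions, weights = construct_network(
    context=1, characters=128, hidden=512, mult_hidden=512)
train = theano.function([X, Y, alpha, lr], [], updates=updates)

text = np.frombuffer(b"hello world", dtype=np.int8)  # int8 matches T.bvector
train(text[:-1], text[1:], 0.9, 0.001)  # predict each next character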
def __init__(self, ne, de, cs, nh, nc, L2_reg = 0.0, rng = np.random.RandomState()): self.nc = nc self.hiddenLayer = Layer(de*cs, nh, rng = rng) self.outputLayer = Layer(nh, nc) self.emb = theano.shared(rng.normal(loc = 0.0, scale = 0.01, size = (ne, de)).astype(theano.config.floatX)) A = rng.normal(loc = 0.0, scale = 0.01, size = (nc, nc)).astype(theano.config.floatX) self.A = theano.shared(value = A, name = 'A', borrow = True) self.params = self.hiddenLayer.params + self.outputLayer.params + [self.emb, self.A] self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A'] idxs = T.imatrix('idxs') x = self.emb[idxs].reshape((idxs.shape[0], de*cs)) y = T.bvector('y') ans = T.bvector('ans') INF = 1e9 result, updates1 = theano.scan(fn = self.one_step, sequences = x, outputs_info = [theano.shared(0.0), theano.shared(-INF), theano.shared(-INF), theano.shared(-INF), None, None, None, None]) self.decode = theano.function(inputs = [idxs], outputs = result, updates = updates1) score, updates2 = theano.scan(fn = self.two_step, sequences = [x, dict(input = y, taps = [-1, 0]), dict(input = ans, taps = [-1, 0])], outputs_info = theano.shared(0.0)) cost = score[-1] gradients = T.grad(cost, self.params) lr = T.scalar('lr') for p, g in zip(self.params, gradients): updates2[p] = p + lr * g self.fit = theano.function(inputs = [idxs, y, ans, lr], outputs = cost, updates = updates2) self.normalize = theano.function(inputs = [], updates = {self.emb: self.emb / T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})
def test_param_allow_downcast_int(self):
    a = tensor.wvector('a')  # int16
    b = tensor.bvector('b')  # int8
    c = tensor.bscalar('c')  # int8
    f = pfunc([Param(a, allow_downcast=True),
               Param(b, allow_downcast=False),
               Param(c, allow_downcast=None)],
              (a + b + c))

    # Both values are in range. Since they're not ndarrays (but lists),
    # they will be converted, and their value checked.
    assert numpy.all(f([3], [6], 1) == 10)

    # Values are in range, but a dtype too large has explicitly been given
    # For performance reasons, no check of the data is explicitly performed
    # (It might be OK to change this in the future.)
    self.assertRaises(TypeError, f, [3], numpy.array([6], dtype='int16'), 1)

    # Value too big for a, silently ignored
    assert numpy.all(f([2 ** 20], numpy.ones(1, dtype='int8'), 1) == 2)

    # Value too big for b, raises TypeError
    self.assertRaises(TypeError, f, [3], [312], 1)

    # Value too big for c, raises TypeError
    self.assertRaises(TypeError, f, [3], [6], 806)
def test_param_allow_downcast_int(self):
    a = tensor.wvector("a")  # int16
    b = tensor.bvector("b")  # int8
    c = tensor.bscalar("c")  # int8
    f = pfunc(
        [
            In(a, allow_downcast=True),
            In(b, allow_downcast=False),
            In(c, allow_downcast=None),
        ],
        (a + b + c),
    )

    # Both values are in range. Since they're not ndarrays (but lists),
    # they will be converted, and their value checked.
    assert np.all(f([3], [6], 1) == 10)

    # Values are in range, but a dtype too large has explicitly been given
    # For performance reasons, no check of the data is explicitly performed
    # (It might be OK to change this in the future.)
    with pytest.raises(TypeError):
        f([3], np.array([6], dtype="int16"), 1)

    # Value too big for a, silently ignored
    assert np.all(f([2**20], np.ones(1, dtype="int8"), 1) == 2)

    # Value too big for b, raises TypeError
    with pytest.raises(TypeError):
        f([3], [312], 1)

    # Value too big for c, raises TypeError
    with pytest.raises(TypeError):
        f([3], [6], 806)
def test_allow_input_downcast_int(self):
    a = tensor.wvector("a")  # int16
    b = tensor.bvector("b")  # int8
    c = tensor.bscalar("c")  # int8

    f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
    # Value too big for a, b, or c, silently ignored
    assert f([2**20], [1], 0) == 1
    assert f([3], [312], 0) == 59
    assert f([3], [1], 806) == 42

    g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
    # All values are in range. Since they're not ndarrays (but lists
    # or scalars), they will be converted, and their value checked.
    assert np.all(g([3], [6], 0) == 9)

    # Values are in range, but a dtype too large has explicitly been given
    # For performance reasons, no check of the data is explicitly performed
    # (It might be OK to change this in the future.)
    with pytest.raises(TypeError):
        g([3], np.array([6], dtype="int16"), 0)

    # Value too big for b, raises TypeError
    with pytest.raises(TypeError):
        g([3], [312], 0)

    h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
    # Everything here should behave like with False
    assert np.all(h([3], [6], 0) == 9)
    with pytest.raises(TypeError):
        h([3], np.array([6], dtype="int16"), 0)
    with pytest.raises(TypeError):
        h([3], [312], 0)
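# Why the "silently ignored" asserts above come out to 1, 59 and 42: with
# allow_input_downcast=True, the out-of-range Python ints are simply cast to
# the variable's dtype, so they wrap modulo 2**8 (int8) or 2**16 (int16).
# A small numpy sketch of that arithmetic (illustration only, independent of
# the test fixtures above):
import numpy as np

assert np.array(2 ** 20).astype('int16') == 0   # a: 2**20 % 2**16 = 0 -> 0 + 1 + 0 = 1
assert np.array(312).astype('int8') == 56       # b: 312 % 256 = 56    -> 3 + 56 + 0 = 59
assert np.array(806).astype('int8') == 38       # c: 806 % 256 = 38    -> 3 + 1 + 38 = 42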
def build_loss(self, env_spec, policy):
    obs = env_spec.observation_space.new_tensor_variable('obs', extra_dims=1)
    next_obs = env_spec.observation_space.new_tensor_variable('next_obs', extra_dims=1)
    act = env_spec.action_space.new_tensor_variable('act', extra_dims=1)
    ret = T.vector('disc_n_return')
    term = T.bvector('terminal')
    if self.prioritized_replay:
        isw = T.vector('importance_sample_weights')

    if self.double_dqn:
        next_a = policy.actions_sym(next_obs)
        next_q = policy.target_q_at_a_sym(next_obs, next_a)
    else:
        next_q = policy.target_max_q_sym(next_obs)
    disc_next_q = (self.discount ** self.reward_horizon) * next_q
    y = ret + (1 - term) * disc_next_q
    q = policy.q_at_a_sym(obs, act)
    d = y - q
    losses = 0.5 * d ** 2
    if self.delta_clip is not None:
        # Huber loss:
        b = self.delta_clip * (abs(d) - self.delta_clip / 2)
        losses = T.switch(abs(d) <= self.delta_clip, losses, b)
    if self.prioritized_replay:
        losses = isw * losses
    loss = T.mean(losses)
    td_abs_errors = T.clip(abs(d), 0, self.delta_clip)

    input_list = [obs, next_obs, act, ret, term]
    if self.prioritized_replay:
        input_list.append(isw)
    return input_list, loss, td_abs_errors
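# How the int8 `terminal` vector and the delta clip behave in the loss above,
# sketched with plain numpy (all values here are hypothetical, for
# illustration only): terminal transitions keep y = return, and TD errors
# beyond delta_clip contribute a linear rather than quadratic penalty.
import numpy as np

ret = np.array([1.0, 0.5, 2.0])
term = np.array([0, 1, 0], dtype='int8')     # matches T.bvector('terminal')
disc_next_q = np.array([3.0, 4.0, 5.0])
q = np.array([2.0, 0.0, 9.0])
delta_clip = 1.0

y = ret + (1 - term) * disc_next_q           # bootstrap only where term == 0
d = y - q
quad = 0.5 * d ** 2
lin = delta_clip * (np.abs(d) - delta_clip / 2)
losses = np.where(np.abs(d) <= delta_clip, quad, lin)   # Huber-style switch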
def test_allow_input_downcast_int(self):
    a = tensor.wvector('a')  # int16
    b = tensor.bvector('b')  # int8
    c = tensor.bscalar('c')  # int8

    f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
    # Value too big for a, b, or c, silently ignored
    assert f([2 ** 20], [1], 0) == 1
    assert f([3], [312], 0) == 59
    assert f([3], [1], 806) == 42

    g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
    # All values are in range. Since they're not ndarrays (but lists
    # or scalars), they will be converted, and their value checked.
    assert numpy.all(g([3], [6], 0) == 9)

    # Values are in range, but a dtype too large has explicitly been given
    # For performance reasons, no check of the data is explicitly performed
    # (It might be OK to change this in the future.)
    self.assertRaises(TypeError, g, [3], numpy.array([6], dtype='int16'), 0)

    # Value too big for b, raises TypeError
    self.assertRaises(TypeError, g, [3], [312], 0)

    h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
    # Everything here should behave like with False
    assert numpy.all(h([3], [6], 0) == 9)
    self.assertRaises(TypeError, h, [3], numpy.array([6], dtype='int16'), 0)
    self.assertRaises(TypeError, h, [3], [312], 0)
def __init__(self, state_format, actions_number, gamma=0.99, learning_rate=0.00025, ddqn=False, **kwargs): self.inputs = dict() self.learning_rate = learning_rate architecture = kwargs self.loss_history = [] self.misc_state_included = (state_format["s_misc"] > 0) self.gamma = np.float64(gamma) self.inputs["S0"] = tensor.tensor4("S0") self.inputs["S1"] = tensor.tensor4("S1") self.inputs["A"] = tensor.ivector("Action") self.inputs["R"] = tensor.vector("Reward") self.inputs["Nonterminal"] = tensor.bvector("Nonterminal") if self.misc_state_included: self.inputs["S0_misc"] = tensor.matrix("S0_misc") self.inputs["S1_misc"] = tensor.matrix("S1_misc") self.misc_len = state_format["s_misc"] else: self.misc_len = None # save it for the evaluation reshape # TODO get rid of this? self.single_image_input_shape = (1, ) + tuple(state_format["s_img"]) architecture["img_input_shape"] = (None, ) + tuple( state_format["s_img"]) architecture["misc_len"] = self.misc_len architecture["output_size"] = actions_number if self.misc_state_included: self.network, input_layers, _ = self._initialize_network( img_input=self.inputs["S0"], misc_input=self.inputs["S0_misc"], **architecture) self.frozen_network, _, alternate_inputs = self._initialize_network( img_input=self.inputs["S1"], misc_input=self.inputs["S1_misc"], **architecture) else: self.network, input_layers, _ = self._initialize_network( img_input=self.inputs["S0"], **architecture) self.frozen_network, _, alternate_inputs = self._initialize_network( img_input=self.inputs["S1"], **architecture) self.alternate_input_mappings = {} for layer, input in zip(input_layers, alternate_inputs): self.alternate_input_mappings[layer] = input # print "Network initialized." self._compile(ddqn)
def __init__(self, param_dict): self.param_dict = param_dict self.training_batch_size = param_dict['training_batch_size'] nkerns = param_dict['nkerns'] recept_width = param_dict['recept_width'] pool_width = param_dict['pool_width'] stride = param_dict['stride'] dropout_prob = param_dict['dropout_prob'] weight_decay = param_dict['l2_reg'] activation = param_dict['activation'] weights_variance = param_dict['weights_variance'] n_channels = param_dict['n_channels'] n_timesteps = param_dict['n_timesteps'] n_fbins = param_dict['n_fbins'] global_pooling = param_dict['global_pooling'] rng = np.random.RandomState(23455) self.training_mode = T.iscalar('training_mode') self.x = T.tensor4('x') self.y = T.bvector('y') self.batch_size = theano.shared(self.training_batch_size) self.input = self.x.reshape((self.batch_size, 1, n_channels * n_fbins, n_timesteps)) self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride, self.training_mode, dropout_prob[0], activation, weights_variance, n_channels, n_timesteps, n_fbins, global_pooling) self.classifier = SoftmaxLayer(rng=rng, input=self.feature_extractor.output, n_in=nkerns[-1], training_mode=self.training_mode, dropout_prob=dropout_prob[-1]) self.weights = self.feature_extractor.weights + self.classifier.weights # ---------------------- BACKPROP self.cost = self.classifier.cross_entropy_cost(self.y) self.cost = self.classifier.cross_entropy_cost(self.y) L2_sqr = sum((weight ** 2).sum() for weight in self.weights[::2]) self.grads = T.grad(self.cost + weight_decay * L2_sqr, self.weights) self.updates = self.adadelta_updates(self.grads, self.weights) # self.updates = self.nesterov_momentum(self.grads, self.weights) # --------------------- FUNCTIONS self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)], outputs=self.cost, updates=self.updates) self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)], self.cost) self.test_model = theano.function([self.x, Param(self.training_mode, default=0)], self.classifier.p_y_given_x[:, 1])
def __init__(self, nkerns, recept_width, pool_width, dropout_prob, training_batch_size, activation, n_timesteps=1000, dim=18): if activation == 'tanh': activation_function = lambda x: T.tanh(x) elif activation == 'relu': activation_function = lambda x: T.maximum(0.0, x) else: raise ValueError('unknown activation function') self.training_batch_size = training_batch_size rng = np.random.RandomState(23455) self.training_mode = T.iscalar('training_mode') self.x = T.matrix('x') self.y = T.bvector('y') self.batch_size = theano.shared(self.training_batch_size) # 18@1*1000 self.layer0_input = self.x.reshape((self.batch_size, dim, 1, n_timesteps)) # image 18 @ 1*1000 # c1: nkerns[0] @ 1* (1000 - recept_width[0] + 1) # s2: nkerns[0] @ 1 * c1 / pool_width[0] layer0 = ConvPoolLayer(rng, input=self.layer0_input, image_shape=(None, dim, 1, n_timesteps), filter_shape=(nkerns[0], dim, 1, recept_width[0]), poolsize=(1, pool_width[0]), activation_function=activation_function) # c3: nkerns[1] @ 1 * (s2 - recept_width[1] + 1) # s4 nkerns[1] @ 1 * c3 / pool_width input_layer1_width = (n_timesteps - recept_width[0] + 1) / pool_width[0] layer1 = ConvPoolLayer(rng, input=layer0.output, image_shape=(None, nkerns[0], 1, input_layer1_width), filter_shape=(nkerns[1], nkerns[0], 1, recept_width[1]), poolsize=(1, pool_width[1]), activation_function=activation_function) # s4:(batch_size, nkerns[1], 1, s4) -> flatten(2) -> (batch_size, nkerns[1]* 1 * s4) layer2_input = layer1.output.flatten(2) input_layer2_size = (input_layer1_width - recept_width[1] + 1) / pool_width[1] # c5: 120@1*1 self.layer2 = HiddenLayer(rng=rng, input=layer2_input, n_in=nkerns[1] * 1 * input_layer2_size, n_out=nkerns[2], training_mode=self.training_mode, dropout_prob=dropout_prob, activation_function=activation_function) # f6/output self.layer3 = LogisticRegressionLayer(input=self.layer2.output, n_in=nkerns[2], n_out=2, training_mode=self.training_mode, dropout_prob=dropout_prob) self.params = self.layer3.params + self.layer2.params + layer1.params + layer0.params
def __init__(self, rng, hiddenLayerList, n_out):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type hiddenLayerList: [HiddenLayer instances]
    :param hiddenLayerList: A list of hidden layers

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie
    """
    # connect hidden layers (no need to, they're already connected outside
    # when building them)
    self.hiddenLayers = hiddenLayerList
    # prevLy = hiddenLayerList[0]
    # prevLy.input = input
    # for ly in hiddenLayerList[1:]:
    #     ly.input = prevLy.output
    #     prevLy = ly

    # The logistic regression layer gets as input the hidden units
    # of the hidden layer
    self.logRegressionLayer = LogisticRegression(
        input=hiddenLayerList[-1].output,
        n_in=hiddenLayerList[-1].inOutDim[1],
        n_out=n_out)

    # symbolic variables for data
    self.X = self.hiddenLayers[0].input  # training data
    self.y = T.bvector('y')              # labels for training data

    # L1 norm ; one regularization option is to enforce L1 norm to be small
    self.L1 = abs(self.logRegressionLayer.W).sum()
    for ly in self.hiddenLayers:
        self.L1 += abs(ly.W).sum()

    # square of L2 norm ; one regularization option is to enforce
    # square of L2 norm to be small
    self.L2_sqr = (self.logRegressionLayer.W ** 2).sum()
    for ly in self.hiddenLayers:
        self.L2_sqr += (ly.W ** 2).sum()

    # negative log likelihood of the MLP is given by the negative log
    # likelihood of the output of the model, computed in the logistic
    # regression layer
    self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood

    # same holds for the function computing the number of errors
    self.errors = self.logRegressionLayer.errors

    # the parameters of the model are the parameters of the all layers
    self.params = self.logRegressionLayer.params
    for ly in self.hiddenLayers:
        self.params += ly.params
def __init__(self, nkerns, recept_width, pool_width, stride, dropout_prob, l2_reg, training_batch_size, activation, weights_variance, n_timesteps, dim, objective_function): self.training_batch_size = training_batch_size self.objective_function = objective_function rng = np.random.RandomState(23455) self.training_mode = T.iscalar('training_mode') self.x = T.matrix('x') self.y = T.bvector('y') self.batch_size = theano.shared(training_batch_size) self.input = self.x.reshape((self.batch_size, 1, dim, n_timesteps)) self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride, self.training_mode, dropout_prob[0], activation, weights_variance, n_timesteps, dim) self.classifier = LogisticRegressionLayer(rng=rng, input=self.feature_extractor.output, n_in=nkerns[-1], training_mode=self.training_mode, dropout_prob=dropout_prob[1]) self.params = self.feature_extractor.params + self.classifier.params # ---------------------- BACKPROP if self.objective_function == 'cross_entropy': self.cost = self.classifier.cross_entropy_cost(self.y) elif self.objective_function == 'auc': self.cost = self.classifier.auc_cost(self.y) else: raise ValueError('wrong objective function') L2_sqr = sum((param ** 2).sum() for param in self.params[::2]) self.grads = T.grad(self.cost + l2_reg * L2_sqr, self.params) self.updates = self._adadelta_updates(self.grads) # --------------------- FUNCTIONS tp, tn, fp, fn = self.classifier.confusion_matrix(self.y) self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)], updates=self.updates) self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)], [self.cost, tp, tn, fp, fn]) self.test_model = theano.function([self.x, Param(self.training_mode, default=0)], self.classifier.p_y_given_x.flatten())
def make_train_functions(): P = Parameters() X = T.bvector('X') Y = T.ivector('Y') aux = {} predict = model.build( P, input_size=128, embedding_size=64, controller_size=256, stack_size=256, output_size=128, ) output = predict(X, aux=aux) error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))]) error = error[-(Y.shape[0] / 2):] parameters = P.values() gradients = T.grad(T.sum(error), wrt=parameters) shapes = [p.get_value().shape for p in parameters] count = theano.shared(np.float32(0)) acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes] acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\ [ (count,count + np.float32(1)) ] acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\ [ (count,np.int32(0)) ] avg_grads = [(g / count) for g in acc_grads] avg_grads = [clip(g, 1) for g in acc_grads] acc = theano.function( inputs=[X, Y], outputs=T.mean(error), updates=acc_update, ) update = theano.function( inputs=[], updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) + acc_clear) test = theano.function( inputs=[X], outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):], ) return acc, update, test
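# Note on the snippet above: the second `avg_grads` assignment overwrites the
# first, so what reaches adadelta is clip(g, 1) over the raw accumulators
# rather than over the count-normalised averages. A one-line sketch of
# averaging first and then clipping, assuming the same `clip` helper and
# accumulator variables (a possible intent, not the original author's code):
avg_grads = [clip(g / count, 1) for g in acc_grads]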
def inputs(self):
    return {
        'call_type': tensor.bvector('call_type'),
        'origin_call': tensor.ivector('origin_call'),
        'origin_stand': tensor.bvector('origin_stand'),
        'taxi_id': tensor.wvector('taxi_id'),
        'timestamp': tensor.ivector('timestamp'),
        'day_type': tensor.bvector('day_type'),
        'missing_data': tensor.bvector('missing_data'),
        'latitude': tensor.matrix('latitude'),
        'longitude': tensor.matrix('longitude'),
        'latitude_mask': tensor.matrix('latitude_mask'),
        'longitude_mask': tensor.matrix('longitude_mask'),
        'week_of_year': tensor.bvector('week_of_year'),
        'day_of_week': tensor.bvector('day_of_week'),
        'qhour_of_day': tensor.bvector('qhour_of_day'),
        'destination_latitude': tensor.vector('destination_latitude'),
        'destination_longitude': tensor.vector('destination_longitude')
    }
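# The dtype choices above are deliberate: bvector (int8) for small categorical
# codes such as day_of_week or qhour_of_day, wvector (int16) for the larger
# taxi_id vocabulary, ivector (int32) for timestamps, and float matrices for
# the GPS traces. A hedged sketch of a matching numpy batch (the field values
# are made up for illustration):
import numpy as np

batch = {
    'day_of_week': np.array([0, 3, 6], dtype='int8'),        # feeds a bvector
    'taxi_id': np.array([120, 447, 61], dtype='int16'),      # feeds a wvector
    'timestamp': np.array([1404172800] * 3, dtype='int32'),  # feeds an ivector
    'latitude': np.zeros((3, 20), dtype='float32'),          # feeds a matrix
}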
def __init__(self, nkerns, recept_width, pool_width, stride, dropout_prob, l2_reg, training_batch_size, activation, weights_variance, n_timesteps, dim): self.training_batch_size = training_batch_size rng = np.random.RandomState(23455) self.training_mode = T.iscalar('training_mode') self.x = T.matrix('x') self.y = T.bvector('y') self.batch_size = theano.shared(training_batch_size) self.input = self.x.reshape((self.batch_size, 1, dim, n_timesteps)) self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride, self.training_mode, dropout_prob[0], activation, weights_variance, n_timesteps, dim) self.classifier = SoftmaxLayer(rng=rng, input=self.feature_extractor.output, n_in=nkerns[-1], n_out=3, training_mode=self.training_mode, dropout_prob=dropout_prob[1]) self.params = self.feature_extractor.params + self.classifier.params # ---------------------- BACKPROP self.cost = self.classifier.cross_entropy_cost(self.y) L2_sqr = sum((param ** 2).sum() for param in self.params[::2]) self.grads = T.grad(self.cost + l2_reg * L2_sqr, self.params) self.updates = self._adadelta_updates(self.grads) # --------------------- FUNCTIONS self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)], self.cost, updates=self.updates) self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)], self.cost) self.test_model = theano.function([self.x, Param(self.training_mode, default=0)], self.classifier.p_y_given_x)
def GetProbFunctions(num_features, learning_rate=1e-4, ret_updates=True):
    adjustment_var = T.bmatrix(name='Adjustment matrix')
    features_var = T.fmatrix(name='Features')
    mask_var = T.bvector(name='Filter mask')
    reward_var = T.scalar(name='Reward')

    net = BuildGraphNetwork(adjustment_var, features_var, mask_var, num_features)
    desc = lasagne.layers.get_output(net['desc'])
    prob = msoftmax(theano.gradient.grad_clip(desc, -1, 1))

    # Score-function trick: feeding reward/prob as a known gradient of `prob`
    # gives the parameter gradient of reward * log(prob).
    reward_grad = reward_var / prob
    params = lasagne.layers.get_all_params(net['desc'], trainable=True)
    grads = theano.grad(None, params, known_grads={prob: reward_grad})
    updates = lasagne.updates.momentum(grads, params, learning_rate=learning_rate)

    action_fn = theano.function([adjustment_var, features_var, mask_var], prob)
    if ret_updates:
        updates_fn = theano.function(
            [adjustment_var, features_var, mask_var, reward_var], [],
            updates=updates, allow_input_downcast=True)
        return net, action_fn, updates_fn
    else:
        return net, action_fn
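# A hedged usage sketch for the functions returned above. The shapes and the
# BuildGraphNetwork contract are assumptions for illustration; the point is
# the dtypes: bmatrix/bvector inputs expect int8 arrays, fmatrix float32.
import numpy as np

num_nodes, num_features = 5, 8
adjacency = np.zeros((num_nodes, num_nodes), dtype='int8')             # T.bmatrix
features = np.random.rand(num_nodes, num_features).astype('float32')   # T.fmatrix
mask = np.ones(num_nodes, dtype='int8')                                # T.bvector

net, action_fn, updates_fn = GetProbFunctions(num_features)
probs = action_fn(adjacency, features, mask)
updates_fn(adjacency, features, mask, 1.0)  # 1.0 is a hypothetical reward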
def __init__(self, game_params, arch_params, solver_params, trained_model, sn_dir): params = None if trained_model: params = common.load_params(trained_model) self.lr_func = create_learning_rate_func(solver_params) self.x_h_0 = tt.fvector('x_h_0') self.v_h_0 = tt.fvector('v_h_0') self.t_h_0 = tt.fvector('t_h_0') self.x_t_0 = tt.fmatrix('x_t_0') self.v_t_0 = tt.fmatrix('v_t_0') self.a_t_0 = tt.fmatrix('a_t_0') self.t_t_0 = tt.fvector('t_t_0') self.time_steps = tt.fvector('t_0') self.exist = tt.bvector('exist') self.is_leader = tt.fvector('is_leader') self.x_goal = tt.fvector('x_goal') self.turn_vec_h = tt.fvector('turn_vec_h') self.turn_vec_t = tt.fvector('turn_vec_t') self.n_steps = tt.iscalar('n_steps') self.lr = tt.fscalar('lr') self.sn_dir = sn_dir self.game_params = game_params self.arch_params = arch_params self.solver_params = solver_params self.model = CONTROLLER(self.x_h_0, self.v_h_0, self.t_h_0, self.x_t_0, self.v_t_0, self.a_t_0, self.t_t_0, self.time_steps, self.exist, self.is_leader, self.x_goal, self.turn_vec_h, self.turn_vec_t, self.n_steps, self.lr, self.game_params, self.arch_params, self.solver_params, params)
def __init__(self, rng, n_in, n_out, n_h, n_layers, f_act=leaky_relu, obj='single', dropout_rate = 0): ''' :param rng: Numpy RandomState :param n_in: Input dimension (int) :param n_out: Output dimension (int) :param n_h: Hidden dimension (int) :param n_layers: Number of hidden layers (int) :param f_act: Hidden-to-hidden activation function :param f_out: Output activation function ''' if obj=='single': f_out = softmax elif obj=='multi': f_out = sigmoid self.x = T.vector() # construct hidden layers assert(n_layers>=1) first_hiddenLayer = HiddenLayer( rng=rng, input=self.x, predict_input=self.x, n_in=n_in, n_out=n_h, activation=f_act, dropout_rate = dropout_rate, nametag='0' ) self.hidden_layers = [first_hiddenLayer] self.p = first_hiddenLayer.params[:] for i in range(n_layers-1): cur_hiddenLayer = ResNetLayer( rng=rng, input=self.hidden_layers[-1].output, predict_input=self.hidden_layers[-1].predict_output, n_h=n_h, activation=f_act, dropout_rate = dropout_rate, nametag=str(i+1) ) self.hidden_layers.append(cur_hiddenLayer) self.p.extend(cur_hiddenLayer.params[:]) # params for output layer self.outputLayer = HiddenLayer( rng=rng, input=self.hidden_layers[-1].output, predict_input=self.hidden_layers[-1].predict_output, n_in=n_h, n_out=n_out, activation=f_out, dropout_rate = 0, nametag='o' ) self.p.extend(self.outputLayer.params[:]) self.n_layers = n_layers + 1 self.obj = obj if obj=='single': self.y = T.bscalar('y') self.o = self.outputLayer.output self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y]) self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.) self.prediction = np.argmax(self.o) elif obj=='multi': self.y = T.bvector('y') self.o = self.outputLayer.output self.cost = T.nnet.binary_crossentropy(self.o, self.y).mean() self.prediction = T.argsort(self.o) self.accuracy = self.y[T.argmax(self.o)] self.accuracy3 = (1.0/3.0) * (self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]]) self.accuracy5 = (1.0/5.0) * (self.y[self.prediction[-5]]+self.y[self.prediction[-4]]+self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]]) self.optimiser = sgd_optimizer(self, 'ResNet')
def __init__(self, atari_env, state_dimension, action_dimension, monitor_env=False, learning_rate=0.001, critic_update=10, train_step=1, gamma=0.95, eps_max=1.0, eps_min=0.1, eps_decay=10000, n_epochs=10000, batch_size=32, buffer_size=50000): self.env = gym.make(atari_env) if monitor_env: None self.state_dimension = state_dimension self.action_dimension = action_dimension self.learning_rate = learning_rate self.critic_update = critic_update self.train_step = train_step self.gamma = gamma self.eps_max = eps_max self.eps_min = eps_min self.eps_decay = eps_decay self.n_epochs = n_epochs self.batch_size = batch_size self.buffer_size = buffer_size self.experience_replay = [] def q_network(state): input_state = InputLayer(input_var=state, shape=(None, self.state_dimension[0], self.state_dimension[1], self.state_dimension[2])) input_state = DimshuffleLayer(input_state, pattern=(0, 3, 1, 2)) conv = Conv2DLayer(input_state, num_filters=32, filter_size=(8, 8), stride=(4, 4), nonlinearity=rectify) conv = Conv2DLayer(conv, num_filters=64, filter_size=(4, 4), stride=(2, 2), nonlinearity=rectify) conv = Conv2DLayer(conv, num_filters=64, filter_size=(3, 3), stride=(1, 1), nonlinearity=rectify) flatten = FlattenLayer(conv) dense = DenseLayer(flatten, num_units=512, nonlinearity=rectify) q_values = DenseLayer(dense, num_units=self.action_dimension, nonlinearity=linear) return q_values self.X_state = T.ftensor4() self.X_action = T.bvector() self.X_reward = T.fvector() self.X_next_state = T.ftensor4() self.X_done = T.bvector() self.X_action_hot = to_one_hot(self.X_action, self.action_dimension) self.q_ = q_network(self.X_state) self.q = get_output(self.q_) self.q_target_ = q_network(self.X_next_state) self.q_target = get_output(self.q_target_) self.q_max = T.max(self.q_target, axis=1) self.action = T.argmax(self.q, axis=1) self.mu = theano.function(inputs=[self.X_state], outputs=self.action, allow_input_downcast=True) self.loss = squared_error( self.X_reward + self.gamma * self.q_max * (1.0 - self.X_done), T.batched_dot(self.q, self.X_action_hot)) self.loss = self.loss.mean() self.params = get_all_params(self.q_) self.grads = T.grad(self.loss, self.params) self.normed_grads = total_norm_constraint(self.grads, 1.0) self.updates = rmsprop(self.normed_grads, self.params, learning_rate=self.learning_rate) self.update_network = theano.function(inputs=[ self.X_state, self.X_action, self.X_reward, self.X_next_state, self.X_done ], outputs=self.loss, updates=self.updates, allow_input_downcast=True)
def __init__(self,n_hidden,embedding_dimention=50,feature_dimention=61): ##n_in: sequence lstm 的输入维度 ##n_hidden: lstm for candi and zp 的隐层维度 #repre_active = ReLU repre_active = linear self.params = [] self.w_embedding = init_weight_file(args.embedding,args.embedding_dimention) self.params.append(self.w_embedding) self.zp_x_pre_index = T.imatrix("zp_x_pre") self.zp_x_post_index = T.imatrix("zp_x_post") zp_x_pre_newshape = (T.shape(self.zp_x_pre_index)[0],args.embedding_dimention) self.embedding_sub_zp_pre = self.w_embedding[self.zp_x_pre_index.flatten()] self.zp_x_pre = T.reshape(self.embedding_sub_zp_pre,zp_x_pre_newshape) zp_x_post_newshape = (T.shape(self.zp_x_post_index)[0],args.embedding_dimention) self.embedding_sub_zp_post = self.w_embedding[self.zp_x_post_index.flatten()] self.zp_x_post = T.reshape(self.embedding_sub_zp_post,zp_x_post_newshape) zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre) self.params += zp_nn_pre.params zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post) self.params += zp_nn_post.params attention_pre_on_post = softmax((zp_nn_pre.nn_out*zp_nn_post.all_hidden).sum(axis=1))[0] attention_post_on_pre = softmax((zp_nn_post.nn_out*zp_nn_pre.all_hidden).sum(axis=1))[0] zp_post = T.sum(attention_pre_on_post[:,None]*zp_nn_post.all_hidden,axis=0) zp_pre = T.sum(attention_post_on_pre[:,None]*zp_nn_pre.all_hidden,axis=0) #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out)) self.zp_out = T.concatenate((zp_post,zp_pre)) self.zp_out_output = self.zp_out ### get sequence output for NP ### self.np_x_post_index = T.itensor3("np_x") self.np_x_postc_index = T.itensor3("np_x") self.np_x_pre_index = T.itensor3("np_x") self.np_x_prec_index = T.itensor3("np_x") np_x_post_newshape = (T.shape(self.np_x_post_index)[0],T.shape(self.np_x_post_index)[1],args.embedding_dimention) self.embedding_sub_np_x_post = self.w_embedding[self.np_x_post_index.flatten()] self.np_x_post = T.reshape(self.embedding_sub_np_x_post,np_x_post_newshape) np_x_postc_newshape = (T.shape(self.np_x_postc_index)[0],T.shape(self.np_x_postc_index)[1],args.embedding_dimention) self.embedding_sub_np_x_postc = self.w_embedding[self.np_x_postc_index.flatten()] self.np_x_postc = T.reshape(self.embedding_sub_np_x_postc,np_x_postc_newshape) np_x_pre_newshape = (T.shape(self.np_x_pre_index)[0],T.shape(self.np_x_pre_index)[1],args.embedding_dimention) self.embedding_sub_np_x_pre = self.w_embedding[self.np_x_pre_index.flatten()] self.np_x_pre = T.reshape(self.embedding_sub_np_x_pre,np_x_pre_newshape) np_x_prec_newshape = (T.shape(self.np_x_prec_index)[0],T.shape(self.np_x_prec_index)[1],args.embedding_dimention) self.embedding_sub_np_x_prec = self.w_embedding[self.np_x_prec_index.flatten()] self.np_x_prec = T.reshape(self.embedding_sub_np_x_prec,np_x_prec_newshape) self.mask_pre = T.matrix("mask") self.mask_prec = T.matrix("mask") self.mask_post = T.matrix("mask") self.mask_postc = T.matrix("mask") self.np_nn_pre = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_pre,self.np_x_prec,self.mask_pre,self.mask_prec) self.params += self.np_nn_pre.params self.np_nn_post = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_post,self.np_x_postc,self.mask_post,self.mask_postc) self.params += self.np_nn_post.params self.np_nn_post_output = self.np_nn_post.nn_out self.np_nn_pre_output = self.np_nn_pre.nn_out self.np_out = T.concatenate((self.np_nn_post_output,self.np_nn_pre_output),axis=1) np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out) self.params += np_nn_f.params np_nn_b = 
LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1]) self.params += np_nn_b.params self.bi_np_out = T.concatenate((np_nn_f.all_hidden,np_nn_b.all_hidden[::-1]),axis=1) self.np_out_output = self.bi_np_out #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output]) self.feature = T.matrix("feature") self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active) self.params += self.feature_layer.params w_attention_zp,b_attention = init_weight(n_hidden*2,1,pre="attention_zp",ones=False) self.params += [w_attention_zp,b_attention] #w_attention_np,b_u = init_weight(n_hidden*2,1,pre="attention_np",ones=False) #self.params += [w_attention_np] w_attention_np_rnn,b_u = init_weight(n_hidden*4,1,pre="attention_np_rnn",ones=False) self.params += [w_attention_np_rnn] w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False) self.params += [w_attention_feature] #self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention) self.attention = softmax(T.transpose(self.calcu_attention,axes=(1,0)))[0] self.out = self.attention self.get_out = theano.function(inputs=[self.zp_x_pre_index,self.zp_x_post_index,self.np_x_pre_index,self.np_x_prec_index,self.np_x_post_index,self.np_x_postc_index,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature],outputs=[self.out],on_unused_input='warn') l1_norm_squared = sum([(w**2).sum() for w in self.params]) l2_norm_squared = sum([(abs(w)).sum() for w in self.params]) lmbda_l1 = 0.0 #lmbda_l2 = 0.001 lmbda_l2 = 0.0 t = T.bvector() cost = -(T.log((self.out*t).sum())) lr = T.scalar() updates = lasagne.updates.sgd(cost, self.params, lr) #updates = lasagne.updates.adadelta(cost, self.params) self.train_step = theano.function( inputs=[self.zp_x_pre_index,self.zp_x_post_index,self.np_x_pre_index,self.np_x_prec_index,self.np_x_post_index,self.np_x_postc_index,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature,t,lr], outputs=[cost], on_unused_input='warn', updates=updates)
def fit(self, data, sample_store=10000000): ''' Trains the network. Parameters -------- data : pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). sample_store : int If additional negative samples are used (n_sample > 0), the efficiency of GPU utilization can be sped up, by precomputing a large batch of negative samples (and recomputing when necessary). This parameter regulizes the size of this precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in the RAM. For the most efficient computation, a balance must be found between storing few examples and constantly interrupting GPU computations for a short time vs. computing many examples and interrupting GPU computations for a long time (but rarely). ''' self.predict = None self.error_during_train = False itemids = data[self.item_key].unique() self.n_items = len(itemids) self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids) data = pd.merge(data, pd.DataFrame({self.item_key:itemids, 'ItemIdx':self.itemidmap[itemids].values}), on=self.item_key, how='inner') offset_sessions = self.init(data) if self.n_sample: pop = data.groupby('ItemId').size() pop = pop[self.itemidmap.index.values].values**self.sample_alpha pop = pop.cumsum() / pop.sum() pop[-1] = 1 if sample_store: generate_length = sample_store // self.n_sample if generate_length <= 1: sample_store = 0 print('No example store was used') else: neg_samples = self.generate_neg_samples(pop, generate_length) sample_pointer = 0 else: print('No example store was used') X = T.ivector() Y = T.ivector() M = T.iscalar() R = T.bvector() H_new, Y_pred, sparams, full_params, sidxs = self.model(X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed) cost = (M/self.batch_size) * self.loss_function(Y_pred, M) params = [self.Wx if self.embedding or self.constrained_embedding else self.Wx[1:], self.Wh, self.Wrz, self.Bh] updates = self.RMSprop(cost, params, full_params, sparams, sidxs) for i in range(len(self.H)): updates[self.H[i]] = H_new[i] train_function = function(inputs=[X, Y, M, R], outputs=cost, updates=updates, allow_input_downcast=True) base_order = np.argsort(data.groupby(self.session_key)[self.time_key].min().values) if self.time_sort else np.arange(len(offset_sessions)-1) data_items = data.ItemIdx.values for epoch in range(self.n_epochs): for i in range(len(self.layers)): self.H[i].set_value(np.zeros((self.batch_size,self.layers[i]), dtype=theano.config.floatX), borrow=True) c = [] cc = [] session_idx_arr = np.random.permutation(len(offset_sessions)-1) if self.train_random_order else base_order iters = np.arange(self.batch_size) maxiter = iters.max() start = offset_sessions[session_idx_arr[iters]] end = offset_sessions[session_idx_arr[iters]+1] finished = False while not finished: minlen = (end-start).min() out_idx = data_items[start] for i in range(minlen-1): in_idx = out_idx out_idx = data_items[start+i+1] if self.n_sample: if sample_store: if sample_pointer == generate_length: neg_samples = self.generate_neg_samples(pop, generate_length) sample_pointer = 0 sample = neg_samples[sample_pointer] sample_pointer += 1 else: sample = self.generate_neg_samples(pop, 1) y = np.hstack([out_idx, sample]) else: y = 
out_idx if self.n_sample: if sample_pointer == generate_length: generate_samples() sample_pointer = 0 sample_pointer += 1 reset = (start+i+1 == end-1) cost = train_function(in_idx, y, len(iters), reset) c.append(cost) cc.append(len(iters)) if np.isnan(cost): print(str(epoch) + ': NaN error!') self.error_during_train = True return start = start+minlen-1 finished_mask = (end-start<=1) n_finished = finished_mask.sum() iters[finished_mask] = maxiter + np.arange(1,n_finished+1) maxiter += n_finished valid_mask = (iters < len(offset_sessions)-1) n_valid = valid_mask.sum() if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0): finished = True break mask = finished_mask & valid_mask sessions = session_idx_arr[iters[mask]] start[mask] = offset_sessions[sessions] end[mask] = offset_sessions[sessions+1] iters = iters[valid_mask] start = start[valid_mask] end = end[valid_mask] if n_valid < len(valid_mask): for i in range(len(self.H)): tmp = self.H[i].get_value(borrow=True) tmp = tmp[valid_mask] self.H[i].set_value(tmp, borrow=True) c = np.array(c) cc = np.array(cc) avgc = np.sum(c * cc) / np.sum(cc) if np.isnan(avgc): print('Epoch {}: NaN error!'.format(str(epoch))) self.error_during_train = True return print('Epoch{}\tloss: {:.6f}'.format(epoch, avgc))
def _init_model(self, in_size, out_size, slot_sizes, db, \ n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \ inputtype='full', sl='e2e', rl='e2e'): self.in_size = in_size self.out_size = out_size self.slot_sizes = slot_sizes self.batch_size = batch_size self.learning_rate = learning_rate_rl self.n_hid = n_hid self.r_hid = self.n_hid self.sl = sl self.rl = rl table = db.table counts = db.counts m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots] prior = [db.priors[s] for s in dialog_config.inform_slots] unknown = [db.unks[s] for s in dialog_config.inform_slots] ids = [db.ids[s] for s in dialog_config.inform_slots] input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \ T.btensor3('am'), T.fvector('r') T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable( counts) db_index_var = T.imatrix('db') db_index_switch = T.bvector('s') l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask) flat_mask = T.reshape(turn_mask, (turn_mask.shape[0] * turn_mask.shape[1], 1)) def _smooth(p): p_n = p + EPS return p_n / (p_n.sum(axis=1)[:, np.newaxis]) def _add_unk(p, m, N): # p: B x V, m- num missing, N- total, p0: 1 x V t_unk = T.as_tensor_variable(float(m) / N) ps = p * (1. - t_unk) return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1) def kl_divergence(p, q): p_n = _smooth(p) return -T.sum(q * T.log(p_n), axis=1) # belief tracking l_in = L.InputLayer(shape=(None, None, self.in_size), input_var=input_var) p_vars = [] pu_vars = [] phi_vars = [] p_targets = [] phi_targets = [] hid_in_vars = [] hid_out_vars = [] bt_loss = T.as_tensor_variable(0.) kl_loss = [] x_loss = [] self.trackers = [] for i, s in enumerate(dialog_config.inform_slots): hid_in = T.fmatrix('h') l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in, \ mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D l_b_in = L.ReshapeLayer(l_rnn, (input_var.shape[0] * input_var.shape[1], self.r_hid)) # BH x D hid_out = L.get_output(l_rnn)[:, -1, :] p_targ = T.ftensor3('p_target_' + s) p_t = T.reshape( p_targ, (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i])) phi_targ = T.fmatrix('phi_target' + s) phi_t = T.reshape(phi_targ, (phi_targ.shape[0] * phi_targ.shape[1], 1)) l_b = L.DenseLayer(l_b_in, self.slot_sizes[i], nonlinearity=lasagne.nonlinearities.softmax) l_phi = L.DenseLayer(l_b_in, 1, nonlinearity=lasagne.nonlinearities.sigmoid) phi = T.clip(L.get_output(l_phi), 0.01, 0.99) p = L.get_output(l_b) p_u = _add_unk(p, m_unk[i], db.N) kl_loss.append( T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) / T.sum(flat_mask)) x_loss.append( T.sum(flat_mask * lasagne.objectives.binary_crossentropy(phi, phi_t)) / T.sum(flat_mask)) bt_loss += kl_loss[-1] + x_loss[-1] p_vars.append(p) pu_vars.append(p_u) phi_vars.append(phi) p_targets.append(p_targ) phi_targets.append(phi_targ) hid_in_vars.append(hid_in) hid_out_vars.append(hid_out) self.trackers.append(l_b) self.trackers.append(l_phi) self.bt_params = L.get_all_params(self.trackers) def check_db(pv, phi, Tb, N): O = T.alloc(0., pv[0].shape[0], Tb.shape[0]) # BH x T.shape[0] for i, p in enumerate(pv): p_dc = T.tile(phi[i], (1, Tb.shape[0])) O += T.log(p_dc*(1./db.table.shape[0]) + \ (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i])) Op = T.exp(O) #+EPS # BH x T.shape[0] Os = T.sum(Op, axis=1)[:, np.newaxis] # BH x 1 return Op / Os def entropy(p): p = _smooth(p) return -T.sum(p * T.log(p), axis=-1) def weighted_entropy(p, q, p0, unks, idd): w = T.dot(idd, q.transpose()) # Pi x BH u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis] ) # BH x Pi p_tilde = w.transpose() + u return entropy(p_tilde) p_db = check_db(pu_vars, phi_vars, T_var, N_var) # BH x T.shape[0] if inputtype == 'entropy': H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \ for i,pv in enumerate(p_vars)] H_db = entropy(p_db) phv = [ph[:, 0] for ph in phi_vars] t_in = T.stacklists(H_vars + phv + [H_db]).transpose() # BH x 2M+1 t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x 2M+1 l_in_pol = L.InputLayer( shape=(None,None,2*len(dialog_config.inform_slots)+1), \ input_var=t_in_resh) else: in_reshaped = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1], \ input_var.shape[2])) prev_act = in_reshaped[:, -len(dialog_config.inform_slots):] t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act], axis=1) # BH x D-sum+A t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x D-sum l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \ 3*len(dialog_config.inform_slots)+ \ table.shape[0]), input_var=t_in_resh) pol_in = T.fmatrix('pol-h') l_pol_rnn = L.GRULayer(l_in_pol, n_hid, hid_init=pol_in, mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D pol_out = L.get_output(l_pol_rnn)[:, -1, :] l_den_in = L.ReshapeLayer( l_pol_rnn, (turn_mask.shape[0] * turn_mask.shape[1], n_hid)) # BH x D l_out = L.DenseLayer(l_den_in, self.out_size, \ nonlinearity=lasagne.nonlinearities.softmax) # BH x A self.network = l_out self.pol_params = L.get_all_params(self.network) self.params = self.bt_params + self.pol_params # db loss p_db_reshaped = T.reshape( p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0])) p_db_final = p_db_reshaped[:, -1, :] # B x T.shape[0] p_db_final = _smooth(p_db_final) ix = T.tile(T.arange(p_db_final.shape[0]), (db_index_var.shape[1], 1)).transpose() sample_probs = p_db_final[ix, db_index_var] # B x K if dialog_config.SUCCESS_MAX_RANK == 1: log_db_probs = T.log(sample_probs).sum(axis=1) else: cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \ outputs_info=T.zeros_like(sample_probs[:,0]), \ sequences=sample_probs[:,:-1].transpose()) cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5) # B x K-1 log_db_probs = T.log(sample_probs).sum( axis=1) - T.log(1. - cum_probs).sum(axis=1) # B log_db_probs = log_db_probs * db_index_switch # rl probs = L.get_output(self.network) # BH x A probs = _smooth(probs) out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1], self.out_size)) # B x H x A log_probs = T.log(out_probs) act_probs = (log_probs * act_mask).sum(axis=2) # B x H ep_probs = (act_probs * turn_mask).sum(axis=1) # B H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1) # B self.act_loss = -T.mean(ep_probs * reward_var) self.db_loss = -T.mean(log_db_probs * reward_var) self.reg_loss = -T.mean(ment * H_probs) self.loss = self.act_loss + self.db_loss + self.reg_loss self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \ pol_in] + hid_in_vars self.obj_fn = theano.function(self.inps, self.loss, on_unused_input='warn') self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \ [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn') self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss], on_unused_input='warn') self._rl_train_fn(self.learning_rate) ## sl sl_loss = 0. + bt_loss - T.mean(ep_probs) if self.sl == 'e2e': sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) elif self.sl == 'bel': sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) else: sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) sl_inps = [input_var, turn_mask, act_mask, pol_in ] + p_targets + phi_targets + hid_in_vars self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \ on_unused_input='warn') self.sl_obj_fn = theano.function(sl_inps, sl_loss, on_unused_input='warn')
def __init__( self, rng, batchsize=100, activation=relu ): import char_load (num_sent, char_cnt, word_cnt, max_word_len, max_sen_len,\ k_chr, k_wrd, x_chr, x_wrd, y) = char_load.read("tweets_clean.txt") dim_word = 30 dim_char = 5 cl_word = 300 cl_char = 50 k_word = k_wrd k_char = k_chr data_train_word,\ data_test_word,\ data_train_char,\ data_test_char,\ target_train,\ target_test\ = train_test_split(x_wrd, x_chr, y, random_state=1234, test_size=0.1) x_train_word = theano.shared(np.asarray(data_train_word, dtype='int16'), borrow=True) x_train_char = theano.shared(np.asarray(data_train_char, dtype='int16'), borrow=True) y_train = theano.shared(np.asarray(target_train, dtype='int8'), borrow=True) x_test_word = theano.shared(np.asarray(data_test_word, dtype='int16'), borrow=True) x_test_char = theano.shared(np.asarray(data_test_char, dtype='int16'), borrow=True) y_test = theano.shared(np.asarray(target_test, dtype='int8'), borrow=True) self.n_train_batches = x_train_word.get_value(borrow=True).shape[0] / batchsize self.n_test_batches = x_test_word.get_value(borrow=True).shape[0] / batchsize """symbol definition""" index = T.iscalar() x_wrd = T.wmatrix('x_wrd') x_chr = T.wtensor3('x_chr') y = T.bvector('y') train = T.iscalar('train') """network definition""" layer_char_embed_input = x_chr#.reshape((batchsize, max_sen_len, max_word_len)) layer_char_embed = EmbedIDLayer( rng, layer_char_embed_input, n_input=char_cnt, n_output=dim_char ) layer1_input = layer_char_embed.output.reshape( (batchsize*max_sen_len, 1, max_word_len, dim_char) ) layer1 = ConvolutionalLayer( rng, layer1_input, filter_shape=(cl_char, 1, k_char, dim_char),# cl_charフィルタ数 image_shape=(batchsize*max_sen_len, 1, max_word_len, dim_char) ) layer2 = MaxPoolingLayer( layer1.output, poolsize=(max_word_len-k_char+1, 1) ) layer_word_embed_input = x_wrd #.reshape((batchsize, max_sen_len)) layer_word_embed = EmbedIDLayer( rng, layer_word_embed_input, n_input=word_cnt, n_output=dim_word ) layer3_word_input = layer_word_embed.output.reshape((batchsize, 1, max_sen_len, dim_word)) layer3_char_input = layer2.output.reshape((batchsize, 1, max_sen_len, cl_char)) layer3_input = T.concatenate( [layer3_word_input, layer3_char_input], axis=3 )#.reshape((batchsize, 1, max_sen_len, dim_word+cl_char)) layer3 = ConvolutionalLayer( rng, layer3_input, filter_shape=(cl_word, 1, k_word, dim_word + cl_char),#1は入力チャネル数 image_shape=(batchsize, 1, max_sen_len, dim_word + cl_char), activation=activation ) layer4 = MaxPoolingLayer( layer3.output, poolsize=(max_sen_len-k_word+1, 1) ) layer5_input = layer4.output.reshape((batchsize, cl_word)) layer5 = FullyConnectedLayer( rng, dropout(rng, layer5_input, train), n_input=cl_word, n_output=50, activation=activation ) layer6_input = layer5.output layer6 = FullyConnectedLayer( rng, dropout(rng, layer6_input, train, p=0.1), n_input=50, n_output=2, activation=None ) result = Result(layer6.output, y) loss = result.negative_log_likelihood() accuracy = result.accuracy() params = layer6.params\ +layer5.params\ +layer3.params\ +layer_word_embed.params\ +layer1.params\ +layer_char_embed.params updates = RMSprop(learning_rate=0.001, params=params).updates(loss) self.train_model = theano.function( inputs=[index], outputs=[loss, accuracy], updates=updates, givens={ x_wrd: x_train_word[index*batchsize: (index+1)*batchsize], x_chr: x_train_char[index*batchsize: (index+1)*batchsize], y: y_train[index*batchsize: (index+1)*batchsize], train: np.cast['int32'](1) } ) self.test_model = theano.function( inputs=[index], outputs=[loss, 
accuracy], givens={ x_wrd: x_test_word[index*batchsize: (index+1)*batchsize], x_chr: x_test_char[index*batchsize: (index+1)*batchsize], y: y_test[index*batchsize: (index+1)*batchsize], train: np.cast['int32'](0) } )
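# A small self-contained sketch (made-up data and names) of the `givens`
# minibatching pattern used by train_model/test_model above: the iscalar index
# selects a slice of a shared dataset, so batches never pass through `inputs`.
import numpy as np
import theano
import theano.tensor as T

batchsize = 2
data = theano.shared(np.arange(12, dtype='int8').reshape(6, 2), borrow=True)
index = T.iscalar('index')
x = T.bmatrix('x')
row_sums = theano.function(
    inputs=[index],
    outputs=x.sum(axis=1),
    givens={x: data[index * batchsize: (index + 1) * batchsize]})
print(row_sums(0))    # sums over the first minibatch of rows -> [1 5]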
def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61): ##n_in: sequence lstm 的输入维度 ##n_hidden: lstm for candi and zp 的隐层维度 self.params = [] self.zp_x_pre = T.matrix("zp_x_pre") self.zp_x_post = T.matrix("zp_x_post") zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre) #zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre_dropout) self.params += zp_nn_pre.params zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post) #zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post_dropout) self.params += zp_nn_post.params self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out)) self.zp_out_output = self.zp_out ### get sequence output for NP ### self.np_x_post = T.tensor3("np_x") self.np_x_postc = T.tensor3("np_x") self.np_x_pre = T.tensor3("np_x") self.np_x_prec = T.tensor3("np_x") self.mask_pre = T.matrix("mask") self.mask_prec = T.matrix("mask") self.mask_post = T.matrix("mask") self.mask_postc = T.matrix("mask") self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_pre, self.np_x_prec, self.mask_pre, self.mask_prec) self.params += self.np_nn_pre.params self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden, self.np_x_post, self.np_x_postc, self.mask_post, self.mask_postc) self.params += self.np_nn_post.params self.np_nn_post_output = self.np_nn_post.nn_out self.np_nn_pre_output = self.np_nn_pre.nn_out self.np_out = T.concatenate( (self.np_nn_post_output, self.np_nn_pre_output), axis=1) #np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out) np_nn_f = RNN(n_hidden * 2, n_hidden * 2, self.np_out) self.params += np_nn_f.params #np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1]) np_nn_b = RNN(n_hidden * 2, n_hidden * 2, self.np_out[::-1]) self.params += np_nn_b.params self.bi_np_out = T.concatenate( (np_nn_f.all_hidden, np_nn_b.all_hidden[::-1]), axis=1) self.np_out_output = self.bi_np_out self.get_np_out = theano.function(inputs=[ self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc ], outputs=[self.np_out_output]) #self.feature = T.matrix("feature") #self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active) #self.params += self.feature_layer.params w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False) self.params += [w_attention_zp, b_attention] w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_np", ones=False) self.params += [w_attention_np] w_attention_np_rnn, b_u = init_weight(n_hidden * 4, 1, pre="attention_np_rnn", ones=False) self.params += [w_attention_np_rnn] #w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False) #self.params += [w_attention_feature] #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) self.calcu_attention = tanh( T.dot(self.np_out_output, w_attention_np_rnn) + T.dot(self.zp_out_output, w_attention_zp) + T.dot(self.np_out, w_attention_np) + b_attention) #self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention) self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0] #self.attention = T.transpose(self.calcu_attention,axes=(1,0))[0] t = T.bvector() max_attention = (self.attention * t).max() self.out = self.attention self.get_out = theano.function(inputs=[ 
self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc ], outputs=[self.out], on_unused_input='warn') self.get_max = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc, t ], outputs=[max_attention], on_unused_input='warn') l1_norm_squared = sum([(w**2).sum() for w in self.params]) l2_norm_squared = sum([(abs(w)).sum() for w in self.params]) lmbda_l1 = 0.0 #lmbda_l2 = 0.001 lmbda_l2 = 0.0 cost = ((1 - t) * (1 - max_attention + self.out)).sum() #cost = -(T.log((self.out*t).sum())) lr = T.scalar() updates = lasagne.updates.sgd(cost, self.params, lr) #updates = lasagne.updates.adadelta(cost, self.params) #updates = lasagne.updates.adam(cost, self.params) self.train_step = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc, t, lr ], outputs=[cost], on_unused_input='warn', updates=updates)
def __init__(self, nkerns, recept_width, pool_width, dropout_prob, training_batch_size, activation, n_timesteps=1000, dim=18): if activation == 'tanh': activation_function = lambda x: T.tanh(x) elif activation == 'relu': activation_function = lambda x: T.maximum(0.0, x) else: raise ValueError('unknown activation function') self.training_batch_size = training_batch_size rng = np.random.RandomState(23455) self.training_mode = T.iscalar('training_mode') self.x = T.matrix('x') self.y = T.bvector('y') self.batch_size = theano.shared(self.training_batch_size) # 18@1*1000 self.layer0_input = self.x.reshape( (self.batch_size, dim, 1, n_timesteps)) # image 18 @ 1*1000 # c1: nkerns[0] @ 1* (1000 - recept_width[0] + 1) # s2: nkerns[0] @ 1 * c1 / pool_width[0] layer0 = ConvPoolLayer(rng, input=self.layer0_input, image_shape=(None, dim, 1, n_timesteps), filter_shape=(nkerns[0], dim, 1, recept_width[0]), poolsize=(1, pool_width[0]), activation_function=activation_function) # c3: nkerns[1] @ 1 * (s2 - recept_width[1] + 1) # s4 nkerns[1] @ 1 * c3 / pool_width input_layer1_width = (n_timesteps - recept_width[0] + 1) / pool_width[0] layer1 = ConvPoolLayer(rng, input=layer0.output, image_shape=(None, nkerns[0], 1, input_layer1_width), filter_shape=(nkerns[1], nkerns[0], 1, recept_width[1]), poolsize=(1, pool_width[1]), activation_function=activation_function) # s4:(batch_size, nkerns[1], 1, s4) -> flatten(2) -> (batch_size, nkerns[1]* 1 * s4) layer2_input = layer1.output.flatten(2) input_layer2_size = (input_layer1_width - recept_width[1] + 1) / pool_width[1] # c5: 120@1*1 self.layer2 = HiddenLayer(rng=rng, input=layer2_input, n_in=nkerns[1] * 1 * input_layer2_size, n_out=nkerns[2], training_mode=self.training_mode, dropout_prob=dropout_prob, activation_function=activation_function) # f6/output self.layer3 = LogisticRegressionLayer(input=self.layer2.output, n_in=nkerns[2], n_out=2, training_mode=self.training_mode, dropout_prob=dropout_prob) self.params = self.layer3.params + self.layer2.params + layer1.params + layer0.params
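# Tiny helper sketch (assumes 'valid' convolution and non-overlapping pooling,
# which is what the width arithmetic in the constructor above implies).
def conv_pool_width(n_timesteps, recept_width, pool_width):
    # valid convolution shortens the time axis, pooling then divides it
    return (n_timesteps - recept_width + 1) // pool_width

assert conv_pool_width(1000, 51, 5) == 190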
def initialize(self, policy, env_spec, sample_size, horizon, mid_batch_reset): if mid_batch_reset and policy.recurrent: raise NotImplementedError obs = env_spec.observation_space.new_tensor_variable('obs', extra_dims=1) act = env_spec.action_space.new_tensor_variable('act', extra_dims=1) adv = T.vector('adv') ret = T.vector('ret') old_value = T.vector('old_value') dist = policy.distribution old_dist_info = { k: T.matrix('old_%s' % k) for k in dist.dist_info_keys } self._dist_info_keys = dist.dist_info_keys state_info = {k: T.matrix(k) for k in policy.state_info_keys} self._state_info_keys = policy.state_info_keys new_dist_info = policy.dist_info_sym(obs, state_info_vars=state_info) new_value = policy.value_sym(obs, state_info_vars=state_info) self._lr_mult = theano.shared(np.array(1., dtype=theano.config.floatX), name='lr_mult') if mid_batch_reset and not policy.recurrent: self._use_valids = False valids = None # will be ignored inside valids_mean() else: self._use_valids = True valids = T.bvector('valids') # dtype int8 v_err = (new_value - ret)**2 v_loss = self.v_loss_coeff * valids_mean(v_err, valids) ent = policy.distribution.entropy_sym(new_dist_info) ent_loss = -self.ent_loss_coeff * valids_mean(ent, valids) pi_loss = \ self.pi_loss(policy, act, adv, old_dist_info, new_dist_info, valids) losses = (pi_loss, v_loss, ent_loss) pi_kl = valids_mean(dist.kl_sym(old_dist_info, new_dist_info), valids) v_kl = valids_mean((new_value - old_value)**2, valids) constraints = (pi_kl, v_kl) input_list = [obs, act, adv, ret, old_value] old_dist_info_list = [old_dist_info[k] for k in dist.dist_info_keys] state_info_list = [state_info[k] for k in policy.state_info_keys] input_list += old_dist_info_list + state_info_list opt_examples = dict( advantages=np.array(1, dtype=adv.dtype), returns=np.array(1, dtype=ret.dtype), ) if self._use_valids: input_list.append(valids) opt_examples["valids"] = np.array(1, dtype=np.int8) self.optimizer.initialize( inputs=input_list, losses=losses, constraints=constraints, target=policy, lr_mult=self._lr_mult, ) self._opt_buf = buffer_with_segs_view(opt_examples, sample_size, horizon, shared=False) self._batch_size = sample_size self._mid_batch_reset = mid_batch_reset self._horizon = horizon self.policy = policy
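# Hedged sketch of what a valids_mean(x, valids) helper could look like; the
# real helper lives elsewhere in this codebase and may differ. With valids=None
# every timestep counts, otherwise the int8 mask selects the valid steps.
import theano.tensor as T

def valids_mean_sketch(x, valids=None):
    if valids is None:
        return T.mean(x)
    v = T.cast(valids, 'float32')
    return T.sum(x * v) / T.maximum(T.sum(v), 1.0)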
def inputs(self): return { 'call_type': tensor.bvector('call_type'), 'origin_call': tensor.ivector('origin_call'), 'origin_stand': tensor.bvector('origin_stand'), 'taxi_id': tensor.wvector('taxi_id'), 'timestamp': tensor.ivector('timestamp'), 'day_type': tensor.bvector('day_type'), 'missing_data': tensor.bvector('missing_data'), 'latitude': tensor.matrix('latitude'), 'longitude': tensor.matrix('longitude'), 'destination_latitude': tensor.vector('destination_latitude'), 'destination_longitude': tensor.vector('destination_longitude'), 'travel_time': tensor.ivector('travel_time'), 'first_k_latitude': tensor.matrix('first_k_latitude'), 'first_k_longitude': tensor.matrix('first_k_longitude'), 'last_k_latitude': tensor.matrix('last_k_latitude'), 'last_k_longitude': tensor.matrix('last_k_longitude'), 'input_time': tensor.ivector('input_time'), 'week_of_year': tensor.bvector('week_of_year'), 'day_of_week': tensor.bvector('day_of_week'), 'qhour_of_day': tensor.bvector('qhour_of_day'), 'candidate_call_type': tensor.bvector('candidate_call_type'), 'candidate_origin_call': tensor.ivector('candidate_origin_call'), 'candidate_origin_stand': tensor.bvector('candidate_origin_stand'), 'candidate_taxi_id': tensor.wvector('candidate_taxi_id'), 'candidate_timestamp': tensor.ivector('candidate_timestamp'), 'candidate_day_type': tensor.bvector('candidate_day_type'), 'candidate_missing_data': tensor.bvector('candidate_missing_data'), 'candidate_latitude': tensor.matrix('candidate_latitude'), 'candidate_longitude': tensor.matrix('candidate_longitude'), 'candidate_destination_latitude': tensor.vector('candidate_destination_latitude'), 'candidate_destination_longitude': tensor.vector('candidate_destination_longitude'), 'candidate_travel_time': tensor.ivector('candidate_travel_time'), 'candidate_first_k_latitude': tensor.matrix('candidate_first_k_latitude'), 'candidate_first_k_longitude': tensor.matrix('candidate_first_k_longitude'), 'candidate_last_k_latitude': tensor.matrix('candidate_last_k_latitude'), 'candidate_last_k_longitude': tensor.matrix('candidate_last_k_longitude'), 'candidate_input_time': tensor.ivector('candidate_input_time'), 'candidate_week_of_year': tensor.bvector('candidate_week_of_year'), 'candidate_day_of_week': tensor.bvector('candidate_day_of_week'), 'candidate_qhour_of_day': tensor.bvector('candidate_qhour_of_day') }
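# Quick illustrative check of the dtypes behind the constructors chosen above:
# bvector is int8, wvector int16, ivector int32, so each categorical feature
# gets the smallest integer type that can hold its cardinality.
from theano import tensor
assert tensor.bvector().dtype == 'int8'
assert tensor.wvector().dtype == 'int16'
assert tensor.ivector().dtype == 'int32'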
def get_updates_functions(self): tind = T.ivector('ind') if self.NMF_updates == 'beta': print "Standard rules for beta-divergence" H_update = T.set_subtensor(self.H[tind[3]:tind[4], ], updates.beta_H(self.X_buff[tind[1]:tind[2], ], self.W[tind[0]], self.H[tind[3]:tind[4], ], self.beta)) W_update = T.set_subtensor(self.W[tind[0]], updates.beta_W(self.X_buff[tind[1]:tind[2], ], self.W[tind[0]], self.H[tind[3]:tind[4], ], self.beta)) self.trainH = theano.function(inputs=[tind], outputs=[], updates={self.H: H_update}, name="trainH", allow_input_downcast=True) self.trainW = theano.function(inputs=[tind], outputs=[], updates={self.W: W_update}, name="trainW", allow_input_downcast=True) if self.NMF_updates == 'groupNMF': tcomp = T.ivector('comp') tlambda = T.fvector('lambda') tcard = T.bvector('card') print "Group NMF with class specific rules for beta-divergence" if self.dist_mode=='iter': tparams = [tind, tcomp, tlambda, tcard] print "Compute contraint distances once per iteration" H_update = T.set_subtensor(self.H[tind[3]:tind[4], ], updates.group_H(self.X_buff[tind[1]:tind[2], ], self.W[tind[0]], self.H, self.beta, tparams)) W_update = T.set_subtensor(self.W[tind[0]], updates.group_W_nosum(self.X_buff[tind[1]:tind[2], ], self.W, self.H[tind[3]:tind[4], ], self.cls_sums[tind[5]], self.ses_sums[tind[6]], self.beta, tparams)) self.trainH = theano.function(inputs=[tind, tcomp, tlambda, tcard], outputs=[], updates={self.H: H_update}, name="trainH", on_unused_input='ignore', allow_input_downcast=True) self.trainW = theano.function(inputs=[tind, tcomp, tlambda, tcard], outputs=[], updates={self.W: W_update}, name="trainW", on_unused_input='ignore', allow_input_downcast=True) else: print "Compute contraint distances at each segment update" tSc = T.ivector('Sc') tCs = T.ivector('Cs') tparams = [tind, tcomp, tlambda, tSc, tCs, tcard] H_update = T.set_subtensor(self.H[tind[3]:tind[4], ], updates.group_H(self.X_buff[tind[1]:tind[2], ], self.W[tind[0]], self.H, self.beta, tparams)) W_update = T.set_subtensor(self.W[tind[0]], updates.group_W(self.X_buff[tind[1]:tind[2], ], self.W, self.H[tind[3]:tind[4], ], self.beta, tparams)) self.trainH = theano.function(inputs=[tind, tcomp, tlambda, tSc, tCs, tcard], outputs=[], updates={self.H: H_update}, name="trainH", on_unused_input='ignore', allow_input_downcast=True) self.trainW = theano.function(inputs=[tind, tcomp, tlambda, tSc, tCs, tcard], outputs=[], updates={self.W: W_update}, name="trainW", on_unused_input='ignore', allow_input_downcast=True) if self.NMF_updates == 'noiseNMF': tcomp = T.ivector('comp') tlambda = T.fvector('lambda') tcard = T.bvector('card') print "Group NMF with noise reference rules for beta-divergence" tSc = T.ivector('Sc') tCs = T.ivector('Cs') tparams = [tind, tcomp, tlambda, tSc, tCs, tcard] H_update = T.set_subtensor(self.H[tind[3]:tind[4], ], updates.group_H(self.X_buff[tind[1]:tind[2], ], self.W[tind[0]], self.H, self.beta, tparams)) W_update = T.set_subtensor(self.W[tind[0]], updates.noise_W(self.X_buff[tind[1]:tind[2], ], self.W, self.Wn, self.H[tind[3]:tind[4], ], self.beta, tparams)) self.trainH = theano.function(inputs=[tind, tcomp, tlambda, tSc, tCs, tcard], outputs=[], updates={self.H: H_update}, name="trainH", on_unused_input='ignore', allow_input_downcast=True) self.trainW = theano.function(inputs=[tind, tcomp, tlambda, tSc, tCs, tcard], outputs=[], updates={self.W: W_update}, name="trainW", on_unused_input='ignore', allow_input_downcast=True)
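# Minimal sketch (hypothetical shapes and a stand-in update rule) of the
# slice-update pattern used by trainH/trainW above: build the new tensor with
# T.set_subtensor and install it as the update of the shared variable.
import numpy as np
import theano
import theano.tensor as T

H = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX))
tind = T.ivector('ind')                    # [start, stop] rows to update
new_rows = H[tind[0]:tind[1]] + 1.0        # placeholder for the NMF rule
step = theano.function([tind], [],
                       updates={H: T.set_subtensor(H[tind[0]:tind[1]], new_rows)},
                       allow_input_downcast=True)
step([1, 3])                               # rows 1..2 are modified in place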
def create_iter_functions(dataset, output_layer, batch_size=MINIBATCH_SIZE ): print("Creating IterFunctions...") batch_index = T.iscalar('batch_index') X_batch = T.imatrix('x') # See http://stackoverflow.com/questions/25166657/index-gymnastics-inside-a-theano-function # And https://bitbucket.org/kostialopuhin/word-models/src/ba4b00bb03c7eee83b11dc729fd4f6a58ab21fb6/word_embeddings.py?at=default vectors = dataset['language']['vectors'] #X_batch_flat_vectors = vectors[X_batch].reshape( (X_batch.shape[0], -1) ) # next line is more explicit, for safety X_batch_flat_vectors = vectors[X_batch].reshape( (X_batch.shape[0], vectors.shape[1]*X_batch.shape[1] ) ) #Y_batch = T.ivector('y') Y_batch = T.bvector('y') # This is smaller... batch_slice = slice( batch_index * batch_size, (batch_index + 1) * batch_size ) # Output layer vector position assignment : # a = NotMissing # b = Missing (complex) # c-x = Missing a simple word (take shift into account) def loss(output): # This pulls out log(output) at the correct index position for each element of the mini-batch, # and takes the mean return -T.mean(T.log(output)[T.arange(Y_batch.shape[0]), Y_batch]) loss_train = loss(output_layer.get_output(X_batch_flat_vectors)) loss_eval = loss(output_layer.get_output(X_batch_flat_vectors, deterministic=True)) # deterministic=True turns off dropout # But this (for the first runs) easy to model as a soft-max thing # from 0=(nogap), 1=(complex), 2..(small_limit+2)=small-word pred = T.argmax( output_layer.get_output(X_batch_flat_vectors, deterministic=True), axis=1 ) accuracy = T.mean(T.eq(pred, Y_batch), dtype=theano.config.floatX) # Would otherwise use float64 all_params = lasagne.layers.get_all_params(output_layer) #updates = lasagne.updates.nesterov_momentum( # loss_train, all_params, learning_rate, momentum #) #def adagrad(loss, all_params, learning_rate=1.0, epsilon=1e-6): #updates = lasagne.updates.adagrad( # loss_train, all_params #, learning_rate, momentum #) #def adadelta(loss, all_params, learning_rate=1.0, rho=0.95, epsilon=1e-6): updates = lasagne.updates.adadelta( loss_train, all_params #, learning_rate, momentum ) iters={} if 'train' in dataset: d=dataset['train'] iters['train'] = theano.function( [batch_index], loss_train, updates=updates, givens={ X_batch: d['X'][batch_slice], Y_batch: d['Y'][batch_slice], }, ) if 'valid' in dataset: d=dataset['valid'] iters['valid'] = theano.function( [batch_index], [loss_eval, accuracy], givens={ X_batch: d['X'][batch_slice], Y_batch: d['Y'][batch_slice], }, ) if 'test' in dataset: d=dataset['test'] iters['test'] = theano.function( [batch_index], [loss_eval, accuracy], givens={ X_batch: d['X'][batch_slice], Y_batch: d['Y'][batch_slice], }, ) return iters
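# Standalone numeric check (made-up probabilities) of the indexing trick inside
# loss() above: advanced indexing picks log p[i, Y[i]] for every row of the batch.
import numpy as np
import theano
import theano.tensor as T

p = T.matrix('p')        # batch x n_classes softmax output
y = T.bvector('y')       # int8 class labels
nll = -T.mean(T.log(p)[T.arange(y.shape[0]), y])
f = theano.function([p, y], nll)
print(f(np.array([[0.9, 0.1], [0.2, 0.8]], dtype=theano.config.floatX),
        np.array([0, 1], dtype='int8')))   # -(log 0.9 + log 0.8) / 2 ~= 0.164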
def construct_network(context,characters,hidden): print "Setting up memory..." X = T.bmatrix('X') Y = T.bvector('Y') zeros = np.zeros(characters,dtype=np.int8) zeros[0] = 1 zeros[1] = 1 alpha = T.cast(T.fscalar('alpha'),dtype=theano.config.floatX) lr = T.cast(T.fscalar('lr'),dtype=theano.config.floatX) Ws_char_to_hidden = [ U.create_shared( U.initial_weights(characters,hidden), name='char[%d]'%i ) for i in xrange(context) ] mat = Ws_char_to_hidden[0].get_value() mat[0] = 0 Ws_char_to_hidden[0].set_value(mat) W_hidden_to_hidden_i = U.create_shared(U.initial_weights(hidden,hidden) + np.eye(hidden)) b_hidden_i = U.create_shared(U.initial_weights(hidden)) W_hidden_to_hidden_o = U.create_shared(U.initial_weights(hidden,hidden) + np.eye(hidden)) b_hidden_o = U.create_shared(U.initial_weights(hidden)) W_hidden_to_predict = U.create_shared(U.initial_weights(hidden,characters)) b_predict = U.create_shared(U.initial_weights(characters)) W_predict_to_hidden = U.create_shared(U.initial_weights(characters,hidden)) gen_weight_mask = U.create_shared(zeros,name='mask') print "Constructing graph..." hidden_inputs = make_char_outputs(X,Ws_char_to_hidden) hidden_outputs,predictions = make_hidden_predict_outputs( hidden,characters, hidden_inputs, gen_weight_mask[X[:,0]], W_hidden_to_hidden_i, b_hidden_i, W_hidden_to_hidden_o, b_hidden_o, W_hidden_to_predict, b_predict, W_predict_to_hidden ) weights = Ws_char_to_hidden + [ W_hidden_to_hidden_i, b_hidden_i, W_hidden_to_hidden_o, b_hidden_o, W_hidden_to_predict, b_predict, W_predict_to_hidden ] cost = -T.mean(T.log(predictions)[T.arange(Y.shape[0]),Y]) gparams = T.grad(cost,weights) deltas = [ U.create_shared(np.zeros(w.get_value().shape)) for w in weights ] updates = [ ( param, param - ( alpha * delta + gparam * lr ) ) for param,delta,gparam in zip(weights,deltas,gparams) ] + [ ( delta, alpha * delta + gparam * lr) for delta,gparam in zip(deltas,gparams) ] return X,Y,alpha,lr,updates,predictions,weights
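# Hedged sketch of the momentum bookkeeping above: each weight keeps a `delta`
# buffer, the new delta is alpha*delta + lr*grad, and the weight moves by that
# amount. Names and the toy cost are illustrative, not from this repository.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3, dtype=theano.config.floatX))
delta = theano.shared(np.zeros(3, dtype=theano.config.floatX))
x = T.vector('x')
cost = ((w - x) ** 2).sum()
grad = T.grad(cost, w)
alpha, lr = 0.9, 0.1
new_delta = alpha * delta + lr * grad
train = theano.function([x], cost,
                        updates=[(w, w - new_delta), (delta, new_delta)])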
num_units = n_state, nonlinearity = tanh, W = Normal(0.1, 0.0), b = Constant(0.0)) q_values = DenseLayer(dense_2, num_units = n_action, nonlinearity = None, W = Normal(0.1, 0.0), b = Constant(0.0)) return q_values X_next_state = T.fmatrix() X_state = T.fmatrix() X_action = T.bvector() X_reward = T.fvector() X_done = T.bvector() X_action_hot = to_one_hot(X_action, n_action) q_ = q_network(X_state); q = get_output(q_) q_target_ = q_network(X_next_state); q_target = get_output(q_target_) q_max = T.max(q_target, axis=1) action = T.argmax(q, axis=1) mu = theano.function(inputs = [X_state], outputs = action, allow_input_downcast = True) loss = squared_error(X_reward + gamma * q_max * (1.0 - X_done), T.batched_dot(q, X_action_hot))
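# Illustrative check (assuming to_one_hot comes from theano.tensor.extra_ops;
# the script above may import it from elsewhere) of how the one-hot action mask
# picks Q(s, a): batched_dot of the Q matrix with a one-hot row selects the
# Q-value of the chosen action for every sample.
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.extra_ops import to_one_hot

q = T.fmatrix('q')        # batch x n_action Q-values
a = T.bvector('a')        # chosen action per sample, int8
q_sel = T.batched_dot(q, to_one_hot(a, 3))
f = theano.function([q, a], q_sel, allow_input_downcast=True)
print(f(np.array([[1., 2., 3.], [4., 5., 6.]], dtype='float32'),
        np.array([2, 0], dtype='int8')))    # -> [3. 4.]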
def __init__(self,n_hidden,embedding_dimention=50,feature_dimention=61): ##n_in: sequence lstm 的输入维度 ##n_hidden: lstm for candi and zp 的隐层维度 #repre_active = ReLU repre_active = linear self.params = [] self.zp_x_pre = T.matrix("zp_x_pre") self.zp_x_post = T.matrix("zp_x_post") zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre) self.params += zp_nn_pre.params zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post) self.params += zp_nn_post.params danwei = theano.shared(np.eye(8, dtype=theano.config.floatX)) H_pre = zp_nn_pre.all_hidden H_post = zp_nn_post.all_hidden Ws1_pre,heihei = init_weight(n_hidden,n_hidden,pre="Ws1_pre_zp",ones=False) Ws2_pre,heihei = init_weight(8,n_hidden,pre="Ws2_pre_zp",ones=False) self.params += [Ws1_pre,Ws2_pre] A_pre = softmax(T.dot(Ws2_pre,T.dot(Ws1_pre,T.transpose(H_pre)))) P_pre = T.dot(A_pre,T.transpose(A_pre))-danwei f_norm_pre = (P_pre**2).sum() zp_out_pre = T.mean(T.dot(A_pre,H_pre),axis=0) Ws1_post,heihei = init_weight(n_hidden,n_hidden,pre="Ws1_post_zp",ones=False) Ws2_post,heihei = init_weight(8,n_hidden,pre="Ws2_post_zp",ones=False) self.params += [Ws1_post,Ws2_post] A_post = softmax(T.dot(Ws2_post,T.dot(Ws1_post,T.transpose(H_post)))) P_post = T.dot(A_post,T.transpose(A_post))-danwei f_norm_post = (P_post**2).sum() zp_out_post = T.mean(T.dot(A_post,H_post),axis=0) f_norm = f_norm_pre + f_norm_post #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out)) self.zp_out = T.concatenate((zp_out_pre,zp_out_post)) self.zp_out_output = self.zp_out ### get sequence output for NP ### self.np_x_post = T.tensor3("np_x") self.np_x_postc = T.tensor3("np_x") self.np_x_pre = T.tensor3("np_x") self.np_x_prec = T.tensor3("np_x") self.mask_pre = T.matrix("mask") self.mask_prec = T.matrix("mask") self.mask_post = T.matrix("mask") self.mask_postc = T.matrix("mask") self.np_nn_pre = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_pre,self.np_x_prec,self.mask_pre,self.mask_prec) self.params += self.np_nn_pre.params self.np_nn_post = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_post,self.np_x_postc,self.mask_post,self.mask_postc) self.params += self.np_nn_post.params self.np_nn_post_output = self.np_nn_post.nn_out self.np_nn_pre_output = self.np_nn_pre.nn_out self.np_out = T.concatenate((self.np_nn_post_output,self.np_nn_pre_output),axis=1) #np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out) #self.params += np_nn_f.params #np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1]) #self.params += np_nn_b.params #self.bi_np_out = T.concatenate((np_nn_f.all_hidden,np_nn_b.all_hidden[::-1]),axis=1) #self.np_out_output = self.bi_np_out #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output]) self.feature = T.matrix("feature") self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active) self.params += self.feature_layer.params w_attention_zp,b_attention = init_weight(n_hidden*2,1,pre="attention_zp",ones=False) self.params += [w_attention_zp,b_attention] w_attention_np,b_u = init_weight(n_hidden*2,1,pre="attention_np",ones=False) self.params += [w_attention_np] #w_attention_np_rnn,b_u = init_weight(n_hidden*4,1,pre="attention_np_rnn",ones=False) #self.params += [w_attention_np_rnn] w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False) self.params += [w_attention_feature] self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + 
T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention) #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention) self.attention = softmax(T.transpose(self.calcu_attention,axes=(1,0)))[0] self.out = self.attention self.get_out = theano.function(inputs=[self.zp_x_pre,self.zp_x_post,self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature],outputs=[self.out],on_unused_input='warn') l1_norm_squared = sum([(w**2).sum() for w in self.params]) l2_norm_squared = sum([(abs(w)).sum() for w in self.params]) lmbda_l1 = 0.0 #lmbda_l2 = 0.001 lmbda_l2 = 0.0 t = T.bvector() cost = -(T.log((self.out*t).sum())) + f_norm lr = T.scalar() updates = lasagne.updates.sgd(cost, self.params, lr) #updates = lasagne.updates.adadelta(cost, self.params) self.train_step = theano.function( inputs=[self.zp_x_pre,self.zp_x_post,self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature,t,lr], outputs=[cost], on_unused_input='warn', updates=updates)
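# Hedged numpy restatement of the f_norm terms above: A is the r x T attention
# matrix, and summing the squares of (A A^T - I) gives the squared Frobenius
# penalty that pushes the r attention rows toward orthogonality.
import numpy as np

def frobenius_penalty(A):
    r = A.shape[0]
    P = A.dot(A.T) - np.eye(r)
    return (P ** 2).sum()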
def __init__( self, rng, batchsize=100, activation=relu ): import char_load (num_sent, char_cnt, word_cnt, max_word_len, max_sen_len, \ k_chr, k_wrd, x_chr, x_wrd, y) = char_load.read("tweets_clean.txt") dim_word = 30 dim_char = 5 cl_word = 300 cl_char = 50 k_word = k_wrd k_char = k_chr data_train_word, \ data_test_word, \ data_train_char, \ data_test_char, \ target_train, \ target_test \ = train_test_split(x_wrd, x_chr, y, random_state=1234, test_size=0.1) x_train_word = theano.shared(np.asarray(data_train_word, dtype='int16'), borrow=True) x_train_char = theano.shared(np.asarray(data_train_char, dtype='int16'), borrow=True) y_train = theano.shared(np.asarray(target_train, dtype='int8'), borrow=True) x_test_word = theano.shared(np.asarray(data_test_word, dtype='int16'), borrow=True) x_test_char = theano.shared(np.asarray(data_test_char, dtype='int16'), borrow=True) y_test = theano.shared(np.asarray(target_test, dtype='int8'), borrow=True) self.n_train_batches = x_train_word.get_value(borrow=True).shape[0] / batchsize self.n_test_batches = x_test_word.get_value(borrow=True).shape[0] / batchsize """symbol definition""" index = T.iscalar() x_wrd = T.wmatrix('x_wrd') x_chr = T.wtensor3('x_chr') y = T.bvector('y') train = T.iscalar('train') """network definition""" layer_char_embed_input = x_chr # .reshape((batchsize, max_sen_len, max_word_len)) layer_char_embed = EmbedIDLayer( rng, layer_char_embed_input, n_input=char_cnt, n_output=dim_char ) layer1_input = layer_char_embed.output.reshape( (batchsize * max_sen_len, 1, max_word_len, dim_char) ) layer1 = ConvolutionalLayer( rng, layer1_input, filter_shape=(cl_char, 1, k_char, dim_char), # cl_charフィルタ数 image_shape=(batchsize * max_sen_len, 1, max_word_len, dim_char) ) layer2 = MaxPoolingLayer( layer1.output, poolsize=(max_word_len - k_char + 1, 1) ) layer_word_embed_input = x_wrd # .reshape((batchsize, max_sen_len)) layer_word_embed = EmbedIDLayer( rng, layer_word_embed_input, n_input=word_cnt, n_output=dim_word ) layer3_word_input = layer_word_embed.output.reshape((batchsize, 1, max_sen_len, dim_word)) layer3_char_input = layer2.output.reshape((batchsize, 1, max_sen_len, cl_char)) layer3_input = T.concatenate( [layer3_word_input, layer3_char_input], axis=3 ) # .reshape((batchsize, 1, max_sen_len, dim_word+cl_char)) layer3 = ConvolutionalLayer( rng, layer3_input, filter_shape=(cl_word, 1, k_word, dim_word + cl_char), # 1は入力チャネル数 image_shape=(batchsize, 1, max_sen_len, dim_word + cl_char), activation=activation ) layer4 = MaxPoolingLayer( layer3.output, poolsize=(max_sen_len - k_word + 1, 1) ) layer5_input = layer4.output.reshape((batchsize, cl_word)) layer5 = FullyConnectedLayer( rng, dropout(rng, layer5_input, train), n_input=cl_word, n_output=50, activation=activation ) layer6_input = layer5.output layer6 = FullyConnectedLayer( rng, dropout(rng, layer6_input, train, p=0.1), n_input=50, n_output=2, activation=None ) result = Result(layer6.output, y) loss = result.negative_log_likelihood() accuracy = result.accuracy() params = layer6.params \ + layer5.params \ + layer3.params \ + layer_word_embed.params \ + layer1.params \ + layer_char_embed.params updates = RMSprop(learning_rate=0.001, params=params).updates(loss) self.train_model = theano.function( inputs=[index], outputs=[loss, accuracy], updates=updates, givens={ x_wrd: x_train_word[index * batchsize: (index + 1) * batchsize], x_chr: x_train_char[index * batchsize: (index + 1) * batchsize], y: y_train[index * batchsize: (index + 1) * batchsize], train: np.cast['int32'](1) } ) self.test_model 
= theano.function( inputs=[index], outputs=[loss, accuracy], givens={ x_wrd: x_test_word[index * batchsize: (index + 1) * batchsize], x_chr: x_test_char[index * batchsize: (index + 1) * batchsize], y: y_test[index * batchsize: (index + 1) * batchsize], train: np.cast['int32'](0) } )
def __init__(self, n_hidden, embedding_dimention=50): ##n_in: sequence lstm 的输入维度 ##n_hidden: lstm for candi and zp 的隐层维度 ##n_hidden_sequence: sequence lstm的隐层维度 因为要同zp的结合做dot,所以其维度要是n_hidden的2倍 ## 即 n_hidden_sequence = 2 * n_hidden self.params = [] self.zp_x_pre = T.matrix("zp_x_pre") self.zp_x_post = T.matrix("zp_x_post") #self.zp_x_pre_dropout = _dropout_from_layer(self.zp_x_pre) #self.zp_x_post_dropout = _dropout_from_layer(self.zp_x_post) zp_nn_pre = GRU(embedding_dimention, n_hidden, self.zp_x_pre) #zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre_dropout) self.params += zp_nn_pre.params zp_nn_post = GRU(embedding_dimention, n_hidden, self.zp_x_post) #zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post_dropout) self.params += zp_nn_post.params self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out)) self.ZP_layer = Layer(n_hidden * 2, n_hidden * 2, self.zp_out, ReLU) self.zp_out_output = self.ZP_layer.output #self.zp_out_dropout = _dropout_from_layer(T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))) self.get_zp_out = theano.function( inputs=[self.zp_x_pre, self.zp_x_post], outputs=[self.ZP_layer.output]) ### get sequence output for NP ### self.np_x = T.tensor3("np_x") self.np_x_post = T.tensor3("np_x") self.np_x_pre = T.tensor3("np_x") #self.np_x_dropout = _dropout_from_layer(self.np_x) self.mask = T.matrix("mask") self.mask_pre = T.matrix("mask") self.mask_post = T.matrix("mask") self.np_nn_x = RNN_batch(embedding_dimention, n_hidden, self.np_x, self.mask) self.params += self.np_nn_x.params self.np_nn_pre = GRU_batch(embedding_dimention, n_hidden, self.np_x_pre, self.mask_pre) self.params += self.np_nn_pre.params self.np_nn_post = GRU_batch(embedding_dimention, n_hidden, self.np_x_post, self.mask_post) self.params += self.np_nn_post.params #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x,self.mask) #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x_dropout,self.mask) #self.params += self.np_nn_out.params #self.np_out = self.np_nn.nn_out self.np_nn_x_output = (self.np_nn_x.all_hidden).mean(axis=1) self.np_nn_post_output = self.np_nn_post.nn_out self.np_nn_pre_output = self.np_nn_pre.nn_out self.np_out = T.concatenate( (self.np_nn_x_output, self.np_nn_post_output, self.np_nn_pre_output), axis=1) self.NP_layer = Layer(n_hidden * 3, n_hidden * 2, self.np_out, ReLU) self.np_out_output = self.NP_layer.output self.np_x_head = T.transpose(self.np_x, axes=(1, 0, 2))[-1] self.get_np_head = theano.function(inputs=[self.np_x], outputs=[self.np_x_head]) self.get_np = theano.function(inputs=[ self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[self.np_out]) self.get_np_out = theano.function(inputs=[ self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[self.np_out_output]) w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_hidden", ones=False) self.params += [w_attention_zp, b_attention] w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False) self.params += [w_attention_np] self.calcu_attention = tanh( T.dot(self.np_out_output, w_attention_np) + T.dot(self.zp_out_output, w_attention_zp) + b_attention) self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0] self.get_attention = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[self.attention]) new_zp = 
T.sum(self.attention[:, None] * self.np_x_head, axis=0) self.get_new_zp = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[new_zp]) #### *** HOP *** #### self.w_hop_zp, self.b_hop_zp = init_weight(n_hidden * 2 + embedding_dimention, n_hidden * 2, pre="hop_") self.params += [self.w_hop_zp, self.b_hop_zp] ## hop 1 ## self.zp_hop_1_init = T.concatenate( (zp_nn_pre.nn_out, zp_nn_post.nn_out, new_zp)) self.zp_hop_1 = ReLU( T.dot(self.zp_hop_1_init, self.w_hop_zp) + self.b_hop_zp) self.calcu_attention_hop_1 = tanh( T.dot(self.np_out_output, w_attention_np) + T.dot(self.zp_hop_1, w_attention_zp) + b_attention) self.attention_hop_1 = softmax( T.transpose(self.calcu_attention_hop_1, axes=(1, 0)))[0] self.get_attention_hop_1 = theano.function( inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[self.attention_hop_1]) self.out = self.attention_hop_1 self.get_out = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post ], outputs=[self.out]) l1_norm_squared = sum([(w**2).sum() for w in self.params]) l2_norm_squared = sum([(abs(w)).sum() for w in self.params]) lmbda_l1 = 0.0 #lmbda_l2 = 0.001 lmbda_l2 = 0.0 t = T.bvector() cost = -(T.log((self.out * t).sum())) #cost = -(T.log((self.out_dropout*t).sum())) #cost = 1-((self.out*t).sum()) lr = T.scalar() #grads = T.grad(cost, self.params) #updates = [(param, param-lr*grad) # for param, grad in zip(self.params, grads)] #updates = lasagne.updates.sgd(cost, self.params, lr) updates = lasagne.updates.adadelta(cost, self.params) self.train_step = theano.function(inputs=[ self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre, self.mask_post, t, lr ], outputs=[cost], on_unused_input='warn', updates=updates)
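# Illustrative numpy version of the attention pooling that produces new_zp
# above: the weighted average of the candidate head vectors under the
# attention distribution.
import numpy as np

att = np.array([0.2, 0.8])                    # attention over two candidates
heads = np.array([[1.0, 0.0], [0.0, 1.0]])    # candidate head vectors
new_zp = (att[:, None] * heads).sum(axis=0)   # -> [0.2, 0.8]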
def build_loss(self, env_spec, policy): obs = env_spec.observation_space.new_tensor_variable('obs', extra_dims=1) next_obs = env_spec.observation_space.new_tensor_variable('next_obs', extra_dims=1) act = env_spec.action_space.new_tensor_variable('act', extra_dims=1) ret = T.vector('disc_n_return') term = T.bvector('terminal') if self.prioritized_replay: isw = T.vector('importance_sample_weights') z_np = np.linspace(self.V_min, self.V_max, policy.n_atoms, dtype=theano.config.floatX) z = theano.shared(z_np) z_contracted = theano.shared( (self.discount**self.reward_horizon) * z_np) policy.incorporate_z( z) # (policy sets n_atoms, but algo sets vmin,vmax) delta_z = (self.V_max - self.V_min) / (policy.n_atoms - 1) # Yeah this is difficult to read and know if it's right. # (tested it vs numpy loop and numpy vectorized form in another script) z_contracted_bc = z_contracted.dimshuffle('x', 0) # (bc: broadcast) z_cntrct_term = (1 - term.dimshuffle(0, 'x')) * z_contracted_bc # z_cntrct_term is 2D tensor, with contracted z-values repeated for # each data point (each row), and zero'd wherever terminal is True ret_bc = ret.dimshuffle(0, 'x') z_next = T.clip(ret_bc + z_cntrct_term, self.V_min, self.V_max) # each row (data entry) in z_next had all z_values shifted by # corresponding return # must compare every pair of base z atom with next z atom z_next_bc = z_next.dimshuffle(0, 1, 'x') z_bc = z.dimshuffle('x', 'x', 0) abs_diff_on_delta = abs(z_next_bc - z_bc) / delta_z projection_coeffs = T.clip(1 - abs_diff_on_delta, 0, 1) # (mostly 0's) # projection coefficients is a 3-D tensor. # dim-0: independent data entries (gets scanned/looped over in batched_dot) # dim-1: corresponds to z_next atoms (gets summed over in batched_dot) # dim-2: corresponds to base z atoms (becomes dim-1 after batched_dot) if self.double_dqn: next_act = policy.actions_sym(next_obs) next_Z = policy.target_Z_at_a_sym(next_obs, next_act) else: next_Z = policy.target_max_Z_sym(next_obs) # lower case z refers to the domain of atoms, # capital Z refers to the probabilities for given state and action # projected_next_Z = T.batched_dot(next_Z, projection_coeffs) # NOTE: use of batched_dot somehow breaks the gradient (Theano 0.9); # so, do the broadcasting and summing manually (until Theano 1.0) next_Z_bc = T.shape_padright(next_Z) next_Z_x_coeff = projection_coeffs * next_Z_bc projected_next_Z = next_Z_x_coeff.sum(axis=1) predicted_Z = policy.Z_at_a_sym(obs, act) predicted_Z = T.clip(predicted_Z, 1e-6, 1) # (NaN-guard) losses = -T.sum(projected_next_Z * T.log(predicted_Z), axis=1) # CrossEnt if self.prioritized_replay: losses = isw * losses loss = T.mean(losses) projected_next_Z = T.clip(projected_next_Z, 1e-6, 1) # (NaN-guard) KL_divs = T.sum( projected_next_Z * T.log(projected_next_Z / predicted_Z), axis=1, ) KL_divs = T.clip(KL_divs, 1e-6, 1e6) # avoid < 0 from NaN-guard input_list = [obs, next_obs, act, ret, term] if self.prioritized_replay: input_list.append(isw) return input_list, loss, KL_divs
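# Illustrative numpy version (tiny support, made-up shift) of the projection
# coefficients built above: each shifted atom spreads its probability onto the
# two nearest fixed atoms with weight clip(1 - |dz| / delta_z, 0, 1).
import numpy as np

z = np.linspace(-1.0, 1.0, 5)                    # fixed support, delta_z = 0.5
delta_z = z[1] - z[0]
z_next = np.clip(0.3 + 0.9 * z, z[0], z[-1])     # contracted and shifted atoms
coeffs = np.clip(1.0 - np.abs(z_next[:, None] - z[None, :]) / delta_z, 0.0, 1.0)
assert np.allclose(coeffs.sum(axis=1), 1.0)      # probability mass is preserved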
def fit(self, data, test=None, sample_store=10000000): ''' Trains the network. Parameters -------- data : pandas.DataFrame Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps). It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties). sample_store : int If additional negative samples are used (n_sample > 0), the efficiency of GPU utilization can be sped up, by precomputing a large batch of negative samples (and recomputing when necessary). This parameter regulizes the size of this precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in the RAM. For the most efficient computation, a balance must be found between storing few examples and constantly interrupting GPU computations for a short time vs. computing many examples and interrupting GPU computations for a long time (but rarely). ''' self.predict = None self.error_during_train = False itemids = data[self.item_key].unique() self.n_items = len(itemids) self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids) data = pd.merge(data, pd.DataFrame({ self.item_key: itemids, 'ItemIdx': self.itemidmap[itemids].values }), on=self.item_key, how='inner') offset_sessions = self.init_data(data) if self.n_sample: pop = data.groupby(self.item_key).size() pop = pop[self.itemidmap.index.values].values**self.sample_alpha pop = pop.cumsum() / pop.sum() pop[-1] = 1 if sample_store: generate_length = sample_store // self.n_sample if generate_length <= 1: sample_store = 0 print('No example store was used') else: neg_samples = self.generate_neg_samples( pop, generate_length) sample_pointer = 0 else: print('No example store was used') X = T.ivector() Y = T.ivector() M = T.iscalar() R = T.bvector() H_new, Y_pred, sparams, full_params, sidxs = self.model( X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed) cost = (M / self.batch_size) * self.loss_function(Y_pred, M) params = [ self.Wx if self.embedding or self.constrained_embedding else self.Wx[1:], self.Wh, self.Wrz, self.Bh ] updates = self.RMSprop(cost, params, full_params, sparams, sidxs) for i in range(len(self.H)): updates[self.H[i]] = H_new[i] train_function = function(inputs=[X, Y, M, R], outputs=cost, updates=updates, allow_input_downcast=True) base_order = np.argsort( data.groupby(self.session_key)[self.time_key].min().values ) if self.time_sort else np.arange(len(offset_sessions) - 1) data_items = data.ItemIdx.values for epoch in range(self.n_epochs): sc = time.clock() st = time.time() for i in range(len(self.layers)): self.H[i].set_value(np.zeros((self.batch_size, self.layers[i]), dtype=theano.config.floatX), borrow=True) c = [] cc = [] session_idx_arr = np.random.permutation( len(offset_sessions) - 1) if self.train_random_order else base_order iters = np.arange(self.batch_size) maxiter = iters.max() start = offset_sessions[session_idx_arr[iters]] end = offset_sessions[session_idx_arr[iters] + 1] finished = False while not finished: minlen = (end - start).min() out_idx = data_items[start] for i in range(minlen - 1): in_idx = out_idx out_idx = data_items[start + i + 1] if self.n_sample: if sample_store: if sample_pointer == generate_length: neg_samples = self.generate_neg_samples( pop, generate_length) sample_pointer = 0 sample = neg_samples[sample_pointer] sample_pointer += 1 else: 
sample = self.generate_neg_samples(pop, 1) y = np.hstack([out_idx, sample]) else: y = out_idx if self.n_sample: if sample_pointer == generate_length: generate_samples() sample_pointer = 0 sample_pointer += 1 reset = (start + i + 1 == end - 1) cost = train_function(in_idx, y, len(iters), reset) c.append(cost) cc.append(len(iters)) if np.isnan(cost): print(str(epoch) + ': NaN error!') self.error_during_train = True return start = start + minlen - 1 finished_mask = (end - start <= 1) n_finished = finished_mask.sum() iters[finished_mask] = maxiter + np.arange(1, n_finished + 1) maxiter += n_finished valid_mask = (iters < len(offset_sessions) - 1) n_valid = valid_mask.sum() if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0): finished = True break mask = finished_mask & valid_mask sessions = session_idx_arr[iters[mask]] start[mask] = offset_sessions[sessions] end[mask] = offset_sessions[sessions + 1] iters = iters[valid_mask] start = start[valid_mask] end = end[valid_mask] if n_valid < len(valid_mask): for i in range(len(self.H)): tmp = self.H[i].get_value(borrow=True) tmp = tmp[valid_mask] self.H[i].set_value(tmp, borrow=True) c = np.array(c) cc = np.array(cc) avgc = np.sum(c * cc) / np.sum(cc) if np.isnan(avgc): print('Epoch {}: NaN error!'.format(str(epoch))) self.error_during_train = True return print('Epoch{}\tloss: {:.6f}'.format(epoch, avgc), 'time: ', (time.clock() - sc), 'c / ', (time.time() - st), 's')
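# Hedged sketch of popularity-proportional negative sampling consistent with
# the cumulative `pop` array built in fit() above; the real
# generate_neg_samples may differ. Uniform draws are mapped through the
# cumulative distribution to item ids.
import numpy as np

def sample_negatives(pop_cumulative, n_sample):
    return np.searchsorted(pop_cumulative, np.random.rand(n_sample))

neg = sample_negatives(np.array([0.5, 0.8, 1.0]), 4)   # ids drawn from {0, 1, 2}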
def _init_model(self, in_size, out_size, slot_sizes, db, \ n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \ inputtype='full', sl='e2e', rl='e2e'): self.in_size = in_size self.out_size = out_size self.slot_sizes = slot_sizes self.batch_size = batch_size self.learning_rate = learning_rate_rl self.n_hid = n_hid self.r_hid = self.n_hid self.sl = sl self.rl = rl table = db.table counts = db.counts m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots] prior = [db.priors[s] for s in dialog_config.inform_slots] unknown = [db.unks[s] for s in dialog_config.inform_slots] ids = [db.ids[s] for s in dialog_config.inform_slots] input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \ T.btensor3('am'), T.fvector('r') T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(counts) db_index_var = T.imatrix('db') db_index_switch = T.bvector('s') l_mask_in = L.InputLayer(shape=(None,None), input_var=turn_mask) flat_mask = T.reshape(turn_mask, (turn_mask.shape[0]*turn_mask.shape[1],1)) def _smooth(p): p_n = p+EPS return p_n/(p_n.sum(axis=1)[:,np.newaxis]) def _add_unk(p,m,N): # p: B x V, m- num missing, N- total, p0: 1 x V t_unk = T.as_tensor_variable(float(m)/N) ps = p*(1.-t_unk) return T.concatenate([ps, T.tile(t_unk, (ps.shape[0],1))], axis=1) def kl_divergence(p,q): p_n = _smooth(p) return -T.sum(q*T.log(p_n), axis=1) # belief tracking l_in = L.InputLayer(shape=(None,None,self.in_size), input_var=input_var) p_vars = [] pu_vars = [] phi_vars = [] p_targets = [] phi_targets = [] hid_in_vars = [] hid_out_vars = [] bt_loss = T.as_tensor_variable(0.) kl_loss = [] x_loss = [] self.trackers = [] for i,s in enumerate(dialog_config.inform_slots): hid_in = T.fmatrix('h') l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in, \ mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D l_b_in = L.ReshapeLayer(l_rnn, (input_var.shape[0]*input_var.shape[1], self.r_hid)) # BH x D hid_out = L.get_output(l_rnn)[:,-1,:] p_targ = T.ftensor3('p_target_'+s) p_t = T.reshape(p_targ, (p_targ.shape[0]*p_targ.shape[1],self.slot_sizes[i])) phi_targ = T.fmatrix('phi_target'+s) phi_t = T.reshape(phi_targ, (phi_targ.shape[0]*phi_targ.shape[1], 1)) l_b = L.DenseLayer(l_b_in, self.slot_sizes[i], nonlinearity=lasagne.nonlinearities.softmax) l_phi = L.DenseLayer(l_b_in, 1, nonlinearity=lasagne.nonlinearities.sigmoid) phi = T.clip(L.get_output(l_phi), 0.01, 0.99) p = L.get_output(l_b) p_u = _add_unk(p, m_unk[i], db.N) kl_loss.append(T.sum(flat_mask.flatten()*kl_divergence(p, p_t))/T.sum(flat_mask)) x_loss.append(T.sum(flat_mask*lasagne.objectives.binary_crossentropy(phi,phi_t))/ T.sum(flat_mask)) bt_loss += kl_loss[-1] + x_loss[-1] p_vars.append(p) pu_vars.append(p_u) phi_vars.append(phi) p_targets.append(p_targ) phi_targets.append(phi_targ) hid_in_vars.append(hid_in) hid_out_vars.append(hid_out) self.trackers.append(l_b) self.trackers.append(l_phi) self.bt_params = L.get_all_params(self.trackers) def check_db(pv, phi, Tb, N): O = T.alloc(0.,pv[0].shape[0],Tb.shape[0]) # BH x T.shape[0] for i,p in enumerate(pv): p_dc = T.tile(phi[i], (1, Tb.shape[0])) O += T.log(p_dc*(1./db.table.shape[0]) + \ (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i])) Op = T.exp(O)#+EPS # BH x T.shape[0] Os = T.sum(Op, axis=1)[:,np.newaxis] # BH x 1 return Op/Os def entropy(p): p = _smooth(p) return -T.sum(p*T.log(p), axis=-1) def weighted_entropy(p,q,p0,unks,idd): w = T.dot(idd,q.transpose()) # Pi x BH u = p0[np.newaxis,:]*(q[:,unks].sum(axis=1)[:,np.newaxis]) # BH x Pi p_tilde = w.transpose()+u return entropy(p_tilde) p_db = check_db(pu_vars, phi_vars, T_var, N_var) # BH x T.shape[0] if inputtype=='entropy': H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \ for i,pv in enumerate(p_vars)] H_db = entropy(p_db) phv = [ph[:,0] for ph in phi_vars] t_in = T.stacklists(H_vars+phv+[H_db]).transpose() # BH x 2M+1 t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x 2M+1 l_in_pol = L.InputLayer( shape=(None,None,2*len(dialog_config.inform_slots)+1), \ input_var=t_in_resh) else: in_reshaped = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1], \ input_var.shape[2])) prev_act = in_reshaped[:,-len(dialog_config.inform_slots):] t_in = T.concatenate(pu_vars+phi_vars+[p_db,prev_act], axis=1) # BH x D-sum+A t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \ t_in.shape[1])) # B x H x D-sum l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \ 3*len(dialog_config.inform_slots)+ \ table.shape[0]), input_var=t_in_resh) pol_in = T.fmatrix('pol-h') l_pol_rnn = L.GRULayer(l_in_pol, n_hid, hid_init=pol_in, mask_input=l_mask_in, grad_clipping=10.) 
# B x H x D pol_out = L.get_output(l_pol_rnn)[:,-1,:] l_den_in = L.ReshapeLayer(l_pol_rnn, (turn_mask.shape[0]*turn_mask.shape[1], n_hid)) # BH x D l_out = L.DenseLayer(l_den_in, self.out_size, \ nonlinearity=lasagne.nonlinearities.softmax) # BH x A self.network = l_out self.pol_params = L.get_all_params(self.network) self.params = self.bt_params + self.pol_params # db loss p_db_reshaped = T.reshape(p_db, (turn_mask.shape[0],turn_mask.shape[1],table.shape[0])) p_db_final = p_db_reshaped[:,-1,:] # B x T.shape[0] p_db_final = _smooth(p_db_final) ix = T.tile(T.arange(p_db_final.shape[0]),(db_index_var.shape[1],1)).transpose() sample_probs = p_db_final[ix,db_index_var] # B x K if dialog_config.SUCCESS_MAX_RANK==1: log_db_probs = T.log(sample_probs).sum(axis=1) else: cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \ outputs_info=T.zeros_like(sample_probs[:,0]), \ sequences=sample_probs[:,:-1].transpose()) cum_probs = T.clip(cum_probs.transpose(), 0., 1.-1e-5) # B x K-1 log_db_probs = T.log(sample_probs).sum(axis=1) - T.log(1.-cum_probs).sum(axis=1) # B log_db_probs = log_db_probs * db_index_switch # rl probs = L.get_output(self.network) # BH x A probs = _smooth(probs) out_probs = T.reshape(probs, (turn_mask.shape[0],turn_mask.shape[1],self.out_size)) # B x H x A log_probs = T.log(out_probs) act_probs = (log_probs*act_mask).sum(axis=2) # B x H ep_probs = (act_probs*turn_mask).sum(axis=1) # B H_probs = -T.sum(T.sum(out_probs*log_probs,axis=2),axis=1) # B self.act_loss = -T.mean(ep_probs*reward_var) self.db_loss = -T.mean(log_db_probs*reward_var) self.reg_loss = -T.mean(ment*H_probs) self.loss = self.act_loss + self.db_loss + self.reg_loss self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \ pol_in] + hid_in_vars self.obj_fn = theano.function(self.inps, self.loss, on_unused_input='warn') self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \ [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn') self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss], on_unused_input='warn') self._rl_train_fn(self.learning_rate) ## sl sl_loss = 0. + bt_loss - T.mean(ep_probs) if self.sl=='e2e': sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) elif self.sl=='bel': sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) else: sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \ learning_rate=learning_rate_sl, epsilon=1e-4) sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates) sl_inps = [input_var, turn_mask, act_mask, pol_in] + p_targets + phi_targets + hid_in_vars self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \ on_unused_input='warn') self.sl_obj_fn = theano.function(sl_inps, sl_loss, on_unused_input='warn')
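# Small numeric check (illustrative) of the _add_unk helper defined above:
# mass m/N is reserved for the unknown value, the known distribution is scaled
# by (1 - m/N), and the extended row still sums to one.
import numpy as np

def add_unk(p, m, N):
    t_unk = float(m) / N
    ps = p * (1.0 - t_unk)
    return np.concatenate([ps, np.full((p.shape[0], 1), t_unk)], axis=1)

assert np.isclose(add_unk(np.array([[0.25, 0.75]]), m=1, N=4).sum(), 1.0)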
def __init__(self, ne, de, cs, nh, nc, L2_reg=0.0, rng=np.random.RandomState()): self.nc = nc self.hiddenLayer = Layer(de * cs, nh, rng=rng) self.outputLayer = Layer(nh, nc) self.emb = theano.shared( rng.normal(loc=0.0, scale=0.01, size=(ne, de)).astype(theano.config.floatX)) A = rng.normal(loc=0.0, scale=0.01, size=(nc, nc)).astype(theano.config.floatX) self.A = theano.shared(value=A, name='A', borrow=True) self.params = self.hiddenLayer.params + self.outputLayer.params + [ self.emb, self.A ] self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A'] idxs = T.imatrix('idxs') x = self.emb[idxs].reshape((idxs.shape[0], de * cs)) y = T.bvector('y') ans = T.bvector('ans') INF = 1e9 result, updates1 = theano.scan(fn=self.one_step, sequences=x, outputs_info=[ theano.shared(0.0), theano.shared(-INF), theano.shared(-INF), theano.shared(-INF), None, None, None, None ]) self.decode = theano.function(inputs=[idxs], outputs=result, updates=updates1) score, updates2 = theano.scan(fn=self.two_step, sequences=[ x, dict(input=y, taps=[-1, 0]), dict(input=ans, taps=[-1, 0]) ], outputs_info=theano.shared(0.0)) cost = score[-1] gradients = T.grad(cost, self.params) lr = T.scalar('lr') for p, g in zip(self.params, gradients): updates2[p] = p + lr * g self.fit = theano.function(inputs=[idxs, y, ans, lr], outputs=cost, updates=updates2) self.normalize = theano.function( inputs=[], updates={ self.emb: self.emb / T.sqrt( (self.emb**2).sum(axis=1)).dimshuffle(0, 'x') })
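# Illustrative numpy check of the `normalize` update above: dividing each
# embedding row by its L2 norm leaves unit-length rows.
import numpy as np

emb = np.random.RandomState(0).normal(size=(4, 3))
emb /= np.sqrt((emb ** 2).sum(axis=1))[:, None]
assert np.allclose((emb ** 2).sum(axis=1), 1.0)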
def inputs(self): return { "call_type": tensor.bvector("call_type"), "origin_call": tensor.ivector("origin_call"), "origin_stand": tensor.bvector("origin_stand"), "taxi_id": tensor.wvector("taxi_id"), "timestamp": tensor.ivector("timestamp"), "day_type": tensor.bvector("day_type"), "missing_data": tensor.bvector("missing_data"), "latitude": tensor.matrix("latitude"), "longitude": tensor.matrix("longitude"), "destination_latitude": tensor.vector("destination_latitude"), "destination_longitude": tensor.vector("destination_longitude"), "travel_time": tensor.ivector("travel_time"), "first_k_latitude": tensor.matrix("first_k_latitude"), "first_k_longitude": tensor.matrix("first_k_longitude"), "last_k_latitude": tensor.matrix("last_k_latitude"), "last_k_longitude": tensor.matrix("last_k_longitude"), "input_time": tensor.ivector("input_time"), "week_of_year": tensor.bvector("week_of_year"), "day_of_week": tensor.bvector("day_of_week"), "qhour_of_day": tensor.bvector("qhour_of_day"), "candidate_call_type": tensor.bvector("candidate_call_type"), "candidate_origin_call": tensor.ivector("candidate_origin_call"), "candidate_origin_stand": tensor.bvector("candidate_origin_stand"), "candidate_taxi_id": tensor.wvector("candidate_taxi_id"), "candidate_timestamp": tensor.ivector("candidate_timestamp"), "candidate_day_type": tensor.bvector("candidate_day_type"), "candidate_missing_data": tensor.bvector("candidate_missing_data"), "candidate_latitude": tensor.matrix("candidate_latitude"), "candidate_longitude": tensor.matrix("candidate_longitude"), "candidate_destination_latitude": tensor.vector("candidate_destination_latitude"), "candidate_destination_longitude": tensor.vector("candidate_destination_longitude"), "candidate_travel_time": tensor.ivector("candidate_travel_time"), "candidate_first_k_latitude": tensor.matrix("candidate_first_k_latitude"), "candidate_first_k_longitude": tensor.matrix("candidate_first_k_longitude"), "candidate_last_k_latitude": tensor.matrix("candidate_last_k_latitude"), "candidate_last_k_longitude": tensor.matrix("candidate_last_k_longitude"), "candidate_input_time": tensor.ivector("candidate_input_time"), "candidate_week_of_year": tensor.bvector("candidate_week_of_year"), "candidate_day_of_week": tensor.bvector("candidate_day_of_week"), "candidate_qhour_of_day": tensor.bvector("candidate_qhour_of_day"), }
def __init__(self, input, in_layer_shape, layer2_in = 1000, n_out = 11, use_adagrad = True, patch_size=64, activation=NeuralActivations.Rectifier, layer1_nout=11, exp_id=1, quiet=False, n_classes=11, save_file=None, mem_alloc="CPU", momentum=1., enable_standardization=False, rng=None): """ Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie. :type n_hidden: int :param n_hidden: number of hidden units :type in_layer_shape: list :param in_layer_shape: the shape of the first layer - format is : (no of patches, no of pixels per patch, no of batches, number of hidden units for locally connected hidden layer 1) :type layer2_in: list :param layer2_in: No of hidden units in the second hidden layer. :type shared_weights: use shared weights across the image :param shared_weights: boolean parameter to enable/disable the usage of shared weights :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie. """ self.input = input if rng == None: rng = numpy.random.RandomState(1234) self.monitor = Monitor() self.learning_rate = 0.001 self.exp_id = exp_id self.y = T.bvector('y') # the labels are presented as 1D vector of int32 self.mem_alloc = mem_alloc self.rng = rng self.ds = Dataset() self.n_out = n_out self.momentum = momentum self.save_file = save_file self.in_layer_shape = in_layer_shape self.layer2_in = layer2_in self.patch_size = patch_size self.layer1_nout = layer1_nout self.locally_connected_layer = None self.fully_connected_layer = None self.activation = activation self.n_hiddens_layer2 = (self.layer1_nout * in_layer_shape[0], layer2_in) self.n_classes = n_classes self.state = "train" self.enable_standardization = enable_standardization self.out_dir = "out/" self.grads = [] self.test_scores = [] #Whether to turn on or off the messages. self.quiet = quiet self.test_set_x = None self.valid_set_x = None self.test_set_y = None self.valid_set_y = None self.setup_hidden_layers(activation, in_layer_shape, self.n_hiddens_layer2, n_out) self.use_adagrad = use_adagrad #Error for patches with object in it: self.obj_patch_error_percent = 0
# the W matrix of the inputVectors as used in [1] targetEmbeddings = theano.shared( np.random.uniform(-1, 1, (vocabSize, embeddingSize))) # the W' matrix of the outputVectors as used in [1] contextEmbeddings = theano.shared( np.random.normal(scale=1.0 / np.sqrt(vocabSize), size=(embeddingSize, vocabSize))) # A |batchSize x 2| dimensional matrix, having (target, context) pairs for # a batch (including negative samples). This is the input to the training function. targetContext = T.imatrix() # the |batchSize x 1| vector, training labels (also an input to the training # function), whether the context word matches the target word or not isContext = T.bvector() batchMatchScores = [] for i in range(batchSize): matchScore = T.dot(targetEmbeddings[targetContext[i][0], :], contextEmbeddings[:, targetContext[i][1]]) batchMatchScores.append(matchScore) objective = isContext * T.log(T.nnet.sigmoid(batchMatchScores)) + ( 1 - isContext) * T.log(1 - T.nnet.sigmoid(batchMatchScores)) loss = -T.mean(objective) # TRAINING FUNCTION from lasagne.updates import nesterov_momentum
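# Toy numpy check (made-up scores) of the negative-sampling objective above:
# positive (target, context) pairs push sigmoid(score) toward 1, sampled
# negatives push it toward 0, and the loss is the negated mean log-likelihood.
import numpy as np

def negative_sampling_loss(scores, is_context):
    s = 1.0 / (1.0 + np.exp(-scores))
    obj = is_context * np.log(s) + (1 - is_context) * np.log(1 - s)
    return -obj.mean()

print(negative_sampling_loss(np.array([4.0, -4.0]), np.array([1, 0], dtype=np.int8)))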
    train_model = theano.function(
        inputs=[],
        outputs=T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
        updates=updates,
        givens={
            X: data,
            Y: labels,
        }
    )
    return train_model

if __name__ == '__main__':
    print "Setting up memory..."
    X = T.bmatrix('X')
    Y = T.bvector('Y')
    Ws_char_to_hidden = [
        U.create_shared(U.initial_weights(CHARACTERS, HIDDEN), name='yeah%d' % i)
        for i in xrange(CONTEXT)
    ]
    b_hidden = U.create_shared(U.initial_weights(HIDDEN))
    W_hidden_to_hidden = U.create_shared(U.initial_weights(HIDDEN, HIDDEN))
    W_hidden_to_predict = U.create_shared(U.initial_weights(HIDDEN, CHARACTERS))
    b_predict = U.create_shared(U.initial_weights(CHARACTERS))
    tunables = Ws_char_to_hidden + [
        b_hidden, W_hidden_to_hidden, W_hidden_to_predict, b_predict
    ]
    print "Constructing graph..."
    hidden_inputs = make_hidden_inputs(X, Ws_char_to_hidden, b_hidden)
    hidden_outputs = make_hidden_outputs(hidden_inputs, W_hidden_to_hidden)
def policy_network(state):
    input_state = InputLayer(input_var=state, shape=(None, n_input))
    dense_1 = DenseLayer(input_state, num_units=n_input, nonlinearity=tanh)
    dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh)
    probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax)
    return probs

X_state = T.fmatrix()
X_action = T.bvector()
X_reward = T.fvector()
X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)
policy_ = get_output(prob_values)
policy = theano.function(inputs=[X_state],
                         outputs=policy_,
                         allow_input_downcast=True)

loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()
params = get_all_params(prob_values)
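# Hedged sketch (not part of the original snippet): finishing the
# REINFORCE-style setup above with a compiled update step. The choice of
# `adam` and the 0.01 learning rate are assumptions, not the author's.
from lasagne.updates import adam

updates = adam(loss, params, learning_rate=0.01)
train_policy = theano.function(inputs=[X_state, X_action, X_reward],
                               outputs=loss,
                               updates=updates,
                               allow_input_downcast=True)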
def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61):
    ## n_in: input dimension of the sequence LSTM
    ## n_hidden: hidden dimension of the LSTMs for the candidate (candi) and zero pronoun (zp)
    self.params = []

    self.w_embedding = init_weight_file(args.embedding, args.embedding_dimention)
    self.params.append(self.w_embedding)

    self.zp_x_pre_index = T.imatrix("zp_x_pre")
    self.zp_x_post_index = T.imatrix("zp_x_post")

    zp_x_pre_newshape = (T.shape(self.zp_x_pre_index)[0], args.embedding_dimention)
    self.embedding_sub_zp_pre = self.w_embedding[self.zp_x_pre_index.flatten()]
    self.zp_x_pre = T.reshape(self.embedding_sub_zp_pre, zp_x_pre_newshape)

    zp_x_post_newshape = (T.shape(self.zp_x_post_index)[0], args.embedding_dimention)
    self.embedding_sub_zp_post = self.w_embedding[self.zp_x_post_index.flatten()]
    self.zp_x_post = T.reshape(self.embedding_sub_zp_post, zp_x_post_newshape)

    zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre)
    self.params += zp_nn_pre.params

    zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post)
    self.params += zp_nn_post.params

    danwei = theano.shared(np.eye(8, dtype=theano.config.floatX))

    H_pre = zp_nn_pre.all_hidden
    H_post = zp_nn_post.all_hidden

    Ws1_pre, heihei = init_weight(n_hidden, n_hidden, pre="Ws1_pre_zp", ones=False)
    Ws2_pre, heihei = init_weight(8, n_hidden, pre="Ws2_pre_zp", ones=False)
    self.params += [Ws1_pre, Ws2_pre]

    A_pre = softmax(T.dot(Ws2_pre, T.dot(Ws1_pre, T.transpose(H_pre))))
    P_pre = T.dot(A_pre, T.transpose(A_pre)) - danwei
    #norm_pre, _ = theano.scan(lambda i, tmp: T.dot(P_pre[i], P_pre[i]) + tmp,
    #                          sequences=T.arange(P_pre.shape[0]),
    #                          outputs_info=np.asarray(0., dtype=theano.config.floatX))
    #f_norm_pre = T.sum(norm_pre[-1])
    f_norm_pre = (P_pre ** 2).sum()

    zp_out_pre = T.mean(T.dot(A_pre, H_pre), axis=0)

    Ws1_post, heihei = init_weight(n_hidden, n_hidden, pre="Ws1_post_zp", ones=False)
    Ws2_post, heihei = init_weight(8, n_hidden, pre="Ws2_post_zp", ones=False)
    self.params += [Ws1_post, Ws2_post]

    A_post = softmax(T.dot(Ws2_post, T.dot(Ws1_post, T.transpose(H_post))))
    P_post = T.dot(A_post, T.transpose(A_post)) - danwei
    #norm_post, _ = theano.scan(lambda i, tmp: T.dot(P_post[i], P_post[i]) + tmp,
    #                           sequences=T.arange(P_post.shape[0]),
    #                           outputs_info=np.asarray(0., dtype=theano.config.floatX))
    #f_norm_post = T.sum(norm_post[-1])
    f_norm_post = (P_post ** 2).sum()

    zp_out_post = T.mean(T.dot(A_post, H_post), axis=0)

    f_norm = f_norm_pre + f_norm_post

    #self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out))
    self.zp_out = T.concatenate((zp_out_pre, zp_out_post))
    self.zp_out_output = self.zp_out

    ### get sequence output for NP ###
    self.np_x_post_index = T.itensor3("np_x")
    self.np_x_postc_index = T.itensor3("np_x")
    self.np_x_pre_index = T.itensor3("np_x")
    self.np_x_prec_index = T.itensor3("np_x")

    np_x_post_newshape = (T.shape(self.np_x_post_index)[0],
                          T.shape(self.np_x_post_index)[1],
                          args.embedding_dimention)
    self.embedding_sub_np_x_post = self.w_embedding[self.np_x_post_index.flatten()]
    self.np_x_post = T.reshape(self.embedding_sub_np_x_post, np_x_post_newshape)

    np_x_postc_newshape = (T.shape(self.np_x_postc_index)[0],
                           T.shape(self.np_x_postc_index)[1],
                           args.embedding_dimention)
    self.embedding_sub_np_x_postc = self.w_embedding[self.np_x_postc_index.flatten()]
    self.np_x_postc = T.reshape(self.embedding_sub_np_x_postc, np_x_postc_newshape)

    np_x_pre_newshape = (T.shape(self.np_x_pre_index)[0],
                         T.shape(self.np_x_pre_index)[1],
                         args.embedding_dimention)
    self.embedding_sub_np_x_pre = self.w_embedding[self.np_x_pre_index.flatten()]
    self.np_x_pre = T.reshape(self.embedding_sub_np_x_pre, np_x_pre_newshape)

    np_x_prec_newshape = (T.shape(self.np_x_prec_index)[0],
                          T.shape(self.np_x_prec_index)[1],
                          args.embedding_dimention)
    self.embedding_sub_np_x_prec = self.w_embedding[self.np_x_prec_index.flatten()]
    self.np_x_prec = T.reshape(self.embedding_sub_np_x_prec, np_x_prec_newshape)

    self.mask_pre = T.matrix("mask")
    self.mask_prec = T.matrix("mask")
    self.mask_post = T.matrix("mask")
    self.mask_postc = T.matrix("mask")

    self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden,
                                    self.np_x_pre, self.np_x_prec,
                                    self.mask_pre, self.mask_prec)
    self.params += self.np_nn_pre.params
    self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden,
                                     self.np_x_post, self.np_x_postc,
                                     self.mask_post, self.mask_postc)
    self.params += self.np_nn_post.params

    self.np_nn_post_output = self.np_nn_post.nn_out
    self.np_nn_pre_output = self.np_nn_pre.nn_out

    self.np_out = T.concatenate((self.np_nn_post_output, self.np_nn_pre_output), axis=1)

    np_nn_f = LSTM(n_hidden * 2, n_hidden * 2, self.np_out)
    self.params += np_nn_f.params
    np_nn_b = LSTM(n_hidden * 2, n_hidden * 2, self.np_out[::-1])
    self.params += np_nn_b.params

    self.bi_np_out = T.concatenate((np_nn_f.all_hidden, np_nn_b.all_hidden[::-1]), axis=1)

    self.np_out_output = self.bi_np_out

    #self.get_np_out = theano.function(inputs=[self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc], outputs=[self.np_out_output])

    #self.feature = T.matrix("feature")
    #self.feature_layer = Layer(feature_dimention, n_hidden, self.feature, repre_active)
    #self.params += self.feature_layer.params

    w_attention_zp, b_attention = init_weight(n_hidden * 2, 1, pre="attention_zp", ones=False)
    self.params += [w_attention_zp, b_attention]

    w_attention_np, b_u = init_weight(n_hidden * 2, 1, pre="attention_np", ones=False)
    #self.params += [w_attention_np]

    w_attention_np_rnn, b_u = init_weight(n_hidden * 4, 1, pre="attention_np_rnn", ones=False)
    self.params += [w_attention_np_rnn]

    #np_out_dropout = _dropout_from_layer(self.np_out_output)
    #zp_out_dropout = _dropout_from_layer(self.zp_out_output)
    #np_dropout = _dropout_from_layer(self.np_out)

    #self.calcu_attention_dropout = tanh(T.dot(np_out_dropout, w_attention_np_rnn) + T.dot(zp_out_dropout, w_attention_zp) + T.dot(np_dropout, w_attention_np) + b_attention)
    #self.calcu_attention = tanh(T.dot(self.np_out_output, w_attention_np_rnn) + T.dot(self.zp_out_output, w_attention_zp) + T.dot(self.np_out, w_attention_np) + b_attention)
    self.calcu_attention = tanh(T.dot(self.np_out_output, w_attention_np_rnn)
                                + T.dot(self.zp_out_output, w_attention_zp)
                                + b_attention)

    self.attention = softmax(T.transpose(self.calcu_attention, axes=(1, 0)))[0]
    #self.attention_dropout = softmax(T.transpose(self.calcu_attention_dropout, axes=(1, 0)))[0]

    self.out = self.attention
    #self.out_dropout = self.attention_dropout

    self.get_out = theano.function(inputs=[self.zp_x_pre_index, self.zp_x_post_index,
                                           self.np_x_pre_index, self.np_x_prec_index,
                                           self.np_x_post_index, self.np_x_postc_index,
                                           self.mask_pre, self.mask_prec,
                                           self.mask_post, self.mask_postc],
                                   outputs=[self.out],
                                   on_unused_input='warn')

    # L2 norm squared (sum of squares) and L1 norm (sum of absolute values)
    # of the parameters; both regularisers are disabled below.
    l2_norm_squared = sum([(w ** 2).sum() for w in self.params])
    l1_norm = sum([abs(w).sum() for w in self.params])

    lmbda_l1 = 0.0
    #lmbda_l2 = 0.001
    lmbda_l2 = 0.0

    t = T.bvector()
    cost = -(T.log((self.out * t).sum())) + f_norm
    #cost = -(T.log((self.out_dropout * t).sum()))

    lr = T.scalar()
    updates = lasagne.updates.sgd(cost, self.params, lr)
    #updates = lasagne.updates.adadelta(cost, self.params)

    self.train_step = theano.function(inputs=[self.zp_x_pre_index, self.zp_x_post_index,
                                              self.np_x_pre_index, self.np_x_prec_index,
                                              self.np_x_post_index, self.np_x_postc_index,
                                              self.mask_pre, self.mask_prec,
                                              self.mask_post, self.mask_postc,
                                              t, lr],
                                      outputs=[cost],
                                      on_unused_input='warn',
                                      updates=updates)
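# Hedged usage sketch (everything below is hypothetical, not from the original
# code): the antecedent indicator `t` must be int8 to match T.bvector(), with a
# 1 at the gold candidate, and `lr` is passed as a plain float.
t_gold = np.zeros(num_candidates, dtype="int8")
t_gold[gold_index] = 1
cost, = network.train_step(zp_pre_idx, zp_post_idx,
                           np_pre_idx, np_prec_idx, np_post_idx, np_postc_idx,
                           mask_pre, mask_prec, mask_post, mask_postc,
                           t_gold, 0.03)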