Example #1
def construct_network(context, characters, hidden, mult_hidden):
    print "Setting up memory..."
    X = T.bvector('X')
    Y = T.bvector('Y')
    alpha = T.cast(T.fscalar('alpha'), dtype=theano.config.floatX)
    lr = T.cast(T.fscalar('lr'), dtype=theano.config.floatX)

    print "Initialising weights..."
    W_char_hidden = U.create_shared(U.initial_weights(characters, hidden))
    f_char_hidden = U.create_shared(U.initial_weights(characters, mult_hidden))
    b_hidden = U.create_shared(U.initial_weights(hidden))
    Wf_hidden = U.create_shared(U.initial_weights(hidden, mult_hidden))
    fW_hidden = U.create_shared(U.initial_weights(mult_hidden, hidden))
    W_hidden_predict = U.create_shared(U.initial_weights(hidden, characters))
    b_predict = U.create_shared(U.initial_weights(characters))

    print "Constructing graph..."
    hidden = make_hidden(hidden, W_char_hidden[X], f_char_hidden[X], Wf_hidden,
                         fW_hidden, b_hidden)
    predictions = T.nnet.softmax(T.dot(hidden, W_hidden_predict) + b_predict)
    weights = [
        W_char_hidden, f_char_hidden, b_hidden, Wf_hidden, fW_hidden,
        W_hidden_predict, b_predict
    ]
    cost = -T.mean(T.log(predictions)[T.arange(Y.shape[0]), Y])
    gparams = T.grad(cost, weights)

    deltas = [U.create_shared(np.zeros(w.get_value().shape)) for w in weights]
    updates = [(param, param - (alpha * delta + gparam * lr))
               for param, delta, gparam in zip(weights, deltas, gparams)
               ] + [(delta, alpha * delta + gparam * lr)
                    for delta, gparam in zip(deltas, gparams)]
    return X, Y, alpha, lr, updates, predictions, weights
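A minimal usage sketch for the function above (a sketch only: it assumes theano.config.floatX is float32 so that alpha and lr remain plain input scalars, and that the module's U and make_hidden helpers are importable; the data and hyperparameters are made up):

import numpy as np
import theano

X, Y, alpha, lr, updates, predictions, weights = construct_network(
    context=1, characters=128, hidden=512, mult_hidden=512)

# Compile one momentum/SGD step and a forward pass over a byte sequence.
train = theano.function([X, Y, alpha, lr], [], updates=updates,
                        allow_input_downcast=True)
predict = theano.function([X], predictions, allow_input_downcast=True)

seq = np.random.randint(0, 128, size=100).astype('int8')
train(seq[:-1], seq[1:], 0.9, 0.01)   # next-character objective
probs = predict(seq[:-1])             # one softmax row per input position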
Example #2
    def __init__(self, ne, de, cs, nh, nc, L2_reg = 0.0, rng = np.random.RandomState()):
	self.nc = nc
	self.hiddenLayer = Layer(de*cs, nh, rng = rng)
	self.outputLayer = Layer(nh, nc)
	self.emb = theano.shared(rng.normal(loc = 0.0, scale = 0.01, size = (ne, de)).astype(theano.config.floatX))
	A = rng.normal(loc = 0.0, scale = 0.01, size = (nc, nc)).astype(theano.config.floatX)
	self.A = theano.shared(value = A, name = 'A', borrow = True)

	self.params = self.hiddenLayer.params + self.outputLayer.params + [self.emb, self.A]
	self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A']

	idxs = T.imatrix('idxs')
	x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
	y = T.bvector('y')
	ans = T.bvector('ans')

	INF = 1e9
	result, updates1 = theano.scan(fn = self.one_step, sequences = x, outputs_info = [theano.shared(0.0), theano.shared(-INF), theano.shared(-INF), theano.shared(-INF), None, None, None, None])
	self.decode = theano.function(inputs = [idxs], outputs = result, updates = updates1)

	score, updates2 = theano.scan(fn = self.two_step, sequences = [x, dict(input = y, taps = [-1, 0]), dict(input = ans, taps = [-1, 0])], outputs_info = theano.shared(0.0))

	cost = score[-1]
	gradients = T.grad(cost, self.params)
	lr = T.scalar('lr')
	for p, g in zip(self.params, gradients):
	    updates2[p] = p + lr * g

	self.fit = theano.function(inputs = [idxs, y, ans, lr], outputs = cost, updates = updates2)
	self.normalize = theano.function(inputs = [], updates = {self.emb: self.emb / T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})
Example #3
    def test_param_allow_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8
        f = pfunc([
            Param(a, allow_downcast=True),
            Param(b, allow_downcast=False),
            Param(c, allow_downcast=None)
        ], (a + b + c))

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert numpy.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, f, [3], numpy.array([6], dtype='int16'),
                          1)

        # Value too big for a, silently ignored
        assert numpy.all(f([2**20], numpy.ones(1, dtype='int8'), 1) == 2)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, f, [3], [312], 1)

        # Value too big for c, raises TypeError
        self.assertRaises(TypeError, f, [3], [6], 806)
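The "silently ignored" case above is plain int8 wrap-around; a small NumPy illustration of the same conversion (not part of the test):

import numpy as np

# allow_downcast=True casts the list [2**20] into the int8 storage of a.
a_val = np.array([2 ** 20]).astype('int8')
print(a_val)          # [0] -- 2**20 is a multiple of 256, so it wraps to 0
print(a_val + 1 + 1)  # [2] -- matches f([2**20], ones(1, int8), 1) == 2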
Example #4
    def test_param_allow_downcast_int(self):
        a = tensor.wvector("a")  # int16
        b = tensor.bvector("b")  # int8
        c = tensor.bscalar("c")  # int8
        f = pfunc(
            [
                In(a, allow_downcast=True),
                In(b, allow_downcast=False),
                In(c, allow_downcast=None),
            ],
            (a + b + c),
        )

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert np.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        with pytest.raises(TypeError):
            f([3], np.array([6], dtype="int16"), 1)

        # Value too big for a, silently ignored
        assert np.all(f([2**20], np.ones(1, dtype="int8"), 1) == 2)

        # Value too big for b, raises TypeError
        with pytest.raises(TypeError):
            f([3], [312], 1)

        # Value too big for c, raises TypeError
        with pytest.raises(TypeError):
            f([3], [6], 806)
Example #5
    def test_allow_input_downcast_int(self):
        a = tensor.wvector("a")  # int16
        b = tensor.bvector("b")  # int8
        c = tensor.bscalar("c")  # int8

        f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
        # Value too big for a, b, or c, silently ignored
        assert f([2**20], [1], 0) == 1
        assert f([3], [312], 0) == 59
        assert f([3], [1], 806) == 42
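        # The wrapped results above are plain int8 modular arithmetic
        # (out-of-range values are reduced mod 256 before the sum):
        #   2**20 % 256 == 0           ->  0 + 1 + 0 == 1
        #   (3 + 312) % 256 == 59
        #   (3 + 1 + 806) % 256 == 42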

        g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
        # All values are in range. Since they're not ndarrays (but lists
        # or scalars), they will be converted, and their value checked.
        assert np.all(g([3], [6], 0) == 9)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        with pytest.raises(TypeError):
            g([3], np.array([6], dtype="int16"), 0)

        # Value too big for b, raises TypeError
        with pytest.raises(TypeError):
            g([3], [312], 0)

        h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
        # Everything here should behave like with False
        assert np.all(h([3], [6], 0) == 9)
        with pytest.raises(TypeError):
            h([3], np.array([6], dtype="int16"), 0)
        with pytest.raises(TypeError):
            h([3], [312], 0)
Example #6
    def build_loss(self, env_spec, policy):
        obs = env_spec.observation_space.new_tensor_variable('obs', extra_dims=1)
        next_obs = env_spec.observation_space.new_tensor_variable('next_obs', extra_dims=1)
        act = env_spec.action_space.new_tensor_variable('act', extra_dims=1)
        ret = T.vector('disc_n_return')
        term = T.bvector('terminal')
        if self.prioritized_replay:
            isw = T.vector('importance_sample_weights')

        if self.double_dqn:
            next_a = policy.actions_sym(next_obs)
            next_q = policy.target_q_at_a_sym(next_obs, next_a)
        else:
            next_q = policy.target_max_q_sym(next_obs)

        disc_next_q = (self.discount ** self.reward_horizon) * next_q
        y = ret + (1 - term) * disc_next_q
        q = policy.q_at_a_sym(obs, act)
        d = y - q
        losses = 0.5 * d ** 2
        if self.delta_clip is not None:
            # Huber loss:
            b = self.delta_clip * (abs(d) - self.delta_clip / 2)
            losses = T.switch(abs(d) <= self.delta_clip, losses, b)
        if self.prioritized_replay:
            losses = isw * losses
        loss = T.mean(losses)

        td_abs_errors = T.clip(abs(d), 0, self.delta_clip)

        input_list = [obs, next_obs, act, ret, term]
        if self.prioritized_replay:
            input_list.append(isw)

        return input_list, loss, td_abs_errors
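A sketch of how the returned pieces might be compiled into a training step; the optimizer choice, the policy.get_params accessor, and the learning rate are assumptions, not taken from the original:

import theano
import theano.tensor as T
import lasagne

# agent is an instance of the class defining build_loss; env_spec and policy
# follow the interfaces used above (both are assumed to exist already).
input_list, loss, td_abs_errors = agent.build_loss(env_spec, policy)
params = policy.get_params(trainable=True)   # hypothetical parameter accessor
grads = T.grad(loss, params)
updates = lasagne.updates.rmsprop(grads, params, learning_rate=2.5e-4)
train_fn = theano.function(input_list, [loss, td_abs_errors], updates=updates)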
Example #7
    def test_param_allow_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8
        f = pfunc([Param(a, allow_downcast=True),
                   Param(b, allow_downcast=False),
                   Param(c, allow_downcast=None)],
                  (a + b + c))

        # Both values are in range. Since they're not ndarrays (but lists),
        # they will be converted, and their value checked.
        assert numpy.all(f([3], [6], 1) == 10)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, f,
                [3], numpy.array([6], dtype='int16'), 1)

        # Value too big for a, silently ignored
        assert numpy.all(f([2 ** 20], numpy.ones(1, dtype='int8'), 1) == 2)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, f, [3], [312], 1)

        # Value too big for c, raises TypeError
        self.assertRaises(TypeError, f, [3], [6], 806)
Example #8
    def test_allow_input_downcast_int(self):
        a = tensor.wvector('a')  # int16
        b = tensor.bvector('b')  # int8
        c = tensor.bscalar('c')  # int8

        f = pfunc([a, b, c], (a + b + c), allow_input_downcast=True)
        # Value too big for a, b, or c, silently ignored
        assert f([2 ** 20], [1], 0) == 1
        assert f([3], [312], 0) == 59
        assert f([3], [1], 806) == 42

        g = pfunc([a, b, c], (a + b + c), allow_input_downcast=False)
        # All values are in range. Since they're not ndarrays (but lists
        # or scalars), they will be converted, and their value checked.
        assert numpy.all(g([3], [6], 0) == 9)

        # Values are in range, but a dtype too large has explicitly been given
        # For performance reasons, no check of the data is explicitly performed
        # (It might be OK to change this in the future.)
        self.assertRaises(TypeError, g,
                [3], numpy.array([6], dtype='int16'), 0)

        # Value too big for b, raises TypeError
        self.assertRaises(TypeError, g, [3], [312], 0)

        h = pfunc([a, b, c], (a + b + c))  # Default: allow_input_downcast=None
        # Everything here should behave like with False
        assert numpy.all(h([3], [6], 0) == 9)
        self.assertRaises(TypeError, h,
                [3], numpy.array([6], dtype='int16'), 0)
        self.assertRaises(TypeError, h, [3], [312], 0)
Example #9
def construct_network(context,characters,hidden,mult_hidden):
	print("Setting up memory...")
	X = T.bvector('X')
	Y = T.bvector('Y')
	alpha = T.cast(T.fscalar('alpha'),dtype=theano.config.floatX)
	lr    = T.cast(T.fscalar('lr'),   dtype=theano.config.floatX)
	
	print "Initialising weights..."
	W_char_hidden    = U.create_shared(U.initial_weights(characters,hidden))
	f_char_hidden    = U.create_shared(U.initial_weights(characters,mult_hidden))
	b_hidden         = U.create_shared(U.initial_weights(hidden))
	Wf_hidden        = U.create_shared(U.initial_weights(hidden,mult_hidden))
	fW_hidden        = U.create_shared(U.initial_weights(mult_hidden,hidden))
	W_hidden_predict = U.create_shared(U.initial_weights(hidden,characters))
	b_predict        = U.create_shared(U.initial_weights(characters))

	print "Constructing graph..."
	hidden = make_hidden(
			hidden,
			W_char_hidden[X],
			f_char_hidden[X],
			Wf_hidden,
			fW_hidden,
			b_hidden
		)
	predictions = T.nnet.softmax(T.dot(hidden,W_hidden_predict) + b_predict)
	weights = [
			W_char_hidden,
			f_char_hidden,
			b_hidden,
			Wf_hidden,
			fW_hidden,
			W_hidden_predict,
			b_predict
		]
	cost    = -T.mean(T.log(predictions)[T.arange(Y.shape[0]),Y])
	gparams =  T.grad(cost,weights)

	deltas  = [ U.create_shared(np.zeros(w.get_value().shape)) for w in weights ]
	updates = [
				( param, param - ( alpha * delta + gparam * lr ) )
					for param,delta,gparam in zip(weights,deltas,gparams)
			] + [
				( delta, alpha * delta + gparam * lr)
					for delta,gparam in zip(deltas,gparams)
			]
	return X,Y,alpha,lr,updates,predictions,weights
Example #10
    def __init__(self,
                 state_format,
                 actions_number,
                 gamma=0.99,
                 learning_rate=0.00025,
                 ddqn=False,
                 **kwargs):
        self.inputs = dict()
        self.learning_rate = learning_rate
        architecture = kwargs

        self.loss_history = []
        self.misc_state_included = (state_format["s_misc"] > 0)
        self.gamma = np.float64(gamma)

        self.inputs["S0"] = tensor.tensor4("S0")
        self.inputs["S1"] = tensor.tensor4("S1")
        self.inputs["A"] = tensor.ivector("Action")
        self.inputs["R"] = tensor.vector("Reward")
        self.inputs["Nonterminal"] = tensor.bvector("Nonterminal")
        if self.misc_state_included:
            self.inputs["S0_misc"] = tensor.matrix("S0_misc")
            self.inputs["S1_misc"] = tensor.matrix("S1_misc")
            self.misc_len = state_format["s_misc"]
        else:
            self.misc_len = None

        # save it for the evaluation reshape
        # TODO get rid of this?
        self.single_image_input_shape = (1, ) + tuple(state_format["s_img"])

        architecture["img_input_shape"] = (None, ) + tuple(
            state_format["s_img"])
        architecture["misc_len"] = self.misc_len
        architecture["output_size"] = actions_number

        if self.misc_state_included:
            self.network, input_layers, _ = self._initialize_network(
                img_input=self.inputs["S0"],
                misc_input=self.inputs["S0_misc"],
                **architecture)
            self.frozen_network, _, alternate_inputs = self._initialize_network(
                img_input=self.inputs["S1"],
                misc_input=self.inputs["S1_misc"],
                **architecture)
        else:

            self.network, input_layers, _ = self._initialize_network(
                img_input=self.inputs["S0"], **architecture)
            self.frozen_network, _, alternate_inputs = self._initialize_network(
                img_input=self.inputs["S1"], **architecture)

        self.alternate_input_mappings = {}
        for layer, input in zip(input_layers, alternate_inputs):
            self.alternate_input_mappings[layer] = input

        # print "Network initialized."
        self._compile(ddqn)
Example #11
    def __init__(self, param_dict):

        self.param_dict = param_dict
        self.training_batch_size = param_dict['training_batch_size']
        nkerns = param_dict['nkerns']
        recept_width = param_dict['recept_width']
        pool_width = param_dict['pool_width']
        stride = param_dict['stride']
        dropout_prob = param_dict['dropout_prob']
        weight_decay = param_dict['l2_reg']
        activation = param_dict['activation']
        weights_variance = param_dict['weights_variance']
        n_channels = param_dict['n_channels']
        n_timesteps = param_dict['n_timesteps']
        n_fbins = param_dict['n_fbins']
        global_pooling = param_dict['global_pooling']
        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.tensor4('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(self.training_batch_size)

        self.input = self.x.reshape((self.batch_size, 1, n_channels * n_fbins, n_timesteps))

        self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride,
                                                  self.training_mode,
                                                  dropout_prob[0],
                                                  activation, weights_variance, n_channels, n_timesteps, n_fbins,
                                                  global_pooling)

        self.classifier = SoftmaxLayer(rng=rng, input=self.feature_extractor.output, n_in=nkerns[-1],
                                       training_mode=self.training_mode, dropout_prob=dropout_prob[-1])

        self.weights = self.feature_extractor.weights + self.classifier.weights

        # ---------------------- BACKPROP
        self.cost = self.classifier.cross_entropy_cost(self.y)
        L2_sqr = sum((weight ** 2).sum() for weight in self.weights[::2])
        self.grads = T.grad(self.cost + weight_decay * L2_sqr, self.weights)
        self.updates = self.adadelta_updates(self.grads, self.weights)
        # self.updates = self.nesterov_momentum(self.grads, self.weights)

        # --------------------- FUNCTIONS
        self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)],
                                           outputs=self.cost,
                                           updates=self.updates)

        self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)],
                                              self.cost)

        self.test_model = theano.function([self.x, Param(self.training_mode, default=0)],
                                          self.classifier.p_y_given_x[:, 1])
Example #12
    def __init__(self, nkerns, recept_width, pool_width,
                 dropout_prob, training_batch_size, activation, n_timesteps=1000, dim=18):

        if activation == 'tanh':
            activation_function = lambda x: T.tanh(x)
        elif activation == 'relu':
            activation_function = lambda x: T.maximum(0.0, x)
        else:
            raise ValueError('unknown activation function')

        self.training_batch_size = training_batch_size

        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.matrix('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(self.training_batch_size)

        # 18@1*1000
        self.layer0_input = self.x.reshape((self.batch_size, dim, 1, n_timesteps))

        # image 18 @ 1*1000
        # c1: nkerns[0] @ 1* (1000 - recept_width[0] + 1)
        # s2: nkerns[0] @ 1 * c1 / pool_width[0]
        layer0 = ConvPoolLayer(rng, input=self.layer0_input,
                               image_shape=(None, dim, 1, n_timesteps),
                               filter_shape=(nkerns[0], dim, 1, recept_width[0]),
                               poolsize=(1, pool_width[0]), activation_function=activation_function)


        # c3: nkerns[1] @ 1 * (s2 - recept_width[1] + 1)
        # s4  nkerns[1] @ 1 *  c3 / pool_width
        input_layer1_width = (n_timesteps - recept_width[0] + 1) / pool_width[0]
        layer1 = ConvPoolLayer(rng, input=layer0.output,
                               image_shape=(None, nkerns[0], 1, input_layer1_width),
                               filter_shape=(nkerns[1], nkerns[0], 1, recept_width[1]),
                               poolsize=(1, pool_width[1]), activation_function=activation_function)

        # s4:(batch_size, nkerns[1], 1, s4) -> flatten(2) -> (batch_size, nkerns[1]* 1 * s4)
        layer2_input = layer1.output.flatten(2)

        input_layer2_size = (input_layer1_width - recept_width[1] + 1) / pool_width[1]
        # c5: 120@1*1
        self.layer2 = HiddenLayer(rng=rng, input=layer2_input,
                                  n_in=nkerns[1] * 1 * input_layer2_size, n_out=nkerns[2],
                                  training_mode=self.training_mode,
                                  dropout_prob=dropout_prob, activation_function=activation_function)
        # f6/output
        self.layer3 = LogisticRegressionLayer(input=self.layer2.output, n_in=nkerns[2], n_out=2,
                                              training_mode=self.training_mode, dropout_prob=dropout_prob)

        self.params = self.layer3.params + self.layer2.params + layer1.params + layer0.params
Example #13
    def __init__(self, rng, hiddenLayerList, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type hiddenLayerList: [HiddenLayer instances]
        :param hiddenLayerList: A list of hidden layers

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """
        # connect hidden layers (no need to, they're already connected outside when building them)
        self.hiddenLayers=hiddenLayerList
        # prevLy=hiddenLayerList[0]
        # prevLy.input=input
        # for ly in hiddenLayerList[1:]:
        #     ly.input=prevLy.output
        #     prevLy=ly

        # The logistic regression layer gets as input the hidden units of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=hiddenLayerList[-1].output,
            n_in=hiddenLayerList[-1].inOutDim[1],
            n_out=n_out)

        # symbolic variables for data
        self.X=self.hiddenLayers[0].input # training data
        self.y=T.bvector('y') # labels for training data

        # L1 norm ; one regularization option is to enforce L1 norm to be small
        self.L1 = abs(self.logRegressionLayer.W).sum()
        for ly in self.hiddenLayers:
            self.L1 += abs(ly.W).sum()

        # square of L2 norm ; one regularization option is to enforce square of L2 norm to be small
        self.L2_sqr = (self.logRegressionLayer.W ** 2).sum()
        for ly in self.hiddenLayers:            
            self.L2_sqr += (ly.W ** 2).sum()

        # negative log likelihood of the MLP is given by the negative log likelihood of the output
        # of the model, computed in the logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the all layers
        self.params=self.logRegressionLayer.params
        for ly in self.hiddenLayers:
            self.params+=ly.params
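A typical way to use the exposed attributes, sketched under the assumption that mlp is an instance of this class and that the first hidden layer's input is a plain symbolic variable; the regularization weights and learning rate are illustrative:

import theano
import theano.tensor as T

L1_reg, L2_reg, lr = 0.0, 1e-4, 0.01
cost = (mlp.negative_log_likelihood(mlp.y)
        + L1_reg * mlp.L1
        + L2_reg * mlp.L2_sqr)
gparams = [T.grad(cost, p) for p in mlp.params]
updates = [(p, p - lr * g) for p, g in zip(mlp.params, gparams)]
train_fn = theano.function([mlp.X, mlp.y], cost, updates=updates,
                           allow_input_downcast=True)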
Example #14
    def __init__(self, nkerns, recept_width, pool_width, stride, dropout_prob, l2_reg, training_batch_size, activation,
                 weights_variance, n_timesteps,
                 dim, objective_function):

        self.training_batch_size = training_batch_size
        self.objective_function = objective_function

        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.matrix('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(training_batch_size)

        self.input = self.x.reshape((self.batch_size, 1, dim, n_timesteps))

        self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride,
                                                  self.training_mode,
                                                  dropout_prob[0],
                                                  activation, weights_variance, n_timesteps, dim)

        self.classifier = LogisticRegressionLayer(rng=rng, input=self.feature_extractor.output, n_in=nkerns[-1],
                                                  training_mode=self.training_mode, dropout_prob=dropout_prob[1])

        self.params = self.feature_extractor.params + self.classifier.params

        # ---------------------- BACKPROP

        if self.objective_function == 'cross_entropy':
            self.cost = self.classifier.cross_entropy_cost(self.y)
        elif self.objective_function == 'auc':
            self.cost = self.classifier.auc_cost(self.y)
        else:
            raise ValueError('wrong objective function')

        L2_sqr = sum((param ** 2).sum() for param in self.params[::2])
        self.grads = T.grad(self.cost + l2_reg * L2_sqr, self.params)
        self.updates = self._adadelta_updates(self.grads)

        # --------------------- FUNCTIONS
        tp, tn, fp, fn = self.classifier.confusion_matrix(self.y)

        self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)],
                                           updates=self.updates)

        self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)],
                                              [self.cost, tp, tn, fp, fn])

        self.test_model = theano.function([self.x, Param(self.training_mode, default=0)],
                                          self.classifier.p_y_given_x.flatten())
Example #15
def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X,aux=aux)
    error = - T.log(output[T.arange(Y.shape[0]),((128+1 + Y)%(128+1))])
    error = error[-(Y.shape[0]/2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error),wrt=parameters)
    shapes = [ p.get_value().shape for p in parameters ]
    count = theano.shared(np.float32(0))
    acc_grads  = [
        theano.shared(np.zeros(s,dtype=np.float32))
        for s in shapes
    ]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    avg_grads = [ clip(g / count, 1) for g in acc_grads ]


    acc = theano.function(
            inputs=[X,Y],
            outputs=T.mean(error),
            updates = acc_update,
        )
    update = theano.function(
            inputs=[],
            updates=updates.adadelta(parameters,avg_grads,learning_rate=1e-8) + acc_clear
        )

    test = theano.function(
            inputs=[X],
            outputs=T.argmax(output,axis=1)[-(X.shape[0]/2):],
        )
    return acc,update,test
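A sketch of the intended accumulate-then-apply usage, assuming the surrounding model, updates and clip helpers are importable; the toy sequences below are synthetic:

import numpy as np

acc, update, test = make_train_functions()

# Accumulate gradients over a few sequences, then apply one adadelta step.
for _ in range(8):
    seq = np.random.randint(0, 128, size=20).astype('int8')  # X: int8 symbols
    targets = seq.astype('int32')                            # Y: toy targets
    print(acc(seq, targets))   # mean error on this sequence
update()                       # consume the accumulated gradients and reset

preds = test(seq)              # argmax predictions for the second half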
Example #16
    def __init__(self, game_params, arch_params, solver_params, trained_model, sn_dir):

        params=None

        if trained_model:
            params = common.load_params(trained_model)

        self.lr_func = create_learning_rate_func(solver_params)

        self.x_h_0 = tt.fvector('x_h_0')
        self.v_h_0 = tt.fvector('v_h_0')
        self.t_h_0 = tt.fvector('t_h_0')
        self.x_t_0 = tt.fmatrix('x_t_0')
        self.v_t_0 = tt.fmatrix('v_t_0')
        self.a_t_0 = tt.fmatrix('a_t_0')
        self.t_t_0 = tt.fvector('t_t_0')
        self.time_steps = tt.fvector('t_0')
        self.exist = tt.bvector('exist')
        self.is_leader = tt.fvector('is_leader')
        self.x_goal = tt.fvector('x_goal')
        self.turn_vec_h = tt.fvector('turn_vec_h')
        self.turn_vec_t = tt.fvector('turn_vec_t')
        self.n_steps = tt.iscalar('n_steps')
        self.lr = tt.fscalar('lr')
        self.sn_dir = sn_dir
        self.game_params = game_params
        self.arch_params = arch_params
        self.solver_params = solver_params

        self.model = CONTROLLER(self.x_h_0,
                                self.v_h_0,
                                self.t_h_0,
                                self.x_t_0,
                                self.v_t_0,
                                self.a_t_0,
                                self.t_t_0,
                                self.time_steps,
                                self.exist,
                                self.is_leader,
                                self.x_goal,
                                self.turn_vec_h,
                                self.turn_vec_t,
                                self.n_steps,
                                self.lr,
                                self.game_params,
                                self.arch_params,
                                self.solver_params,
                                params)
Example #17
    def __init__(self, state_format, actions_number, gamma=0.99, learning_rate=0.00025, ddqn=False, **kwargs):
        self.inputs = dict()
        self.learning_rate = learning_rate
        architecture = kwargs

        self.loss_history = []
        self.misc_state_included = (state_format["s_misc"] > 0)
        self.gamma = np.float64(gamma)

        self.inputs["S0"] = tensor.tensor4("S0")
        self.inputs["S1"] = tensor.tensor4("S1")
        self.inputs["A"] = tensor.ivector("Action")
        self.inputs["R"] = tensor.vector("Reward")
        self.inputs["Nonterminal"] = tensor.bvector("Nonterminal")
        if self.misc_state_included:
            self.inputs["S0_misc"] = tensor.matrix("S0_misc")
            self.inputs["S1_misc"] = tensor.matrix("S1_misc")
            self.misc_len = state_format["s_misc"]
        else:
            self.misc_len = None

        # save it for the evaluation reshape
        # TODO get rid of this?
        self.single_image_input_shape = (1,) + tuple(state_format["s_img"])

        architecture["img_input_shape"] = (None,) + tuple(state_format["s_img"])
        architecture["misc_len"] = self.misc_len
        architecture["output_size"] = actions_number

        if self.misc_state_included:
            self.network, input_layers, _ = self._initialize_network(img_input=self.inputs["S0"],
                                                                     misc_input=self.inputs["S0_misc"],
                                                                     **architecture)
            self.frozen_network, _, alternate_inputs = self._initialize_network(img_input=self.inputs["S1"],
                                                                                misc_input=self.inputs["S1_misc"],
                                                                                **architecture)
        else:

            self.network, input_layers, _ = self._initialize_network(img_input=self.inputs["S0"], **architecture)
            self.frozen_network, _, alternate_inputs = self._initialize_network(img_input=self.inputs["S1"],
                                                                                **architecture)

        self.alternate_input_mappings = {}
        for layer, input in zip(input_layers, alternate_inputs):
            self.alternate_input_mappings[layer] = input

        # print "Network initialized."
        self._compile(ddqn)
Example #18
def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X, aux=aux)
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)
    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    avg_grads = [clip(g / count, 1) for g in acc_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear)

    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
Example #19
 def inputs(self):
     return {'call_type': tensor.bvector('call_type'),
             'origin_call': tensor.ivector('origin_call'),
             'origin_stand': tensor.bvector('origin_stand'),
             'taxi_id': tensor.wvector('taxi_id'),
             'timestamp': tensor.ivector('timestamp'),
             'day_type': tensor.bvector('day_type'),
             'missing_data': tensor.bvector('missing_data'),
             'latitude': tensor.matrix('latitude'),
             'longitude': tensor.matrix('longitude'),
             'latitude_mask': tensor.matrix('latitude_mask'),
             'longitude_mask': tensor.matrix('longitude_mask'),
             'week_of_year': tensor.bvector('week_of_year'),
             'day_of_week': tensor.bvector('day_of_week'),
             'qhour_of_day': tensor.bvector('qhour_of_day'),
             'destination_latitude': tensor.vector('destination_latitude'),
             'destination_longitude': tensor.vector('destination_longitude')}
Example #20
 def inputs(self):
     return {
         'call_type': tensor.bvector('call_type'),
         'origin_call': tensor.ivector('origin_call'),
         'origin_stand': tensor.bvector('origin_stand'),
         'taxi_id': tensor.wvector('taxi_id'),
         'timestamp': tensor.ivector('timestamp'),
         'day_type': tensor.bvector('day_type'),
         'missing_data': tensor.bvector('missing_data'),
         'latitude': tensor.matrix('latitude'),
         'longitude': tensor.matrix('longitude'),
         'latitude_mask': tensor.matrix('latitude_mask'),
         'longitude_mask': tensor.matrix('longitude_mask'),
         'week_of_year': tensor.bvector('week_of_year'),
         'day_of_week': tensor.bvector('day_of_week'),
         'qhour_of_day': tensor.bvector('qhour_of_day'),
         'destination_latitude': tensor.vector('destination_latitude'),
         'destination_longitude': tensor.vector('destination_longitude')
     }
Example #21
    def __init__(self, nkerns, recept_width, pool_width, stride, dropout_prob, l2_reg, training_batch_size, activation,
                 weights_variance, n_timesteps, dim):

        self.training_batch_size = training_batch_size
        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.matrix('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(training_batch_size)

        self.input = self.x.reshape((self.batch_size, 1, dim, n_timesteps))

        self.feature_extractor = FeatureExtractor(rng, self.input, nkerns, recept_width, pool_width, stride,
                                                  self.training_mode,
                                                  dropout_prob[0],
                                                  activation, weights_variance, n_timesteps, dim)

        self.classifier = SoftmaxLayer(rng=rng, input=self.feature_extractor.output,
                                       n_in=nkerns[-1], n_out=3,
                                       training_mode=self.training_mode, dropout_prob=dropout_prob[1])

        self.params = self.feature_extractor.params + self.classifier.params

        # ---------------------- BACKPROP
        self.cost = self.classifier.cross_entropy_cost(self.y)
        L2_sqr = sum((param ** 2).sum() for param in self.params[::2])
        self.grads = T.grad(self.cost + l2_reg * L2_sqr, self.params)
        self.updates = self._adadelta_updates(self.grads)

        # --------------------- FUNCTIONS
        self.train_model = theano.function([self.x, self.y, Param(self.training_mode, default=1)],
                                           self.cost,
                                           updates=self.updates)

        self.validate_model = theano.function([self.x, self.y, Param(self.training_mode, default=0)],
                                              self.cost)

        self.test_model = theano.function([self.x, Param(self.training_mode, default=0)],
                                          self.classifier.p_y_given_x)
Example #22
def GetProbFunctions(num_features, learning_rate=1e-4, ret_updates=True):
    adjustment_var = T.bmatrix(name='Adjustment matrix')
    features_var = T.fmatrix(name='Features')
    mask_var = T.bvector(name='Filter mask')
    reward_var = T.scalar(name='Reward')
    net = BuildGraphNetwork(adjustment_var, features_var, mask_var,
                            num_features)
    desc = lasagne.layers.get_output(net['desc'])
    prob = msoftmax(theano.gradient.grad_clip(desc, -1, 1))
    reward_grad = reward_var / prob
    params = lasagne.layers.get_all_params(net['desc'], trainable=True)
    grads = theano.grad(None, params, known_grads={prob: reward_grad})
    updates = lasagne.updates.momentum(grads,
                                       params,
                                       learning_rate=learning_rate)
    action_fn = theano.function([adjustment_var, features_var, mask_var], prob)
    if ret_updates:
        updates_fn = theano.function(
            [adjustment_var, features_var, mask_var, reward_var], [],
            updates=updates,
            allow_input_downcast=True)
        return net, action_fn, updates_fn
    else:
        return net, action_fn
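A usage sketch for the REINFORCE-style update above; the graph sizes, the sampling step, and the reward value are illustrative, and BuildGraphNetwork and msoftmax are assumed to be importable from the same module:

import numpy as np

num_nodes, num_features = 16, 8
net, action_fn, updates_fn = GetProbFunctions(num_features, ret_updates=True)

adjacency = np.random.randint(0, 2, (num_nodes, num_nodes)).astype('int8')
features = np.random.rand(num_nodes, num_features).astype('float32')
mask = np.ones(num_nodes, dtype='int8')

probs = action_fn(adjacency, features, mask).ravel()  # action distribution
action = np.random.choice(len(probs), p=probs)        # sample an action
reward = 1.0                                          # externally observed
updates_fn(adjacency, features, mask, reward)         # one policy-gradient step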
Example #23
    def __init__(self, game_params, arch_params, solver_params, trained_model,
                 sn_dir):

        params = None

        if trained_model:
            params = common.load_params(trained_model)

        self.lr_func = create_learning_rate_func(solver_params)

        self.x_h_0 = tt.fvector('x_h_0')
        self.v_h_0 = tt.fvector('v_h_0')
        self.t_h_0 = tt.fvector('t_h_0')
        self.x_t_0 = tt.fmatrix('x_t_0')
        self.v_t_0 = tt.fmatrix('v_t_0')
        self.a_t_0 = tt.fmatrix('a_t_0')
        self.t_t_0 = tt.fvector('t_t_0')
        self.time_steps = tt.fvector('t_0')
        self.exist = tt.bvector('exist')
        self.is_leader = tt.fvector('is_leader')
        self.x_goal = tt.fvector('x_goal')
        self.turn_vec_h = tt.fvector('turn_vec_h')
        self.turn_vec_t = tt.fvector('turn_vec_t')
        self.n_steps = tt.iscalar('n_steps')
        self.lr = tt.fscalar('lr')
        self.sn_dir = sn_dir
        self.game_params = game_params
        self.arch_params = arch_params
        self.solver_params = solver_params

        self.model = CONTROLLER(self.x_h_0, self.v_h_0, self.t_h_0, self.x_t_0,
                                self.v_t_0, self.a_t_0, self.t_t_0,
                                self.time_steps, self.exist, self.is_leader,
                                self.x_goal, self.turn_vec_h, self.turn_vec_t,
                                self.n_steps, self.lr, self.game_params,
                                self.arch_params, self.solver_params, params)
Example #24
    def __init__(self, rng, n_in, n_out, n_h, n_layers, f_act=leaky_relu, obj='single', dropout_rate = 0):
        '''
        :param rng: Numpy RandomState
        :param n_in: Input dimension (int)
        :param n_out: Output dimension (int)
        :param n_h: Hidden dimension (int)
        :param n_layers: Number of hidden layers (int)
        :param f_act: Hidden-to-hidden activation function
        :param obj: 'single' for a softmax output, 'multi' for a sigmoid output
        :param dropout_rate: Dropout rate for the hidden layers
        '''
        if obj=='single':
            f_out = softmax
        elif obj=='multi':
            f_out = sigmoid
        self.x = T.vector()

        # construct hidden layers
        assert(n_layers>=1)
        first_hiddenLayer = HiddenLayer(
            rng=rng,
            input=self.x,
            predict_input=self.x,
            n_in=n_in,
            n_out=n_h,
            activation=f_act,
            dropout_rate = dropout_rate,
            nametag='0'
        )

        self.hidden_layers = [first_hiddenLayer]
        self.p = first_hiddenLayer.params[:]

        for i in range(n_layers-1):
            cur_hiddenLayer = ResNetLayer(
                rng=rng,
                input=self.hidden_layers[-1].output,
                predict_input=self.hidden_layers[-1].predict_output,
                n_h=n_h,
                activation=f_act,
                dropout_rate = dropout_rate,
                nametag=str(i+1)
                )
            self.hidden_layers.append(cur_hiddenLayer)
            self.p.extend(cur_hiddenLayer.params[:])

        # params for output layer

        self.outputLayer = HiddenLayer(
            rng=rng,
            input=self.hidden_layers[-1].output,
            predict_input=self.hidden_layers[-1].predict_output,
            n_in=n_h,
            n_out=n_out,
            activation=f_out,
            dropout_rate = 0,
            nametag='o'
        )
        self.p.extend(self.outputLayer.params[:])

        self.n_layers = n_layers + 1
        self.obj = obj
        if obj=='single':
            self.y = T.bscalar('y')
            self.o = self.outputLayer.output
            self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
            self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
            self.prediction = np.argmax(self.o)
        elif obj=='multi':
            self.y = T.bvector('y')
            self.o = self.outputLayer.output
            self.cost = T.nnet.binary_crossentropy(self.o, self.y).mean()
            self.prediction = T.argsort(self.o)
            self.accuracy = self.y[T.argmax(self.o)]
            self.accuracy3 = (1.0/3.0) * (self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])
            self.accuracy5 = (1.0/5.0) * (self.y[self.prediction[-5]]+self.y[self.prediction[-4]]+self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])

        self.optimiser = sgd_optimizer(self, 'ResNet')
Example #25
    def __init__(self,
                 atari_env,
                 state_dimension,
                 action_dimension,
                 monitor_env=False,
                 learning_rate=0.001,
                 critic_update=10,
                 train_step=1,
                 gamma=0.95,
                 eps_max=1.0,
                 eps_min=0.1,
                 eps_decay=10000,
                 n_epochs=10000,
                 batch_size=32,
                 buffer_size=50000):

        self.env = gym.make(atari_env)
        if monitor_env:
            pass

        self.state_dimension = state_dimension
        self.action_dimension = action_dimension
        self.learning_rate = learning_rate
        self.critic_update = critic_update
        self.train_step = train_step
        self.gamma = gamma
        self.eps_max = eps_max
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.buffer_size = buffer_size

        self.experience_replay = []

        def q_network(state):
            input_state = InputLayer(input_var=state,
                                     shape=(None, self.state_dimension[0],
                                            self.state_dimension[1],
                                            self.state_dimension[2]))

            input_state = DimshuffleLayer(input_state, pattern=(0, 3, 1, 2))

            conv = Conv2DLayer(input_state,
                               num_filters=32,
                               filter_size=(8, 8),
                               stride=(4, 4),
                               nonlinearity=rectify)

            conv = Conv2DLayer(conv,
                               num_filters=64,
                               filter_size=(4, 4),
                               stride=(2, 2),
                               nonlinearity=rectify)

            conv = Conv2DLayer(conv,
                               num_filters=64,
                               filter_size=(3, 3),
                               stride=(1, 1),
                               nonlinearity=rectify)

            flatten = FlattenLayer(conv)

            dense = DenseLayer(flatten, num_units=512, nonlinearity=rectify)

            q_values = DenseLayer(dense,
                                  num_units=self.action_dimension,
                                  nonlinearity=linear)

            return q_values

        self.X_state = T.ftensor4()
        self.X_action = T.bvector()
        self.X_reward = T.fvector()
        self.X_next_state = T.ftensor4()
        self.X_done = T.bvector()

        self.X_action_hot = to_one_hot(self.X_action, self.action_dimension)

        self.q_ = q_network(self.X_state)
        self.q = get_output(self.q_)
        self.q_target_ = q_network(self.X_next_state)
        self.q_target = get_output(self.q_target_)
        self.q_max = T.max(self.q_target, axis=1)
        self.action = T.argmax(self.q, axis=1)

        self.mu = theano.function(inputs=[self.X_state],
                                  outputs=self.action,
                                  allow_input_downcast=True)

        self.loss = squared_error(
            self.X_reward + self.gamma * self.q_max * (1.0 - self.X_done),
            T.batched_dot(self.q, self.X_action_hot))
        self.loss = self.loss.mean()

        self.params = get_all_params(self.q_)

        self.grads = T.grad(self.loss, self.params)

        self.normed_grads = total_norm_constraint(self.grads, 1.0)

        self.updates = rmsprop(self.normed_grads,
                               self.params,
                               learning_rate=self.learning_rate)

        self.update_network = theano.function(inputs=[
            self.X_state, self.X_action, self.X_reward, self.X_next_state,
            self.X_done
        ],
                                              outputs=self.loss,
                                              updates=self.updates,
                                              allow_input_downcast=True)
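A sketch of one greedy action and one learning step with the functions compiled above; the class name Agent, the environment id, and the state shape are assumptions:

import numpy as np

agent = Agent('Breakout-v0', state_dimension=(84, 84, 4), action_dimension=4)

state = np.zeros((1, 84, 84, 4), dtype='float32')
action = agent.mu(state)[0]            # greedy action from the Q-network

# A toy transition batch in the shapes the compiled update expects.
states      = np.zeros((32, 84, 84, 4), dtype='float32')
actions     = np.zeros(32, dtype='int8')    # X_action is a bvector
rewards     = np.zeros(32, dtype='float32')
next_states = np.zeros((32, 84, 84, 4), dtype='float32')
dones       = np.zeros(32, dtype='int8')    # X_done is a bvector
loss = agent.update_network(states, actions, rewards, next_states, dones)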
Example #26
    def __init__(self,n_hidden,embedding_dimention=50,feature_dimention=61):

        ## n_in: input dimension of the sequence LSTM
        ## n_hidden: hidden dimension of the LSTMs for the candidate NP and the zero pronoun (zp)

        #repre_active = ReLU
        repre_active = linear

        self.params = []
        self.w_embedding = init_weight_file(args.embedding,args.embedding_dimention)
        self.params.append(self.w_embedding)

        self.zp_x_pre_index = T.imatrix("zp_x_pre")
        self.zp_x_post_index = T.imatrix("zp_x_post")

        zp_x_pre_newshape = (T.shape(self.zp_x_pre_index)[0],args.embedding_dimention)
        self.embedding_sub_zp_pre = self.w_embedding[self.zp_x_pre_index.flatten()]
        self.zp_x_pre = T.reshape(self.embedding_sub_zp_pre,zp_x_pre_newshape)

        zp_x_post_newshape = (T.shape(self.zp_x_post_index)[0],args.embedding_dimention)
        self.embedding_sub_zp_post = self.w_embedding[self.zp_x_post_index.flatten()]
        self.zp_x_post = T.reshape(self.embedding_sub_zp_post,zp_x_post_newshape)

        zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre)
        self.params += zp_nn_pre.params
        
        zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post)
        self.params += zp_nn_post.params

        attention_pre_on_post = softmax((zp_nn_pre.nn_out*zp_nn_post.all_hidden).sum(axis=1))[0]
        attention_post_on_pre = softmax((zp_nn_post.nn_out*zp_nn_pre.all_hidden).sum(axis=1))[0] 

        zp_post = T.sum(attention_pre_on_post[:,None]*zp_nn_post.all_hidden,axis=0)
        zp_pre = T.sum(attention_post_on_pre[:,None]*zp_nn_pre.all_hidden,axis=0)

        #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))
        self.zp_out = T.concatenate((zp_post,zp_pre))

        self.zp_out_output = self.zp_out


        ### get sequence output for NP ###
        self.np_x_post_index = T.itensor3("np_x")
        self.np_x_postc_index = T.itensor3("np_x")
        self.np_x_pre_index = T.itensor3("np_x")
        self.np_x_prec_index = T.itensor3("np_x")

        np_x_post_newshape = (T.shape(self.np_x_post_index)[0],T.shape(self.np_x_post_index)[1],args.embedding_dimention)
        self.embedding_sub_np_x_post = self.w_embedding[self.np_x_post_index.flatten()]
        self.np_x_post = T.reshape(self.embedding_sub_np_x_post,np_x_post_newshape)

        np_x_postc_newshape = (T.shape(self.np_x_postc_index)[0],T.shape(self.np_x_postc_index)[1],args.embedding_dimention)
        self.embedding_sub_np_x_postc = self.w_embedding[self.np_x_postc_index.flatten()]
        self.np_x_postc = T.reshape(self.embedding_sub_np_x_postc,np_x_postc_newshape)

        np_x_pre_newshape = (T.shape(self.np_x_pre_index)[0],T.shape(self.np_x_pre_index)[1],args.embedding_dimention)
        self.embedding_sub_np_x_pre = self.w_embedding[self.np_x_pre_index.flatten()]
        self.np_x_pre = T.reshape(self.embedding_sub_np_x_pre,np_x_pre_newshape)

        np_x_prec_newshape = (T.shape(self.np_x_prec_index)[0],T.shape(self.np_x_prec_index)[1],args.embedding_dimention)
        self.embedding_sub_np_x_prec = self.w_embedding[self.np_x_prec_index.flatten()]
        self.np_x_prec = T.reshape(self.embedding_sub_np_x_prec,np_x_prec_newshape)

        self.mask_pre = T.matrix("mask")
        self.mask_prec = T.matrix("mask")

        self.mask_post = T.matrix("mask")
        self.mask_postc = T.matrix("mask")
    
        self.np_nn_pre = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_pre,self.np_x_prec,self.mask_pre,self.mask_prec)
        self.params += self.np_nn_pre.params
        self.np_nn_post = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_post,self.np_x_postc,self.mask_post,self.mask_postc)
        self.params += self.np_nn_post.params

        self.np_nn_post_output = self.np_nn_post.nn_out
        self.np_nn_pre_output = self.np_nn_pre.nn_out

        self.np_out = T.concatenate((self.np_nn_post_output,self.np_nn_pre_output),axis=1)

        np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out)
        self.params += np_nn_f.params
        np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1])
        self.params += np_nn_b.params

        self.bi_np_out = T.concatenate((np_nn_f.all_hidden,np_nn_b.all_hidden[::-1]),axis=1)

        self.np_out_output = self.bi_np_out
        #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output])

        self.feature = T.matrix("feature")
        self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active) 
        self.params += self.feature_layer.params

        w_attention_zp,b_attention = init_weight(n_hidden*2,1,pre="attention_zp",ones=False) 
        self.params += [w_attention_zp,b_attention]

        #w_attention_np,b_u = init_weight(n_hidden*2,1,pre="attention_np",ones=False) 
        #self.params += [w_attention_np]

        w_attention_np_rnn,b_u = init_weight(n_hidden*4,1,pre="attention_np_rnn",ones=False) 
        self.params += [w_attention_np_rnn]

        w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False) 
        self.params += [w_attention_feature]

        #self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)

        self.attention = softmax(T.transpose(self.calcu_attention,axes=(1,0)))[0]

        self.out = self.attention

        self.get_out = theano.function(inputs=[self.zp_x_pre_index,self.zp_x_post_index,self.np_x_pre_index,self.np_x_prec_index,self.np_x_post_index,self.np_x_postc_index,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature],outputs=[self.out],on_unused_input='warn')
        
        l2_norm_squared = sum([(w**2).sum() for w in self.params])
        l1_norm = sum([(abs(w)).sum() for w in self.params])

        lmbda_l1 = 0.0
        #lmbda_l2 = 0.001
        lmbda_l2 = 0.0

        t = T.bvector()
        cost = -(T.log((self.out*t).sum()))

        lr = T.scalar()
        
        updates = lasagne.updates.sgd(cost, self.params, lr)
        #updates = lasagne.updates.adadelta(cost, self.params)

        
        self.train_step = theano.function(
            inputs=[self.zp_x_pre_index,self.zp_x_post_index,self.np_x_pre_index,self.np_x_prec_index,self.np_x_post_index,self.np_x_postc_index,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature,t,lr],
            outputs=[cost],
            on_unused_input='warn',
            updates=updates)
Example #27
    def fit(self, data, sample_store=10000000):
        '''
        Trains the network.

        Parameters
        --------
        data : pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
        sample_store : int
            If additional negative samples are used (n_sample > 0), GPU utilization can be improved by precomputing a large batch of negative samples (and recomputing it when necessary).
            This parameter regulates the size of that precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are kept in RAM.
            For the most efficient computation, a balance must be found between storing few samples and interrupting GPU computations often but briefly, versus computing many samples and interrupting GPU computations rarely but for longer.

        '''
        self.predict = None
        self.error_during_train = False
        itemids = data[self.item_key].unique()
        self.n_items = len(itemids)
        self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids)
        data = pd.merge(data, pd.DataFrame({self.item_key:itemids, 'ItemIdx':self.itemidmap[itemids].values}), on=self.item_key, how='inner')
        offset_sessions = self.init(data)
        if self.n_sample:
            pop = data.groupby('ItemId').size()
            pop = pop[self.itemidmap.index.values].values**self.sample_alpha
            pop = pop.cumsum() / pop.sum()
            pop[-1] = 1
            if sample_store:
                generate_length = sample_store // self.n_sample
                if generate_length <= 1:
                    sample_store = 0
                    print('No example store was used')
                else:
                    neg_samples = self.generate_neg_samples(pop, generate_length)
                    sample_pointer = 0
            else:
                print('No example store was used')
        X = T.ivector()
        Y = T.ivector()
        M = T.iscalar()
        R = T.bvector()
        H_new, Y_pred, sparams, full_params, sidxs = self.model(X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed)
        cost = (M/self.batch_size) * self.loss_function(Y_pred, M)
        params = [self.Wx if self.embedding or self.constrained_embedding else self.Wx[1:], self.Wh, self.Wrz, self.Bh]
        updates = self.RMSprop(cost, params, full_params, sparams, sidxs)
        for i in range(len(self.H)):
            updates[self.H[i]] = H_new[i]
        train_function = function(inputs=[X, Y, M, R], outputs=cost, updates=updates, allow_input_downcast=True)
        base_order = np.argsort(data.groupby(self.session_key)[self.time_key].min().values) if self.time_sort else np.arange(len(offset_sessions)-1)
        data_items = data.ItemIdx.values
        for epoch in range(self.n_epochs):
            for i in range(len(self.layers)):
                self.H[i].set_value(np.zeros((self.batch_size,self.layers[i]), dtype=theano.config.floatX), borrow=True)
            c = []
            cc = []
            session_idx_arr = np.random.permutation(len(offset_sessions)-1) if self.train_random_order else base_order
            iters = np.arange(self.batch_size)
            maxiter = iters.max()
            start = offset_sessions[session_idx_arr[iters]]
            end = offset_sessions[session_idx_arr[iters]+1]
            finished = False
            while not finished:
                minlen = (end-start).min()
                out_idx = data_items[start]
                for i in range(minlen-1):
                    in_idx = out_idx
                    out_idx = data_items[start+i+1]
                    if self.n_sample:
                        if sample_store:
                            if sample_pointer == generate_length:
                                neg_samples = self.generate_neg_samples(pop, generate_length)
                                sample_pointer = 0
                            sample = neg_samples[sample_pointer]
                            sample_pointer += 1
                        else:
                            sample = self.generate_neg_samples(pop, 1)
                        y = np.hstack([out_idx, sample])
                    else:
                        y = out_idx
                        # NOTE: the nested branch below is unreachable (we are already in
                        # the else of `if self.n_sample:`) and `generate_samples` is not
                        # defined anywhere in this snippet.
                        if self.n_sample:
                            if sample_pointer == generate_length:
                                generate_samples()
                                sample_pointer = 0
                            sample_pointer += 1
                    reset = (start+i+1 == end-1)
                    cost = train_function(in_idx, y, len(iters), reset)
                    c.append(cost)
                    cc.append(len(iters))
                    if np.isnan(cost):
                        print(str(epoch) + ': NaN error!')
                        self.error_during_train = True
                        return
                start = start+minlen-1
                finished_mask = (end-start<=1)
                n_finished = finished_mask.sum()
                iters[finished_mask] = maxiter + np.arange(1,n_finished+1)
                maxiter += n_finished
                valid_mask = (iters < len(offset_sessions)-1)
                n_valid = valid_mask.sum()
                if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0):
                    finished = True
                    break
                mask = finished_mask & valid_mask
                sessions = session_idx_arr[iters[mask]]
                start[mask] = offset_sessions[sessions]
                end[mask] = offset_sessions[sessions+1]
                iters = iters[valid_mask]
                start = start[valid_mask]
                end = end[valid_mask]
                if n_valid < len(valid_mask):
                    for i in range(len(self.H)):
                        tmp = self.H[i].get_value(borrow=True)
                        tmp = tmp[valid_mask]
                        self.H[i].set_value(tmp, borrow=True)
            c = np.array(c)
            cc = np.array(cc)
            avgc = np.sum(c * cc) / np.sum(cc)
            if np.isnan(avgc):
                print('Epoch {}: NaN error!'.format(str(epoch)))
                self.error_during_train = True
                return
            print('Epoch {}\tloss: {:.6f}'.format(epoch, avgc))
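A minimal sketch (not part of the original class) of how the popularity-based negative-sample store built in fit() above could be produced. generate_neg_samples itself is not shown in this excerpt, so the inverse-CDF lookup via np.searchsorted is an assumption.

import numpy as np

def precompute_neg_samples(item_counts, sample_alpha=0.75, n_sample=32, sample_store=10000000):
    # popularity^alpha distribution, as in fit() above
    pop = item_counts.astype(np.float64) ** sample_alpha
    pop = pop.cumsum() / pop.sum()
    pop[-1] = 1.0                              # guard against rounding error at the tail
    generate_length = sample_store // n_sample
    # hypothetical sampling step: inverse-CDF lookup on uniform draws
    draws = np.random.rand(generate_length * n_sample)
    return np.searchsorted(pop, draws).reshape(generate_length, n_sample)

samples = precompute_neg_samples(np.array([50, 30, 15, 5]), n_sample=4, sample_store=40)
print(samples.shape)  # (10, 4): ten precomputed rows of four negative item indices each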
Beispiel #28
0
    def _init_model(self, in_size, out_size, slot_sizes, db, \
            n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \
            inputtype='full', sl='e2e', rl='e2e'):
        self.in_size = in_size
        self.out_size = out_size
        self.slot_sizes = slot_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid
        self.r_hid = self.n_hid
        self.sl = sl
        self.rl = rl

        table = db.table
        counts = db.counts
        m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
        prior = [db.priors[s] for s in dialog_config.inform_slots]
        unknown = [db.unks[s] for s in dialog_config.inform_slots]
        ids = [db.ids[s] for s in dialog_config.inform_slots]

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \
                T.btensor3('am'), T.fvector('r')
        T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(
            counts)
        db_index_var = T.imatrix('db')
        db_index_switch = T.bvector('s')

        l_mask_in = L.InputLayer(shape=(None, None), input_var=turn_mask)
        flat_mask = T.reshape(turn_mask,
                              (turn_mask.shape[0] * turn_mask.shape[1], 1))

        def _smooth(p):
            p_n = p + EPS
            return p_n / (p_n.sum(axis=1)[:, np.newaxis])

        def _add_unk(p, m, N):
            # p: B x V, m- num missing, N- total, p0: 1 x V
            t_unk = T.as_tensor_variable(float(m) / N)
            ps = p * (1. - t_unk)
            return T.concatenate([ps, T.tile(t_unk, (ps.shape[0], 1))], axis=1)

        def kl_divergence(p, q):
            p_n = _smooth(p)
            return -T.sum(q * T.log(p_n), axis=1)

        # belief tracking
        l_in = L.InputLayer(shape=(None, None, self.in_size),
                            input_var=input_var)
        p_vars = []
        pu_vars = []
        phi_vars = []
        p_targets = []
        phi_targets = []
        hid_in_vars = []
        hid_out_vars = []
        bt_loss = T.as_tensor_variable(0.)
        kl_loss = []
        x_loss = []
        self.trackers = []
        for i, s in enumerate(dialog_config.inform_slots):
            hid_in = T.fmatrix('h')
            l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in,  \
                    mask_input=l_mask_in,
                    grad_clipping=10.) # B x H x D
            l_b_in = L.ReshapeLayer(l_rnn,
                                    (input_var.shape[0] * input_var.shape[1],
                                     self.r_hid))  # BH x D
            hid_out = L.get_output(l_rnn)[:, -1, :]

            p_targ = T.ftensor3('p_target_' + s)
            p_t = T.reshape(
                p_targ,
                (p_targ.shape[0] * p_targ.shape[1], self.slot_sizes[i]))
            phi_targ = T.fmatrix('phi_target' + s)
            phi_t = T.reshape(phi_targ,
                              (phi_targ.shape[0] * phi_targ.shape[1], 1))

            l_b = L.DenseLayer(l_b_in,
                               self.slot_sizes[i],
                               nonlinearity=lasagne.nonlinearities.softmax)
            l_phi = L.DenseLayer(l_b_in,
                                 1,
                                 nonlinearity=lasagne.nonlinearities.sigmoid)

            phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
            p = L.get_output(l_b)
            p_u = _add_unk(p, m_unk[i], db.N)
            kl_loss.append(
                T.sum(flat_mask.flatten() * kl_divergence(p, p_t)) /
                T.sum(flat_mask))
            x_loss.append(
                T.sum(flat_mask *
                      lasagne.objectives.binary_crossentropy(phi, phi_t)) /
                T.sum(flat_mask))
            bt_loss += kl_loss[-1] + x_loss[-1]

            p_vars.append(p)
            pu_vars.append(p_u)
            phi_vars.append(phi)
            p_targets.append(p_targ)
            phi_targets.append(phi_targ)
            hid_in_vars.append(hid_in)
            hid_out_vars.append(hid_out)
            self.trackers.append(l_b)
            self.trackers.append(l_phi)
        self.bt_params = L.get_all_params(self.trackers)

        def check_db(pv, phi, Tb, N):
            O = T.alloc(0., pv[0].shape[0], Tb.shape[0])  # BH x T.shape[0]
            for i, p in enumerate(pv):
                p_dc = T.tile(phi[i], (1, Tb.shape[0]))
                O += T.log(p_dc*(1./db.table.shape[0]) + \
                        (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i]))
            Op = T.exp(O)  #+EPS # BH x T.shape[0]
            Os = T.sum(Op, axis=1)[:, np.newaxis]  # BH x 1
            return Op / Os

        def entropy(p):
            p = _smooth(p)
            return -T.sum(p * T.log(p), axis=-1)

        def weighted_entropy(p, q, p0, unks, idd):
            w = T.dot(idd, q.transpose())  # Pi x BH
            u = p0[np.newaxis, :] * (q[:, unks].sum(axis=1)[:, np.newaxis]
                                     )  # BH x Pi
            p_tilde = w.transpose() + u
            return entropy(p_tilde)

        p_db = check_db(pu_vars, phi_vars, T_var, N_var)  # BH x T.shape[0]

        if inputtype == 'entropy':
            H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \
                    for i,pv in enumerate(p_vars)]
            H_db = entropy(p_db)
            phv = [ph[:, 0] for ph in phi_vars]
            t_in = T.stacklists(H_vars + phv + [H_db]).transpose()  # BH x 2M+1
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x 2M+1
            l_in_pol = L.InputLayer(
                    shape=(None,None,2*len(dialog_config.inform_slots)+1), \
                    input_var=t_in_resh)
        else:
            in_reshaped = T.reshape(input_var,
                    (input_var.shape[0]*input_var.shape[1], \
                    input_var.shape[2]))
            prev_act = in_reshaped[:, -len(dialog_config.inform_slots):]
            t_in = T.concatenate(pu_vars + phi_vars + [p_db, prev_act],
                                 axis=1)  # BH x D-sum+A
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x D-sum
            l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \
                    3*len(dialog_config.inform_slots)+ \
                    table.shape[0]), input_var=t_in_resh)

        pol_in = T.fmatrix('pol-h')
        l_pol_rnn = L.GRULayer(l_in_pol,
                               n_hid,
                               hid_init=pol_in,
                               mask_input=l_mask_in,
                               grad_clipping=10.)  # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:, -1, :]
        l_den_in = L.ReshapeLayer(
            l_pol_rnn,
            (turn_mask.shape[0] * turn_mask.shape[1], n_hid))  # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, \
                nonlinearity=lasagne.nonlinearities.softmax) # BH x A

        self.network = l_out
        self.pol_params = L.get_all_params(self.network)
        self.params = self.bt_params + self.pol_params

        # db loss
        p_db_reshaped = T.reshape(
            p_db, (turn_mask.shape[0], turn_mask.shape[1], table.shape[0]))
        p_db_final = p_db_reshaped[:, -1, :]  # B x T.shape[0]
        p_db_final = _smooth(p_db_final)
        ix = T.tile(T.arange(p_db_final.shape[0]),
                    (db_index_var.shape[1], 1)).transpose()
        sample_probs = p_db_final[ix, db_index_var]  # B x K
        if dialog_config.SUCCESS_MAX_RANK == 1:
            log_db_probs = T.log(sample_probs).sum(axis=1)
        else:
            cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \
                    outputs_info=T.zeros_like(sample_probs[:,0]), \
                    sequences=sample_probs[:,:-1].transpose())
            cum_probs = T.clip(cum_probs.transpose(), 0., 1. - 1e-5)  # B x K-1
            log_db_probs = T.log(sample_probs).sum(
                axis=1) - T.log(1. - cum_probs).sum(axis=1)  # B
        log_db_probs = log_db_probs * db_index_switch

        # rl
        probs = L.get_output(self.network)  # BH x A
        probs = _smooth(probs)
        out_probs = T.reshape(probs, (turn_mask.shape[0], turn_mask.shape[1],
                                      self.out_size))  # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs * act_mask).sum(axis=2)  # B x H
        ep_probs = (act_probs * turn_mask).sum(axis=1)  # B
        H_probs = -T.sum(T.sum(out_probs * log_probs, axis=2), axis=1)  # B
        self.act_loss = -T.mean(ep_probs * reward_var)
        self.db_loss = -T.mean(log_db_probs * reward_var)
        self.reg_loss = -T.mean(ment * H_probs)
        self.loss = self.act_loss + self.db_loss + self.reg_loss

        self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \
                pol_in] + hid_in_vars
        self.obj_fn = theano.function(self.inps,
                                      self.loss,
                                      on_unused_input='warn')
        self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \
                [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn')
        self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss],
                                        on_unused_input='warn')
        self._rl_train_fn(self.learning_rate)

        ## sl
        sl_loss = 0. + bt_loss - T.mean(ep_probs)

        if self.sl == 'e2e':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        elif self.sl == 'bel':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        else:
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)

        sl_inps = [input_var, turn_mask, act_mask, pol_in
                   ] + p_targets + phi_targets + hid_in_vars
        self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \
                on_unused_input='warn')
        self.sl_obj_fn = theano.function(sl_inps,
                                         sl_loss,
                                         on_unused_input='warn')
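A small numpy check of the _smooth / kl_divergence pair defined above. EPS is not shown in this excerpt, so the value below is an assumption; note that with a one-hot target the quantity is the cross-entropy (negative log-likelihood) rather than a true KL divergence.

import numpy as np

EPS = 1e-8  # assumed; the original defines EPS elsewhere

def smooth(p):
    p_n = p + EPS
    return p_n / p_n.sum(axis=1, keepdims=True)

def kl_divergence(p, q):
    # same cross-entropy form as above: -sum_j q_j * log(p_j)
    return -np.sum(q * np.log(smooth(p)), axis=1)

p = np.array([[0.7, 0.3, 0.0]])   # predicted slot belief (one row per turn)
q = np.array([[1.0, 0.0, 0.0]])   # target distribution
print(kl_divergence(p, q))        # ~[0.357] == -log(0.7)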
Beispiel #29
0
	def __init__(
		self,
		rng,
		batchsize=100,
		activation=relu
	):
		
		import char_load
		(num_sent, char_cnt, word_cnt, max_word_len, max_sen_len,\
	    k_chr, k_wrd, x_chr, x_wrd, y) = char_load.read("tweets_clean.txt")

		dim_word = 30
		dim_char = 5
		cl_word = 300
		cl_char = 50
		k_word = k_wrd
		k_char = k_chr

		data_train_word,\
		data_test_word,\
		data_train_char,\
		data_test_char,\
		target_train,\
		target_test\
		= train_test_split(x_wrd, x_chr, y, random_state=1234, test_size=0.1)

		x_train_word = theano.shared(np.asarray(data_train_word, dtype='int16'), borrow=True)
		x_train_char = theano.shared(np.asarray(data_train_char, dtype='int16'), borrow=True)
		y_train = theano.shared(np.asarray(target_train, dtype='int8'), borrow=True)
		x_test_word = theano.shared(np.asarray(data_test_word, dtype='int16'), borrow=True)
		x_test_char = theano.shared(np.asarray(data_test_char, dtype='int16'), borrow=True)
		y_test = theano.shared(np.asarray(target_test, dtype='int8'), borrow=True)


		self.n_train_batches = x_train_word.get_value(borrow=True).shape[0] / batchsize
		self.n_test_batches = x_test_word.get_value(borrow=True).shape[0] / batchsize


		
		"""symbol definition"""
		index = T.iscalar()
		x_wrd = T.wmatrix('x_wrd')
		x_chr = T.wtensor3('x_chr')
		y = T.bvector('y')
		train = T.iscalar('train')

		"""network definition"""
		layer_char_embed_input = x_chr#.reshape((batchsize, max_sen_len, max_word_len))

		layer_char_embed = EmbedIDLayer(
			rng,
			layer_char_embed_input,
			n_input=char_cnt,
			n_output=dim_char
		)

		layer1_input = layer_char_embed.output.reshape(
			(batchsize*max_sen_len, 1, max_word_len, dim_char)
		)

		layer1 = ConvolutionalLayer(
			rng,
			layer1_input,
			filter_shape=(cl_char, 1, k_char, dim_char),  # cl_char: number of character filters
			image_shape=(batchsize*max_sen_len, 1, max_word_len, dim_char)
		)

		layer2 = MaxPoolingLayer(
			layer1.output,
			poolsize=(max_word_len-k_char+1, 1)
		)

		layer_word_embed_input = x_wrd #.reshape((batchsize, max_sen_len))

		layer_word_embed = EmbedIDLayer(
			rng,
			layer_word_embed_input,
			n_input=word_cnt,
			n_output=dim_word
		)

		layer3_word_input = layer_word_embed.output.reshape((batchsize, 1, max_sen_len, dim_word))
		layer3_char_input = layer2.output.reshape((batchsize, 1, max_sen_len, cl_char))


		layer3_input = T.concatenate(
			[layer3_word_input,
			 layer3_char_input],
			axis=3
		)#.reshape((batchsize, 1, max_sen_len, dim_word+cl_char))


		layer3 = ConvolutionalLayer(
			rng,
			layer3_input,
			filter_shape=(cl_word, 1, k_word, dim_word + cl_char),  # 1 is the number of input channels
			image_shape=(batchsize, 1, max_sen_len, dim_word + cl_char),
			activation=activation
		)

		layer4 = MaxPoolingLayer(
			layer3.output,
			poolsize=(max_sen_len-k_word+1, 1)
		)

		layer5_input = layer4.output.reshape((batchsize, cl_word))

		layer5 = FullyConnectedLayer(
			rng,
			dropout(rng, layer5_input, train),
			n_input=cl_word,
			n_output=50,
			activation=activation
		)

		layer6_input = layer5.output

		layer6 = FullyConnectedLayer(
			rng,
			dropout(rng, layer6_input, train, p=0.1),
			n_input=50,
			n_output=2,
			activation=None
		)

		result = Result(layer6.output, y)
		loss = result.negative_log_likelihood()
		accuracy = result.accuracy()
		params = layer6.params\
				+layer5.params\
				+layer3.params\
				+layer_word_embed.params\
				+layer1.params\
				+layer_char_embed.params
		updates = RMSprop(learning_rate=0.001, params=params).updates(loss)

		self.train_model = theano.function(
			inputs=[index],
			outputs=[loss, accuracy],
			updates=updates,
			givens={
				x_wrd: x_train_word[index*batchsize: (index+1)*batchsize],
				x_chr: x_train_char[index*batchsize: (index+1)*batchsize],
				y: y_train[index*batchsize: (index+1)*batchsize],
				train: np.cast['int32'](1)
			}
		)

		self.test_model = theano.function(
			inputs=[index],
			outputs=[loss, accuracy],
			givens={
				x_wrd: x_test_word[index*batchsize: (index+1)*batchsize],
				x_chr: x_test_char[index*batchsize: (index+1)*batchsize],
				y: y_test[index*batchsize: (index+1)*batchsize],
				train: np.cast['int32'](0)
			}
		)
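A minimal driver loop (not part of the original) showing how the index-based train_model / test_model functions above are typically used; the attribute names come from the constructor, everything else is assumed.

import numpy as np

def run_epochs(model, n_epochs=10):
    for epoch in range(n_epochs):
        train_loss, train_acc = [], []
        for i in range(int(model.n_train_batches)):
            loss, acc = model.train_model(i)   # `givens` slices the shared arrays by minibatch index
            train_loss.append(loss)
            train_acc.append(acc)
        test_acc = [model.test_model(i)[1] for i in range(int(model.n_test_batches))]
        print('epoch %d  loss %.4f  train acc %.4f  test acc %.4f' %
              (epoch, np.mean(train_loss), np.mean(train_acc), np.mean(test_acc)))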
Beispiel #30
0
    def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61):

        ## n_in: input dimension of the sequence LSTM
        ## n_hidden: hidden-layer dimension of the LSTMs for the candidate and ZP

        self.params = []

        self.zp_x_pre = T.matrix("zp_x_pre")
        self.zp_x_post = T.matrix("zp_x_post")

        zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre)
        #zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre_dropout)
        self.params += zp_nn_pre.params

        zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post)
        #zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post_dropout)
        self.params += zp_nn_post.params

        self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out))

        self.zp_out_output = self.zp_out

        ### get sequence output for NP ###
        self.np_x_post = T.tensor3("np_x")
        self.np_x_postc = T.tensor3("np_x")

        self.np_x_pre = T.tensor3("np_x")
        self.np_x_prec = T.tensor3("np_x")

        self.mask_pre = T.matrix("mask")
        self.mask_prec = T.matrix("mask")

        self.mask_post = T.matrix("mask")
        self.mask_postc = T.matrix("mask")

        self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden,
                                        self.np_x_pre, self.np_x_prec,
                                        self.mask_pre, self.mask_prec)
        self.params += self.np_nn_pre.params
        self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden,
                                         self.np_x_post, self.np_x_postc,
                                         self.mask_post, self.mask_postc)
        self.params += self.np_nn_post.params

        self.np_nn_post_output = self.np_nn_post.nn_out
        self.np_nn_pre_output = self.np_nn_pre.nn_out

        self.np_out = T.concatenate(
            (self.np_nn_post_output, self.np_nn_pre_output), axis=1)

        #np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out)
        np_nn_f = RNN(n_hidden * 2, n_hidden * 2, self.np_out)
        self.params += np_nn_f.params
        #np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1])
        np_nn_b = RNN(n_hidden * 2, n_hidden * 2, self.np_out[::-1])
        self.params += np_nn_b.params

        self.bi_np_out = T.concatenate(
            (np_nn_f.all_hidden, np_nn_b.all_hidden[::-1]), axis=1)

        self.np_out_output = self.bi_np_out
        self.get_np_out = theano.function(inputs=[
            self.np_x_pre, self.np_x_prec, self.np_x_post, self.np_x_postc,
            self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc
        ],
                                          outputs=[self.np_out_output])

        #self.feature = T.matrix("feature")
        #self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active)
        #self.params += self.feature_layer.params

        w_attention_zp, b_attention = init_weight(n_hidden * 2,
                                                  1,
                                                  pre="attention_zp",
                                                  ones=False)
        self.params += [w_attention_zp, b_attention]

        w_attention_np, b_u = init_weight(n_hidden * 2,
                                          1,
                                          pre="attention_np",
                                          ones=False)
        self.params += [w_attention_np]

        w_attention_np_rnn, b_u = init_weight(n_hidden * 4,
                                              1,
                                              pre="attention_np_rnn",
                                              ones=False)
        self.params += [w_attention_np_rnn]

        #w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False)
        #self.params += [w_attention_feature]

        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        self.calcu_attention = tanh(
            T.dot(self.np_out_output, w_attention_np_rnn) +
            T.dot(self.zp_out_output, w_attention_zp) +
            T.dot(self.np_out, w_attention_np) + b_attention)
        #self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)

        self.attention = softmax(T.transpose(self.calcu_attention,
                                             axes=(1, 0)))[0]
        #self.attention = T.transpose(self.calcu_attention,axes=(1,0))[0]

        t = T.bvector()
        max_attention = (self.attention * t).max()

        self.out = self.attention

        self.get_out = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec,
            self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec,
            self.mask_post, self.mask_postc
        ],
                                       outputs=[self.out],
                                       on_unused_input='warn')

        self.get_max = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec,
            self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec,
            self.mask_post, self.mask_postc, t
        ],
                                       outputs=[max_attention],
                                       on_unused_input='warn')

        l2_norm_squared = sum([(w**2).sum() for w in self.params])
        l1_norm = sum([(abs(w)).sum() for w in self.params])

        lmbda_l1 = 0.0
        #lmbda_l2 = 0.001
        lmbda_l2 = 0.0

        cost = ((1 - t) * (1 - max_attention + self.out)).sum()
        #cost = -(T.log((self.out*t).sum()))

        lr = T.scalar()

        updates = lasagne.updates.sgd(cost, self.params, lr)
        #updates = lasagne.updates.adadelta(cost, self.params)
        #updates = lasagne.updates.adam(cost, self.params)

        self.train_step = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x_pre, self.np_x_prec,
            self.np_x_post, self.np_x_postc, self.mask_pre, self.mask_prec,
            self.mask_post, self.mask_postc, t, lr
        ],
                                          outputs=[cost],
                                          on_unused_input='warn',
                                          updates=updates)
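A toy numpy check (made-up numbers) of the margin-style cost above: every wrong candidate (t == 0) contributes 1 - max_correct_attention + its own attention.

import numpy as np

attention = np.array([0.50, 0.30, 0.20])  # softmax attention over three candidates
t = np.array([1, 0, 0], dtype=np.int8)    # the first candidate is the true antecedent

max_attention = (attention * t).max()     # attention mass on the best correct candidate
cost = ((1 - t) * (1 - max_attention + attention)).sum()
print(cost)                               # (1-0.5+0.3) + (1-0.5+0.2) = 1.5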
Beispiel #31
0
    def __init__(self,
                 nkerns,
                 recept_width,
                 pool_width,
                 dropout_prob,
                 training_batch_size,
                 activation,
                 n_timesteps=1000,
                 dim=18):

        if activation == 'tanh':
            activation_function = lambda x: T.tanh(x)
        elif activation == 'relu':
            activation_function = lambda x: T.maximum(0.0, x)
        else:
            raise ValueError('unknown activation function')

        self.training_batch_size = training_batch_size

        rng = np.random.RandomState(23455)

        self.training_mode = T.iscalar('training_mode')
        self.x = T.matrix('x')
        self.y = T.bvector('y')
        self.batch_size = theano.shared(self.training_batch_size)

        # 18@1*1000
        self.layer0_input = self.x.reshape(
            (self.batch_size, dim, 1, n_timesteps))

        # image 18 @ 1*1000
        # c1: nkerns[0] @ 1* (1000 - recept_width[0] + 1)
        # s2: nkerns[0] @ 1 * c1 / pool_width[0]
        layer0 = ConvPoolLayer(rng,
                               input=self.layer0_input,
                               image_shape=(None, dim, 1, n_timesteps),
                               filter_shape=(nkerns[0], dim, 1,
                                             recept_width[0]),
                               poolsize=(1, pool_width[0]),
                               activation_function=activation_function)

        # c3: nkerns[1] @ 1 * (s2 - recept_width[1] + 1)
        # s4  nkerns[1] @ 1 *  c3 / pool_width
        input_layer1_width = (n_timesteps - recept_width[0] +
                              1) / pool_width[0]
        layer1 = ConvPoolLayer(rng,
                               input=layer0.output,
                               image_shape=(None, nkerns[0], 1,
                                            input_layer1_width),
                               filter_shape=(nkerns[1], nkerns[0], 1,
                                             recept_width[1]),
                               poolsize=(1, pool_width[1]),
                               activation_function=activation_function)

        # s4:(batch_size, nkerns[1], 1, s4) -> flatten(2) -> (batch_size, nkerns[1]* 1 * s4)
        layer2_input = layer1.output.flatten(2)

        input_layer2_size = (input_layer1_width - recept_width[1] +
                             1) / pool_width[1]
        # c5: 120@1*1
        self.layer2 = HiddenLayer(rng=rng,
                                  input=layer2_input,
                                  n_in=nkerns[1] * 1 * input_layer2_size,
                                  n_out=nkerns[2],
                                  training_mode=self.training_mode,
                                  dropout_prob=dropout_prob,
                                  activation_function=activation_function)
        # f6/output
        self.layer3 = LogisticRegressionLayer(input=self.layer2.output,
                                              n_in=nkerns[2],
                                              n_out=2,
                                              training_mode=self.training_mode,
                                              dropout_prob=dropout_prob)

        self.params = self.layer3.params + self.layer2.params + layer1.params + layer0.params
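A worked example (with assumed hyper-parameters) of the width arithmetic used above; the original's `/` is Python 2 integer division, so `//` is used here.

n_timesteps = 1000
recept_width = [45, 25]
pool_width = [5, 3]

# width after conv = width - recept_width + 1; pooling then divides by pool_width
input_layer1_width = (n_timesteps - recept_width[0] + 1) // pool_width[0]        # (1000-45+1)//5 = 191
input_layer2_size = (input_layer1_width - recept_width[1] + 1) // pool_width[1]  # (191-25+1)//3 = 55
print(input_layer1_width, input_layer2_size)  # 191 55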
Beispiel #32
0
    def initialize(self, policy, env_spec, sample_size, horizon,
                   mid_batch_reset):
        if mid_batch_reset and policy.recurrent:
            raise NotImplementedError

        obs = env_spec.observation_space.new_tensor_variable('obs',
                                                             extra_dims=1)
        act = env_spec.action_space.new_tensor_variable('act', extra_dims=1)
        adv = T.vector('adv')
        ret = T.vector('ret')
        old_value = T.vector('old_value')

        dist = policy.distribution
        old_dist_info = {
            k: T.matrix('old_%s' % k)
            for k in dist.dist_info_keys
        }
        self._dist_info_keys = dist.dist_info_keys
        state_info = {k: T.matrix(k) for k in policy.state_info_keys}
        self._state_info_keys = policy.state_info_keys
        new_dist_info = policy.dist_info_sym(obs, state_info_vars=state_info)
        new_value = policy.value_sym(obs, state_info_vars=state_info)

        self._lr_mult = theano.shared(np.array(1., dtype=theano.config.floatX),
                                      name='lr_mult')

        if mid_batch_reset and not policy.recurrent:
            self._use_valids = False
            valids = None  # will be ignored inside valids_mean()
        else:
            self._use_valids = True
            valids = T.bvector('valids')  # dtype int8

        v_err = (new_value - ret)**2
        v_loss = self.v_loss_coeff * valids_mean(v_err, valids)
        ent = policy.distribution.entropy_sym(new_dist_info)
        ent_loss = -self.ent_loss_coeff * valids_mean(ent, valids)
        pi_loss = \
            self.pi_loss(policy, act, adv, old_dist_info, new_dist_info, valids)
        losses = (pi_loss, v_loss, ent_loss)

        pi_kl = valids_mean(dist.kl_sym(old_dist_info, new_dist_info), valids)
        v_kl = valids_mean((new_value - old_value)**2, valids)
        constraints = (pi_kl, v_kl)

        input_list = [obs, act, adv, ret, old_value]
        old_dist_info_list = [old_dist_info[k] for k in dist.dist_info_keys]
        state_info_list = [state_info[k] for k in policy.state_info_keys]
        input_list += old_dist_info_list + state_info_list

        opt_examples = dict(
            advantages=np.array(1, dtype=adv.dtype),
            returns=np.array(1, dtype=ret.dtype),
        )
        if self._use_valids:
            input_list.append(valids)
            opt_examples["valids"] = np.array(1, dtype=np.int8)

        self.optimizer.initialize(
            inputs=input_list,
            losses=losses,
            constraints=constraints,
            target=policy,
            lr_mult=self._lr_mult,
        )

        self._opt_buf = buffer_with_segs_view(opt_examples,
                                              sample_size,
                                              horizon,
                                              shared=False)
        self._batch_size = sample_size
        self._mid_batch_reset = mid_batch_reset
        self._horizon = horizon

        self.policy = policy
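valids_mean is not shown in this excerpt; a plausible numpy sketch, assuming it averages over the timesteps whose int8 mask is 1 and falls back to a plain mean when valids is None.

import numpy as np

def valids_mean(x, valids=None):
    # assumed behaviour: masked mean over valid steps, plain mean otherwise
    if valids is None:
        return x.mean()
    valids = valids.astype(x.dtype)
    return (x * valids).sum() / np.maximum(valids.sum(), 1.0)

errors = np.array([1.0, 4.0, 9.0, 16.0])
mask = np.array([1, 1, 0, 0], dtype=np.int8)  # e.g. steps after a mid-batch reset are invalid
print(valids_mean(errors, mask))  # 2.5
print(valids_mean(errors))        # 7.5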
Beispiel #33
0
 def inputs(self):
     return {
         'call_type':
         tensor.bvector('call_type'),
         'origin_call':
         tensor.ivector('origin_call'),
         'origin_stand':
         tensor.bvector('origin_stand'),
         'taxi_id':
         tensor.wvector('taxi_id'),
         'timestamp':
         tensor.ivector('timestamp'),
         'day_type':
         tensor.bvector('day_type'),
         'missing_data':
         tensor.bvector('missing_data'),
         'latitude':
         tensor.matrix('latitude'),
         'longitude':
         tensor.matrix('longitude'),
         'destination_latitude':
         tensor.vector('destination_latitude'),
         'destination_longitude':
         tensor.vector('destination_longitude'),
         'travel_time':
         tensor.ivector('travel_time'),
         'first_k_latitude':
         tensor.matrix('first_k_latitude'),
         'first_k_longitude':
         tensor.matrix('first_k_longitude'),
         'last_k_latitude':
         tensor.matrix('last_k_latitude'),
         'last_k_longitude':
         tensor.matrix('last_k_longitude'),
         'input_time':
         tensor.ivector('input_time'),
         'week_of_year':
         tensor.bvector('week_of_year'),
         'day_of_week':
         tensor.bvector('day_of_week'),
         'qhour_of_day':
         tensor.bvector('qhour_of_day'),
         'candidate_call_type':
         tensor.bvector('candidate_call_type'),
         'candidate_origin_call':
         tensor.ivector('candidate_origin_call'),
         'candidate_origin_stand':
         tensor.bvector('candidate_origin_stand'),
         'candidate_taxi_id':
         tensor.wvector('candidate_taxi_id'),
         'candidate_timestamp':
         tensor.ivector('candidate_timestamp'),
         'candidate_day_type':
         tensor.bvector('candidate_day_type'),
         'candidate_missing_data':
         tensor.bvector('candidate_missing_data'),
         'candidate_latitude':
         tensor.matrix('candidate_latitude'),
         'candidate_longitude':
         tensor.matrix('candidate_longitude'),
         'candidate_destination_latitude':
         tensor.vector('candidate_destination_latitude'),
         'candidate_destination_longitude':
         tensor.vector('candidate_destination_longitude'),
         'candidate_travel_time':
         tensor.ivector('candidate_travel_time'),
         'candidate_first_k_latitude':
         tensor.matrix('candidate_first_k_latitude'),
         'candidate_first_k_longitude':
         tensor.matrix('candidate_first_k_longitude'),
         'candidate_last_k_latitude':
         tensor.matrix('candidate_last_k_latitude'),
         'candidate_last_k_longitude':
         tensor.matrix('candidate_last_k_longitude'),
         'candidate_input_time':
         tensor.ivector('candidate_input_time'),
         'candidate_week_of_year':
         tensor.bvector('candidate_week_of_year'),
         'candidate_day_of_week':
         tensor.bvector('candidate_day_of_week'),
         'candidate_qhour_of_day':
         tensor.bvector('candidate_qhour_of_day')
     }
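The dict above picks the narrowest integer type that fits each categorical feature (bvector = int8, wvector = int16, ivector = int32). A hypothetical helper making that choice explicit by cardinality; the cardinalities in the usage lines are illustrative.

from theano import tensor

def int_vector(name, cardinality):
    # pick the narrowest signed integer vector that can hold the IDs
    if cardinality <= 127:
        return tensor.bvector(name)   # int8: day_of_week, qhour_of_day, ...
    if cardinality <= 32767:
        return tensor.wvector(name)   # int16: taxi_id, ...
    return tensor.ivector(name)       # int32: origin_call, timestamp, ...

week_of_year = int_vector('week_of_year', 53)  # fits in int8
taxi_id = int_vector('taxi_id', 448)           # > 127, so int16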
Beispiel #34
0
    def get_updates_functions(self):
        tind = T.ivector('ind')

        if self.NMF_updates == 'beta':
            print "Standard rules for beta-divergence"
            H_update = T.set_subtensor(self.H[tind[3]:tind[4], ],
                                       updates.beta_H(self.X_buff[tind[1]:tind[2], ],
                                                      self.W[tind[0]],
                                                      self.H[tind[3]:tind[4], ],
                                                      self.beta))
            W_update = T.set_subtensor(self.W[tind[0]],
                                       updates.beta_W(self.X_buff[tind[1]:tind[2], ],
                                                      self.W[tind[0]],
                                                      self.H[tind[3]:tind[4], ],
                                                      self.beta))
            self.trainH = theano.function(inputs=[tind],
                                          outputs=[],
                                          updates={self.H: H_update},
                                          name="trainH",
                                          allow_input_downcast=True)
            self.trainW = theano.function(inputs=[tind],
                                          outputs=[],
                                          updates={self.W: W_update},
                                          name="trainW",
                                          allow_input_downcast=True)

        if self.NMF_updates == 'groupNMF':
            tcomp = T.ivector('comp')
            tlambda = T.fvector('lambda')
            tcard = T.bvector('card')

            print "Group NMF with class specific rules for beta-divergence"
            if self.dist_mode=='iter':
                tparams = [tind, tcomp, tlambda, tcard]
                print "Compute contraint distances once per iteration" 
                H_update = T.set_subtensor(self.H[tind[3]:tind[4], ],
                                           updates.group_H(self.X_buff[tind[1]:tind[2], ],
                                                           self.W[tind[0]],
                                                           self.H,
                                                           self.beta,
                                                           tparams))
                W_update = T.set_subtensor(self.W[tind[0]],
                                           updates.group_W_nosum(self.X_buff[tind[1]:tind[2], ],
                                                           self.W,
                                                           self.H[tind[3]:tind[4], ],
                                                           self.cls_sums[tind[5]],
                                                           self.ses_sums[tind[6]],
                                                           self.beta,
                                                           tparams))
                self.trainH = theano.function(inputs=[tind,
                                                      tcomp,
                                                      tlambda,
                                                      tcard],
                                              outputs=[],
                                              updates={self.H: H_update},
                                              name="trainH",
                                              on_unused_input='ignore',
                                              allow_input_downcast=True)
                self.trainW = theano.function(inputs=[tind,
                                                      tcomp,
                                                      tlambda,
                                                      tcard],
                                              outputs=[],
                                              updates={self.W: W_update},
                                              name="trainW",
                                              on_unused_input='ignore',
                                              allow_input_downcast=True)

            else:
                print "Compute contraint distances at each segment update"  
                tSc = T.ivector('Sc')
                tCs = T.ivector('Cs')
                tparams = [tind, tcomp, tlambda, tSc, tCs, tcard]                
                H_update = T.set_subtensor(self.H[tind[3]:tind[4], ],
                                           updates.group_H(self.X_buff[tind[1]:tind[2], ],
                                                           self.W[tind[0]],
                                                           self.H,
                                                           self.beta,
                                                           tparams))
                W_update = T.set_subtensor(self.W[tind[0]],
                                           updates.group_W(self.X_buff[tind[1]:tind[2], ],
                                                           self.W,
                                                           self.H[tind[3]:tind[4], ],
                                                           self.beta,
                                                           tparams))
                self.trainH = theano.function(inputs=[tind,
                                                      tcomp,
                                                      tlambda,
                                                      tSc,
                                                      tCs,
                                                      tcard],
                                              outputs=[],
                                              updates={self.H: H_update},
                                              name="trainH",
                                              on_unused_input='ignore',
                                              allow_input_downcast=True)
                self.trainW = theano.function(inputs=[tind,
                                                      tcomp,
                                                      tlambda,
                                                      tSc,
                                                      tCs,
                                                      tcard],
                                              outputs=[],
                                              updates={self.W: W_update},
                                              name="trainW",
                                              on_unused_input='ignore',
                                              allow_input_downcast=True)
        if self.NMF_updates == 'noiseNMF':
            tcomp = T.ivector('comp')
            tlambda = T.fvector('lambda')
            tcard = T.bvector('card')

            print "Group NMF with noise reference rules for beta-divergence" 
            tSc = T.ivector('Sc')
            tCs = T.ivector('Cs')
            tparams = [tind, tcomp, tlambda, tSc, tCs, tcard]               
            H_update = T.set_subtensor(self.H[tind[3]:tind[4], ],
                                       updates.group_H(self.X_buff[tind[1]:tind[2], ],
                                                       self.W[tind[0]],
                                                       self.H,
                                                       self.beta,
                                                       tparams))
            W_update = T.set_subtensor(self.W[tind[0]],
                                       updates.noise_W(self.X_buff[tind[1]:tind[2], ],
                                                       self.W,
                                                       self.Wn,
                                                       self.H[tind[3]:tind[4], ],
                                                       self.beta,
                                                       tparams))
            self.trainH = theano.function(inputs=[tind,
                                                  tcomp,
                                                  tlambda,
                                                  tSc,
                                                  tCs,
                                                  tcard],
                                          outputs=[],
                                          updates={self.H: H_update},
                                          name="trainH",
                                          on_unused_input='ignore',
                                          allow_input_downcast=True)
            self.trainW = theano.function(inputs=[tind,
                                                  tcomp,
                                                  tlambda,
                                                  tSc,
                                                  tCs,
                                                  tcard],
                                          outputs=[],
                                          updates={self.W: W_update},
                                          name="trainW",
                                          on_unused_input='ignore',
                                          allow_input_downcast=True)
Beispiel #35
0
def create_iter_functions(dataset, output_layer,
                          batch_size=MINIBATCH_SIZE):
    print("Creating IterFunctions...")
    
    batch_index = T.iscalar('batch_index')
    X_batch     = T.imatrix('x')
    
    # See http://stackoverflow.com/questions/25166657/index-gymnastics-inside-a-theano-function
    # And https://bitbucket.org/kostialopuhin/word-models/src/ba4b00bb03c7eee83b11dc729fd4f6a58ab21fb6/word_embeddings.py?at=default
    vectors = dataset['language']['vectors']
    
    #X_batch_flat_vectors =  vectors[X_batch].reshape( (X_batch.shape[0], -1) )  # next line is more explicit, for safety
    X_batch_flat_vectors =  vectors[X_batch].reshape( (X_batch.shape[0], vectors.shape[1]*X_batch.shape[1] ) )
    
    #Y_batch = T.ivector('y') 
    Y_batch = T.bvector('y') # This is smaller...
    batch_slice = slice(
        batch_index * batch_size, (batch_index + 1) * batch_size
    )

    # Output layer vector position assignment :
    # a = NotMissing
    # b = Missing (complex)
    # c-x = Missing a simple word (take shift into account)
    
    def loss(output):
        # This pulls out log(output) at the correct index position for each element of the mini-batch,
        # and takes the mean
        return -T.mean(T.log(output)[T.arange(Y_batch.shape[0]), Y_batch])  

    loss_train = loss(output_layer.get_output(X_batch_flat_vectors))
    loss_eval  = loss(output_layer.get_output(X_batch_flat_vectors, deterministic=True))  # deterministic=True turns off dropout

    # But this (for the first runs) easy to model as a soft-max thing 
    # from 0=(nogap), 1=(complex), 2..(small_limit+2)=small-word
    pred = T.argmax(
        output_layer.get_output(X_batch_flat_vectors, deterministic=True), axis=1
    )
    accuracy = T.mean(T.eq(pred, Y_batch), dtype=theano.config.floatX)  # Would otherwise use float64

    all_params = lasagne.layers.get_all_params(output_layer)
    
    #updates = lasagne.updates.nesterov_momentum(
    #    loss_train, all_params, learning_rate, momentum
    #)
    
    #def adagrad(loss, all_params, learning_rate=1.0, epsilon=1e-6):
    #updates = lasagne.updates.adagrad(
    #    loss_train, all_params #, learning_rate, momentum
    #)

    #def adadelta(loss, all_params, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    updates = lasagne.updates.adadelta(
        loss_train, all_params #, learning_rate, momentum
    )
    
    iters={}
    
    if 'train' in dataset:
        d=dataset['train']
        iters['train'] = theano.function(
            [batch_index], loss_train,
            updates=updates,
            givens={
                X_batch: d['X'][batch_slice],
                Y_batch: d['Y'][batch_slice],
            },
        )

    if 'valid' in dataset:
        d=dataset['valid']
        iters['valid'] = theano.function(
            [batch_index], [loss_eval, accuracy],
            givens={
                X_batch: d['X'][batch_slice],
                Y_batch: d['Y'][batch_slice],
            },
        )

    if 'test' in dataset:
        d=dataset['test']
        iters['test'] = theano.function(
            [batch_index], [loss_eval, accuracy],
            givens={
                X_batch: d['X'][batch_slice],
                Y_batch: d['Y'][batch_slice],
            },
        )

    return iters
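A toy numpy check (made-up numbers) of the indexing trick in loss() above: log(output)[arange(batch), Y] picks each row's log-probability of its true class.

import numpy as np

output = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.8, 0.1]])   # softmax outputs for a batch of two
Y = np.array([0, 1], dtype=np.int8)    # true class indices

nll = -np.mean(np.log(output)[np.arange(Y.shape[0]), Y])
print(nll)  # -(log 0.7 + log 0.8) / 2 ~= 0.290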
Beispiel #36
0
def construct_network(context,characters,hidden):
	print "Setting up memory..."
	X = T.bmatrix('X')
	Y = T.bvector('Y')
	zeros = np.zeros(characters,dtype=np.int8)
	zeros[0] = 1
	zeros[1] = 1

	alpha = T.cast(T.fscalar('alpha'),dtype=theano.config.floatX)
	lr    = T.cast(T.fscalar('lr'),dtype=theano.config.floatX)
	Ws_char_to_hidden   = [
			U.create_shared(
				U.initial_weights(characters,hidden),
				name='char[%d]'%i
			) for i in xrange(context) 
		]
	mat = Ws_char_to_hidden[0].get_value()
	mat[0] = 0
	Ws_char_to_hidden[0].set_value(mat)
	W_hidden_to_hidden_i = U.create_shared(U.initial_weights(hidden,hidden) + np.eye(hidden))
	b_hidden_i           = U.create_shared(U.initial_weights(hidden))
	W_hidden_to_hidden_o = U.create_shared(U.initial_weights(hidden,hidden) + np.eye(hidden))
	b_hidden_o           = U.create_shared(U.initial_weights(hidden))
	W_hidden_to_predict  = U.create_shared(U.initial_weights(hidden,characters))
	b_predict            = U.create_shared(U.initial_weights(characters))
	W_predict_to_hidden  = U.create_shared(U.initial_weights(characters,hidden))
	gen_weight_mask      = U.create_shared(zeros,name='mask')
	print "Constructing graph..."
	hidden_inputs  = make_char_outputs(X,Ws_char_to_hidden)
	hidden_outputs,predictions = make_hidden_predict_outputs(
			hidden,characters,
			hidden_inputs,
			gen_weight_mask[X[:,0]],
			W_hidden_to_hidden_i,
			b_hidden_i,
			W_hidden_to_hidden_o,
			b_hidden_o,
			W_hidden_to_predict,
			b_predict,
			W_predict_to_hidden			
		)


	weights = Ws_char_to_hidden + [
					W_hidden_to_hidden_i,
					b_hidden_i, 
					W_hidden_to_hidden_o,
					b_hidden_o, 
					W_hidden_to_predict,
					b_predict,
					W_predict_to_hidden
				]
	cost    = -T.mean(T.log(predictions)[T.arange(Y.shape[0]),Y])
	gparams =  T.grad(cost,weights)

	deltas  = [ U.create_shared(np.zeros(w.get_value().shape)) for w in weights ]
	updates = [
				( param, param - ( alpha * delta + gparam * lr ) )
					for param,delta,gparam in zip(weights,deltas,gparams)
			] + [
				( delta, alpha * delta + gparam * lr)
					for delta,gparam in zip(deltas,gparams)
			]
	return X,Y,alpha,lr,updates,predictions,weights
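A one-step numeric trace (made-up numbers) of the momentum update assembled above: the new delta is alpha*delta + lr*grad, and the parameter moves by exactly that amount.

alpha, lr = 0.9, 0.1
param, delta, grad = 1.0, 0.05, 0.4

delta = alpha * delta + lr * grad   # 0.9*0.05 + 0.1*0.4 = 0.085
param = param - delta               # 1.0 - 0.085 = 0.915
print(param, delta)                 # 0.915 0.085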
Beispiel #37
0
                             num_units    = n_state,
                             nonlinearity = tanh,
                             W         = Normal(0.1, 0.0),
                             b         = Constant(0.0))

    q_values    = DenseLayer(dense_2,
                             num_units    = n_action,
                             nonlinearity = None,
                             W         = Normal(0.1, 0.0),
                             b         = Constant(0.0))

    return q_values

X_next_state     = T.fmatrix()
X_state          = T.fmatrix()
X_action         = T.bvector()
X_reward         = T.fvector()
X_done           = T.bvector()

X_action_hot = to_one_hot(X_action, n_action)

q_        = q_network(X_state);      q        = get_output(q_)
q_target_ = q_network(X_next_state); q_target = get_output(q_target_)
q_max     = T.max(q_target, axis=1)
action    = T.argmax(q, axis=1)

mu = theano.function(inputs               = [X_state],
                     outputs              = action,
                     allow_input_downcast = True)

loss = squared_error(X_reward + gamma * q_max * (1.0 - X_done), T.batched_dot(q, X_action_hot))
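A minimal numpy sketch (assumed values) of the Q-learning target inside the squared_error call above: target = reward + gamma * max_a' Q_target(s', a') * (1 - done).

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0])
q_next = np.array([[0.2, 0.5],         # Q_target(s', .) for two transitions
                   [0.4, 0.1]])
done = np.array([0, 1], dtype=np.int8)

target = reward + gamma * q_next.max(axis=1) * (1.0 - done)
print(target)  # [1.495, 0.0]; terminal transitions keep only the reward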
Beispiel #38
0
    def __init__(self,n_hidden,embedding_dimention=50,feature_dimention=61):

        ## n_in: input dimension of the sequence LSTM
        ## n_hidden: hidden-layer dimension of the LSTMs for the candidate and ZP

        #repre_active = ReLU
        repre_active = linear

        self.params = []

        self.zp_x_pre = T.matrix("zp_x_pre")
        self.zp_x_post = T.matrix("zp_x_post")
        
        zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre)
        self.params += zp_nn_pre.params
        
        zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post)
        self.params += zp_nn_post.params

        danwei = theano.shared(np.eye(8, dtype=theano.config.floatX))

        H_pre = zp_nn_pre.all_hidden
        H_post = zp_nn_post.all_hidden

        Ws1_pre,heihei = init_weight(n_hidden,n_hidden,pre="Ws1_pre_zp",ones=False)
        Ws2_pre,heihei = init_weight(8,n_hidden,pre="Ws2_pre_zp",ones=False)
        self.params += [Ws1_pre,Ws2_pre]

        A_pre = softmax(T.dot(Ws2_pre,T.dot(Ws1_pre,T.transpose(H_pre))))

        P_pre = T.dot(A_pre,T.transpose(A_pre))-danwei
        f_norm_pre = (P_pre**2).sum()
        zp_out_pre = T.mean(T.dot(A_pre,H_pre),axis=0)

        Ws1_post,heihei = init_weight(n_hidden,n_hidden,pre="Ws1_post_zp",ones=False)
        Ws2_post,heihei = init_weight(8,n_hidden,pre="Ws2_post_zp",ones=False)
        self.params += [Ws1_post,Ws2_post]
        A_post = softmax(T.dot(Ws2_post,T.dot(Ws1_post,T.transpose(H_post))))

        P_post = T.dot(A_post,T.transpose(A_post))-danwei
        f_norm_post = (P_post**2).sum()
        zp_out_post = T.mean(T.dot(A_post,H_post),axis=0)

        f_norm = f_norm_pre + f_norm_post

        #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))
        self.zp_out = T.concatenate((zp_out_pre,zp_out_post))

        self.zp_out_output = self.zp_out

        ### get sequence output for NP ###
        self.np_x_post = T.tensor3("np_x")
        self.np_x_postc = T.tensor3("np_x")

        self.np_x_pre = T.tensor3("np_x")
        self.np_x_prec = T.tensor3("np_x")

        self.mask_pre = T.matrix("mask")
        self.mask_prec = T.matrix("mask")

        self.mask_post = T.matrix("mask")
        self.mask_postc = T.matrix("mask")
    
        self.np_nn_pre = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_pre,self.np_x_prec,self.mask_pre,self.mask_prec)
        self.params += self.np_nn_pre.params
        self.np_nn_post = sub_LSTM_batch(embedding_dimention,n_hidden,self.np_x_post,self.np_x_postc,self.mask_post,self.mask_postc)
        self.params += self.np_nn_post.params

        self.np_nn_post_output = self.np_nn_post.nn_out
        self.np_nn_pre_output = self.np_nn_pre.nn_out

        self.np_out = T.concatenate((self.np_nn_post_output,self.np_nn_pre_output),axis=1)

        #np_nn_f = LSTM(n_hidden*2,n_hidden*2,self.np_out)
        #self.params += np_nn_f.params
        #np_nn_b = LSTM(n_hidden*2,n_hidden*2,self.np_out[::-1])
        #self.params += np_nn_b.params

        #self.bi_np_out = T.concatenate((np_nn_f.all_hidden,np_nn_b.all_hidden[::-1]),axis=1)

        #self.np_out_output = self.bi_np_out
        #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output])

        self.feature = T.matrix("feature")
        self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active) 
        self.params += self.feature_layer.params

        w_attention_zp,b_attention = init_weight(n_hidden*2,1,pre="attention_zp",ones=False) 
        self.params += [w_attention_zp,b_attention]

        w_attention_np,b_u = init_weight(n_hidden*2,1,pre="attention_np",ones=False) 
        self.params += [w_attention_np]

        #w_attention_np_rnn,b_u = init_weight(n_hidden*4,1,pre="attention_np_rnn",ones=False) 
        #self.params += [w_attention_np_rnn]

        w_attention_feature,b_u = init_weight(n_hidden,1,pre="attention_feature",ones=False) 
        self.params += [w_attention_feature]

        self.calcu_attention = tanh(T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + T.dot(self.feature_layer.output,w_attention_feature) + b_attention)
        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)

        self.attention = softmax(T.transpose(self.calcu_attention,axes=(1,0)))[0]

        self.out = self.attention

        self.get_out = theano.function(inputs=[self.zp_x_pre,self.zp_x_post,self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature],outputs=[self.out],on_unused_input='warn')
        
        l2_norm_squared = sum([(w**2).sum() for w in self.params])
        l1_norm = sum([(abs(w)).sum() for w in self.params])

        lmbda_l1 = 0.0
        #lmbda_l2 = 0.001
        lmbda_l2 = 0.0

        t = T.bvector()
        cost = -(T.log((self.out*t).sum())) + f_norm

        lr = T.scalar()
        
        updates = lasagne.updates.sgd(cost, self.params, lr)
        #updates = lasagne.updates.adadelta(cost, self.params)

        
        self.train_step = theano.function(
            inputs=[self.zp_x_pre,self.zp_x_post,self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc,self.feature,t,lr],
            outputs=[cost],
            on_unused_input='warn',
            updates=updates)
Beispiel #39
0
    def __init__(
            self,
            rng,
            batchsize=100,
            activation=relu
    ):

        import char_load
        (num_sent, char_cnt, word_cnt, max_word_len, max_sen_len, \
         k_chr, k_wrd, x_chr, x_wrd, y) = char_load.read("tweets_clean.txt")

        dim_word = 30
        dim_char = 5
        cl_word = 300
        cl_char = 50
        k_word = k_wrd
        k_char = k_chr

        data_train_word, \
        data_test_word, \
        data_train_char, \
        data_test_char, \
        target_train, \
        target_test \
            = train_test_split(x_wrd, x_chr, y, random_state=1234, test_size=0.1)

        x_train_word = theano.shared(np.asarray(data_train_word, dtype='int16'), borrow=True)
        x_train_char = theano.shared(np.asarray(data_train_char, dtype='int16'), borrow=True)
        y_train = theano.shared(np.asarray(target_train, dtype='int8'), borrow=True)
        x_test_word = theano.shared(np.asarray(data_test_word, dtype='int16'), borrow=True)
        x_test_char = theano.shared(np.asarray(data_test_char, dtype='int16'), borrow=True)
        y_test = theano.shared(np.asarray(target_test, dtype='int8'), borrow=True)

        self.n_train_batches = x_train_word.get_value(borrow=True).shape[0] / batchsize
        self.n_test_batches = x_test_word.get_value(borrow=True).shape[0] / batchsize

        """symbol definition"""
        index = T.iscalar()
        x_wrd = T.wmatrix('x_wrd')
        x_chr = T.wtensor3('x_chr')
        y = T.bvector('y')
        train = T.iscalar('train')

        """network definition"""
        layer_char_embed_input = x_chr  # .reshape((batchsize, max_sen_len, max_word_len))

        layer_char_embed = EmbedIDLayer(
            rng,
            layer_char_embed_input,
            n_input=char_cnt,
            n_output=dim_char
        )

        layer1_input = layer_char_embed.output.reshape(
            (batchsize * max_sen_len, 1, max_word_len, dim_char)
        )

        layer1 = ConvolutionalLayer(
            rng,
            layer1_input,
            filter_shape=(cl_char, 1, k_char, dim_char),  # cl_char: number of filters
            image_shape=(batchsize * max_sen_len, 1, max_word_len, dim_char)
        )

        layer2 = MaxPoolingLayer(
            layer1.output,
            poolsize=(max_word_len - k_char + 1, 1)
        )

        layer_word_embed_input = x_wrd  # .reshape((batchsize, max_sen_len))

        layer_word_embed = EmbedIDLayer(
            rng,
            layer_word_embed_input,
            n_input=word_cnt,
            n_output=dim_word
        )

        layer3_word_input = layer_word_embed.output.reshape((batchsize, 1, max_sen_len, dim_word))
        layer3_char_input = layer2.output.reshape((batchsize, 1, max_sen_len, cl_char))

        layer3_input = T.concatenate(
            [layer3_word_input,
             layer3_char_input],
            axis=3
        )  # .reshape((batchsize, 1, max_sen_len, dim_word+cl_char))

        layer3 = ConvolutionalLayer(
            rng,
            layer3_input,
            filter_shape=(cl_word, 1, k_word, dim_word + cl_char),  # 1 is the number of input channels
            image_shape=(batchsize, 1, max_sen_len, dim_word + cl_char),
            activation=activation
        )

        layer4 = MaxPoolingLayer(
            layer3.output,
            poolsize=(max_sen_len - k_word + 1, 1)
        )

        layer5_input = layer4.output.reshape((batchsize, cl_word))

        layer5 = FullyConnectedLayer(
            rng,
            dropout(rng, layer5_input, train),
            n_input=cl_word,
            n_output=50,
            activation=activation
        )

        layer6_input = layer5.output

        layer6 = FullyConnectedLayer(
            rng,
            dropout(rng, layer6_input, train, p=0.1),
            n_input=50,
            n_output=2,
            activation=None
        )

        result = Result(layer6.output, y)
        loss = result.negative_log_likelihood()
        accuracy = result.accuracy()
        params = layer6.params \
                 + layer5.params \
                 + layer3.params \
                 + layer_word_embed.params \
                 + layer1.params \
                 + layer_char_embed.params
        updates = RMSprop(learning_rate=0.001, params=params).updates(loss)

        self.train_model = theano.function(
            inputs=[index],
            outputs=[loss, accuracy],
            updates=updates,
            givens={
                x_wrd: x_train_word[index * batchsize: (index + 1) * batchsize],
                x_chr: x_train_char[index * batchsize: (index + 1) * batchsize],
                y: y_train[index * batchsize: (index + 1) * batchsize],
                train: np.cast['int32'](1)
            }
        )

        self.test_model = theano.function(
            inputs=[index],
            outputs=[loss, accuracy],
            givens={
                x_wrd: x_test_word[index * batchsize: (index + 1) * batchsize],
                x_chr: x_test_char[index * batchsize: (index + 1) * batchsize],
                y: y_test[index * batchsize: (index + 1) * batchsize],
                train: np.cast['int32'](0)
            }
        )
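
Both functions above follow the standard Theano minibatch pattern: the full dataset sits in shared variables on the device and `givens` substitutes an index-selected slice for each symbolic input. A minimal, self-contained sketch of the same pattern with a toy logistic model (all names and sizes here are illustrative assumptions):

import numpy as np
import theano
import theano.tensor as T

batchsize = 4
data = theano.shared(np.random.randn(20, 3).astype(theano.config.floatX), borrow=True)
labels = theano.shared(np.random.randint(0, 2, size=20).astype('int8'), borrow=True)

index = T.iscalar()
x = T.matrix('x')
y = T.bvector('y')
w = theano.shared(np.zeros((3,), dtype=theano.config.floatX))
p = T.nnet.sigmoid(T.dot(x, w))
loss = T.nnet.binary_crossentropy(p, y).mean()
lr = np.asarray(0.1, dtype=theano.config.floatX)
updates = [(w, w - lr * T.grad(loss, w))]

train = theano.function(
    inputs=[index],
    outputs=loss,
    updates=updates,
    givens={
        x: data[index * batchsize: (index + 1) * batchsize],
        y: labels[index * batchsize: (index + 1) * batchsize],
    })

for i in range(20 // batchsize):
    train(i)
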
Beispiel #40
0
    def __init__(self, n_hidden, embedding_dimention=50):

        ##n_in: input dimension of the sequence lstm
        ##n_hidden: hidden dimension of the lstm for candi and zp
        ##n_hidden_sequence: hidden dimension of the sequence lstm; because it is dotted with the zp representation,
        ##                   its dimension must be twice n_hidden, i.e. n_hidden_sequence = 2 * n_hidden
        self.params = []

        self.zp_x_pre = T.matrix("zp_x_pre")
        self.zp_x_post = T.matrix("zp_x_post")

        #self.zp_x_pre_dropout = _dropout_from_layer(self.zp_x_pre)
        #self.zp_x_post_dropout = _dropout_from_layer(self.zp_x_post)

        zp_nn_pre = GRU(embedding_dimention, n_hidden, self.zp_x_pre)
        #zp_nn_pre = LSTM(embedding_dimention,n_hidden,self.zp_x_pre_dropout)
        self.params += zp_nn_pre.params

        zp_nn_post = GRU(embedding_dimention, n_hidden, self.zp_x_post)
        #zp_nn_post = LSTM(embedding_dimention,n_hidden,self.zp_x_post_dropout)
        self.params += zp_nn_post.params

        self.zp_out = T.concatenate((zp_nn_pre.nn_out, zp_nn_post.nn_out))

        self.ZP_layer = Layer(n_hidden * 2, n_hidden * 2, self.zp_out, ReLU)

        self.zp_out_output = self.ZP_layer.output

        #self.zp_out_dropout = _dropout_from_layer(T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out)))

        self.get_zp_out = theano.function(
            inputs=[self.zp_x_pre, self.zp_x_post],
            outputs=[self.ZP_layer.output])

        ### get sequence output for NP ###
        self.np_x = T.tensor3("np_x")
        self.np_x_post = T.tensor3("np_x")
        self.np_x_pre = T.tensor3("np_x")

        #self.np_x_dropout = _dropout_from_layer(self.np_x)

        self.mask = T.matrix("mask")
        self.mask_pre = T.matrix("mask")
        self.mask_post = T.matrix("mask")

        self.np_nn_x = RNN_batch(embedding_dimention, n_hidden, self.np_x,
                                 self.mask)
        self.params += self.np_nn_x.params
        self.np_nn_pre = GRU_batch(embedding_dimention, n_hidden,
                                   self.np_x_pre, self.mask_pre)
        self.params += self.np_nn_pre.params
        self.np_nn_post = GRU_batch(embedding_dimention, n_hidden,
                                    self.np_x_post, self.mask_post)
        self.params += self.np_nn_post.params

        #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x,self.mask)
        #self.np_nn_out = LSTM_batch(embedding_dimention,n_hidden*2,self.np_x_dropout,self.mask)
        #self.params += self.np_nn_out.params

        #self.np_out = self.np_nn.nn_out
        self.np_nn_x_output = (self.np_nn_x.all_hidden).mean(axis=1)
        self.np_nn_post_output = self.np_nn_post.nn_out
        self.np_nn_pre_output = self.np_nn_pre.nn_out

        self.np_out = T.concatenate(
            (self.np_nn_x_output, self.np_nn_post_output,
             self.np_nn_pre_output),
            axis=1)

        self.NP_layer = Layer(n_hidden * 3, n_hidden * 2, self.np_out, ReLU)

        self.np_out_output = self.NP_layer.output

        self.np_x_head = T.transpose(self.np_x, axes=(1, 0, 2))[-1]

        self.get_np_head = theano.function(inputs=[self.np_x],
                                           outputs=[self.np_x_head])
        self.get_np = theano.function(inputs=[
            self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre,
            self.mask_post
        ],
                                      outputs=[self.np_out])
        self.get_np_out = theano.function(inputs=[
            self.np_x, self.np_x_pre, self.np_x_post, self.mask, self.mask_pre,
            self.mask_post
        ],
                                          outputs=[self.np_out_output])

        w_attention_zp, b_attention = init_weight(n_hidden * 2,
                                                  1,
                                                  pre="attention_hidden",
                                                  ones=False)
        self.params += [w_attention_zp, b_attention]

        w_attention_np, b_u = init_weight(n_hidden * 2,
                                          1,
                                          pre="attention_zp",
                                          ones=False)
        self.params += [w_attention_np]

        self.calcu_attention = tanh(
            T.dot(self.np_out_output, w_attention_np) +
            T.dot(self.zp_out_output, w_attention_zp) + b_attention)
        self.attention = softmax(T.transpose(self.calcu_attention,
                                             axes=(1, 0)))[0]
        self.get_attention = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
            self.np_x_post, self.mask, self.mask_pre, self.mask_post
        ],
                                             outputs=[self.attention])

        new_zp = T.sum(self.attention[:, None] * self.np_x_head, axis=0)
        self.get_new_zp = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
            self.np_x_post, self.mask, self.mask_pre, self.mask_post
        ],
                                          outputs=[new_zp])

        #### *** HOP *** ####
        self.w_hop_zp, self.b_hop_zp = init_weight(n_hidden * 2 +
                                                   embedding_dimention,
                                                   n_hidden * 2,
                                                   pre="hop_")
        self.params += [self.w_hop_zp, self.b_hop_zp]

        ## hop 1 ##
        self.zp_hop_1_init = T.concatenate(
            (zp_nn_pre.nn_out, zp_nn_post.nn_out, new_zp))
        self.zp_hop_1 = ReLU(
            T.dot(self.zp_hop_1_init, self.w_hop_zp) + self.b_hop_zp)

        self.calcu_attention_hop_1 = tanh(
            T.dot(self.np_out_output, w_attention_np) +
            T.dot(self.zp_hop_1, w_attention_zp) + b_attention)
        self.attention_hop_1 = softmax(
            T.transpose(self.calcu_attention_hop_1, axes=(1, 0)))[0]
        self.get_attention_hop_1 = theano.function(
            inputs=[
                self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
                self.np_x_post, self.mask, self.mask_pre, self.mask_post
            ],
            outputs=[self.attention_hop_1])

        self.out = self.attention_hop_1

        self.get_out = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
            self.np_x_post, self.mask, self.mask_pre, self.mask_post
        ],
                                       outputs=[self.out])

        l1_norm = sum([(abs(w)).sum() for w in self.params])
        l2_norm_squared = sum([(w**2).sum() for w in self.params])

        lmbda_l1 = 0.0
        #lmbda_l2 = 0.001
        lmbda_l2 = 0.0

        t = T.bvector()
        cost = -(T.log((self.out * t).sum()))
        #cost = -(T.log((self.out_dropout*t).sum()))
        #cost = 1-((self.out*t).sum())

        lr = T.scalar()
        #grads = T.grad(cost, self.params)
        #updates = [(param, param-lr*grad)
        #    for param, grad in zip(self.params, grads)]

        #updates = lasagne.updates.sgd(cost, self.params, lr)
        updates = lasagne.updates.adadelta(cost, self.params)

        self.train_step = theano.function(inputs=[
            self.zp_x_pre, self.zp_x_post, self.np_x, self.np_x_pre,
            self.np_x_post, self.mask, self.mask_pre, self.mask_post, t, lr
        ],
                                          outputs=[cost],
                                          on_unused_input='warn',
                                          updates=updates)
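
The hop mechanism above pools the candidate head vectors with the current attention weights, concatenates the pooled vector with the zero-pronoun representation, projects it through a ReLU layer, and re-scores the candidates. A minimal numpy sketch of one such hop, with simplified weights and illustrative dimensions:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

n_cand, d_head, d_zp = 5, 6, 8
np_x_head = np.random.randn(n_cand, d_head)    # last word of each candidate NP
zp = np.random.randn(d_zp)                     # zero-pronoun representation
cand = np.random.randn(n_cand, d_zp)           # candidate representations

w_att_np = np.random.randn(d_zp)
w_att_zp = np.random.randn(d_zp)
b_att = 0.1
scores = np.tanh(cand.dot(w_att_np) + zp.dot(w_att_zp) + b_att)   # hop-0 scores
att = softmax(scores)

new_zp = (att[:, None] * np_x_head).sum(axis=0)                   # attention-pooled head vector
w_hop = np.random.randn(d_zp + d_head, d_zp)
zp_hop_1 = np.maximum(0.0, np.concatenate([zp, new_zp]).dot(w_hop))  # ReLU projection

scores_1 = np.tanh(cand.dot(w_att_np) + zp_hop_1.dot(w_att_zp) + b_att)  # hop-1 scores
att_1 = softmax(scores_1)
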
Beispiel #41
0
    def build_loss(self, env_spec, policy):
        obs = env_spec.observation_space.new_tensor_variable('obs',
                                                             extra_dims=1)
        next_obs = env_spec.observation_space.new_tensor_variable('next_obs',
                                                                  extra_dims=1)
        act = env_spec.action_space.new_tensor_variable('act', extra_dims=1)
        ret = T.vector('disc_n_return')
        term = T.bvector('terminal')
        if self.prioritized_replay:
            isw = T.vector('importance_sample_weights')

        z_np = np.linspace(self.V_min,
                           self.V_max,
                           policy.n_atoms,
                           dtype=theano.config.floatX)
        z = theano.shared(z_np)
        z_contracted = theano.shared(
            (self.discount**self.reward_horizon) * z_np)
        policy.incorporate_z(
            z)  # (policy sets n_atoms, but algo sets vmin,vmax)
        delta_z = (self.V_max - self.V_min) / (policy.n_atoms - 1)

        # This is hard to read and to verify by inspection.
        # (it was tested against a numpy loop and a numpy vectorized form in another script)
        z_contracted_bc = z_contracted.dimshuffle('x', 0)  # (bc: broadcast)
        z_cntrct_term = (1 - term.dimshuffle(0, 'x')) * z_contracted_bc
        # z_cntrct_term is 2D tensor, with contracted z-values repeated for
        # each data point (each row), and zero'd wherever terminal is True
        ret_bc = ret.dimshuffle(0, 'x')
        z_next = T.clip(ret_bc + z_cntrct_term, self.V_min, self.V_max)
        # each row (data entry) in z_next had all z_values shifted by
        # corresponding return
        # must compare every pair of base z atom with next z atom
        z_next_bc = z_next.dimshuffle(0, 1, 'x')
        z_bc = z.dimshuffle('x', 'x', 0)
        abs_diff_on_delta = abs(z_next_bc - z_bc) / delta_z
        projection_coeffs = T.clip(1 - abs_diff_on_delta, 0, 1)  # (mostly 0's)
        # projection coefficients is a 3-D tensor.
        # dim-0: independent data entries (gets scanned/looped over in batched_dot)
        # dim-1: corresponds to z_next atoms (gets summed over in batched_dot)
        # dim-2: corresponds to base z atoms (becomes dim-1 after batched_dot)

        if self.double_dqn:
            next_act = policy.actions_sym(next_obs)
            next_Z = policy.target_Z_at_a_sym(next_obs, next_act)
        else:
            next_Z = policy.target_max_Z_sym(next_obs)
        # lower case z refers to the domain of atoms,
        # capital Z refers to the probabilities for given state and action
        # projected_next_Z = T.batched_dot(next_Z, projection_coeffs)
        # NOTE: use of batched_dot somehow breaks the gradient (Theano 0.9);
        # so, do the broadcasting and summing manually (until Theano 1.0)
        next_Z_bc = T.shape_padright(next_Z)
        next_Z_x_coeff = projection_coeffs * next_Z_bc
        projected_next_Z = next_Z_x_coeff.sum(axis=1)

        predicted_Z = policy.Z_at_a_sym(obs, act)
        predicted_Z = T.clip(predicted_Z, 1e-6, 1)  # (NaN-guard)
        losses = -T.sum(projected_next_Z * T.log(predicted_Z),
                        axis=1)  # CrossEnt

        if self.prioritized_replay:
            losses = isw * losses
        loss = T.mean(losses)

        projected_next_Z = T.clip(projected_next_Z, 1e-6, 1)  # (NaN-guard)
        KL_divs = T.sum(
            projected_next_Z * T.log(projected_next_Z / predicted_Z),
            axis=1,
        )
        KL_divs = T.clip(KL_divs, 1e-6, 1e6)  # avoid < 0 from NaN-guard

        input_list = [obs, next_obs, act, ret, term]
        if self.prioritized_replay:
            input_list.append(isw)

        return input_list, loss, KL_divs
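
The projection commented on above is the categorical (C51-style) projection: the contracted and return-shifted atoms are mapped back onto the fixed support with coefficients clip(1 - |z_next - z| / delta_z, 0, 1), and the next-state distribution is pushed through them. A minimal numpy sketch with illustrative sizes (each projected row should still sum to 1):

import numpy as np

V_min, V_max, n_atoms = -1.0, 1.0, 5
z = np.linspace(V_min, V_max, n_atoms)
delta_z = (V_max - V_min) / (n_atoms - 1)

discount, horizon = 0.99, 3
ret = np.array([0.2])                          # discounted n-step return (batch of 1)
term = np.array([0])                           # terminal flag

z_contracted = (discount ** horizon) * z
z_next = np.clip(ret[:, None] + (1 - term[:, None]) * z_contracted[None, :], V_min, V_max)

# projection coefficients: (batch, next_atom, base_atom)
abs_diff = np.abs(z_next[:, :, None] - z[None, None, :]) / delta_z
coeffs = np.clip(1.0 - abs_diff, 0.0, 1.0)

next_Z = np.full((1, n_atoms), 1.0 / n_atoms)  # next-state probabilities (uniform here)
projected = (coeffs * next_Z[:, :, None]).sum(axis=1)
print(projected.sum(axis=1))                   # each row sums to ~1
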
Beispiel #42
0
    def fit(self, data, test=None, sample_store=10000000):
        '''
        Trains the network.

        Parameters
        --------
        data : pandas.DataFrame
            Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
            It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
        sample_store : int
            If additional negative samples are used (n_sample > 0), GPU utilization can be improved by precomputing a large batch of negative samples (and recomputing it when necessary).
            This parameter regulates the size of the precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are kept in RAM.
            For the most efficient computation, a balance must be found between storing few samples and interrupting GPU computation often but briefly vs. storing many samples and interrupting it rarely but for longer.

        '''
        self.predict = None
        self.error_during_train = False
        itemids = data[self.item_key].unique()
        self.n_items = len(itemids)
        self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids)
        data = pd.merge(data,
                        pd.DataFrame({
                            self.item_key: itemids,
                            'ItemIdx': self.itemidmap[itemids].values
                        }),
                        on=self.item_key,
                        how='inner')
        offset_sessions = self.init_data(data)
        if self.n_sample:
            pop = data.groupby(self.item_key).size()
            pop = pop[self.itemidmap.index.values].values**self.sample_alpha
            pop = pop.cumsum() / pop.sum()
            pop[-1] = 1
            if sample_store:
                generate_length = sample_store // self.n_sample
                if generate_length <= 1:
                    sample_store = 0
                    print('No example store was used')
                else:
                    neg_samples = self.generate_neg_samples(
                        pop, generate_length)
                    sample_pointer = 0
            else:
                print('No example store was used')
        X = T.ivector()
        Y = T.ivector()
        M = T.iscalar()
        R = T.bvector()
        H_new, Y_pred, sparams, full_params, sidxs = self.model(
            X, self.H, M, R, Y, self.dropout_p_hidden, self.dropout_p_embed)
        cost = (M / self.batch_size) * self.loss_function(Y_pred, M)
        params = [
            self.Wx if self.embedding or self.constrained_embedding else
            self.Wx[1:], self.Wh, self.Wrz, self.Bh
        ]
        updates = self.RMSprop(cost, params, full_params, sparams, sidxs)
        for i in range(len(self.H)):
            updates[self.H[i]] = H_new[i]
        train_function = function(inputs=[X, Y, M, R],
                                  outputs=cost,
                                  updates=updates,
                                  allow_input_downcast=True)
        base_order = np.argsort(
            data.groupby(self.session_key)[self.time_key].min().values
        ) if self.time_sort else np.arange(len(offset_sessions) - 1)
        data_items = data.ItemIdx.values
        for epoch in range(self.n_epochs):
            sc = time.clock()
            st = time.time()
            for i in range(len(self.layers)):
                self.H[i].set_value(np.zeros((self.batch_size, self.layers[i]),
                                             dtype=theano.config.floatX),
                                    borrow=True)
            c = []
            cc = []
            session_idx_arr = np.random.permutation(
                len(offset_sessions) -
                1) if self.train_random_order else base_order
            iters = np.arange(self.batch_size)
            maxiter = iters.max()
            start = offset_sessions[session_idx_arr[iters]]
            end = offset_sessions[session_idx_arr[iters] + 1]
            finished = False
            while not finished:
                minlen = (end - start).min()
                out_idx = data_items[start]
                for i in range(minlen - 1):
                    in_idx = out_idx
                    out_idx = data_items[start + i + 1]
                    if self.n_sample:
                        if sample_store:
                            if sample_pointer == generate_length:
                                neg_samples = self.generate_neg_samples(
                                    pop, generate_length)
                                sample_pointer = 0
                            sample = neg_samples[sample_pointer]
                            sample_pointer += 1
                        else:
                            sample = self.generate_neg_samples(pop, 1)
                        y = np.hstack([out_idx, sample])
                    else:
                        y = out_idx
                        if self.n_sample:
                            if sample_pointer == generate_length:
                                generate_samples()
                                sample_pointer = 0
                            sample_pointer += 1
                    reset = (start + i + 1 == end - 1)
                    cost = train_function(in_idx, y, len(iters), reset)
                    c.append(cost)
                    cc.append(len(iters))
                    if np.isnan(cost):
                        print(str(epoch) + ': NaN error!')
                        self.error_during_train = True
                        return
                start = start + minlen - 1
                finished_mask = (end - start <= 1)
                n_finished = finished_mask.sum()
                iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
                maxiter += n_finished
                valid_mask = (iters < len(offset_sessions) - 1)
                n_valid = valid_mask.sum()
                if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0):
                    finished = True
                    break
                mask = finished_mask & valid_mask
                sessions = session_idx_arr[iters[mask]]
                start[mask] = offset_sessions[sessions]
                end[mask] = offset_sessions[sessions + 1]
                iters = iters[valid_mask]
                start = start[valid_mask]
                end = end[valid_mask]
                if n_valid < len(valid_mask):
                    for i in range(len(self.H)):
                        tmp = self.H[i].get_value(borrow=True)
                        tmp = tmp[valid_mask]
                        self.H[i].set_value(tmp, borrow=True)
            c = np.array(c)
            cc = np.array(cc)
            avgc = np.sum(c * cc) / np.sum(cc)
            if np.isnan(avgc):
                print('Epoch {}: NaN error!'.format(str(epoch)))
                self.error_during_train = True
                return
            print('Epoch{}\tloss: {:.6f}'.format(epoch, avgc), 'time: ',
                  (time.clock() - sc), 'c / ', (time.time() - st), 's')
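
The negative-sample store that `fit` precomputes is drawn from the cumulative popularity distribution `pop` built above (popularity raised to sample_alpha, then normalized). The original generate_neg_samples implementation is not shown in this snippet; a common way to draw such samples is inverse-transform sampling with searchsorted, sketched below with illustrative counts:

import numpy as np

def draw_neg_samples(cum_pop, n_sample, length):
    # cum_pop: cumulative, monotonically increasing, cum_pop[-1] == 1
    u = np.random.rand(length, n_sample)
    return np.searchsorted(cum_pop, u).astype('int32')   # sampled item indices

counts = np.array([50.0, 30.0, 15.0, 5.0])     # per-item popularity (illustrative)
sample_alpha = 0.75
pop = counts ** sample_alpha
pop = pop.cumsum() / pop.sum()
pop[-1] = 1.0
neg_samples = draw_neg_samples(pop, n_sample=2, length=4)
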
Beispiel #43
0
    def _init_model(self, in_size, out_size, slot_sizes, db, \
            n_hid=10, learning_rate_sl=0.005, learning_rate_rl=0.005, batch_size=32, ment=0.1, \
            inputtype='full', sl='e2e', rl='e2e'):
        self.in_size = in_size
        self.out_size = out_size
        self.slot_sizes = slot_sizes
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid
        self.r_hid = self.n_hid
        self.sl = sl
        self.rl = rl

        table = db.table
        counts = db.counts
        m_unk = [db.inv_counts[s][-1] for s in dialog_config.inform_slots]
        prior = [db.priors[s] for s in dialog_config.inform_slots]
        unknown = [db.unks[s] for s in dialog_config.inform_slots]
        ids = [db.ids[s] for s in dialog_config.inform_slots]

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.bmatrix('tm'), \
                T.btensor3('am'), T.fvector('r')
        T_var, N_var = T.as_tensor_variable(table), T.as_tensor_variable(counts)
        db_index_var = T.imatrix('db')
        db_index_switch = T.bvector('s')

        l_mask_in = L.InputLayer(shape=(None,None), input_var=turn_mask)
        flat_mask = T.reshape(turn_mask, (turn_mask.shape[0]*turn_mask.shape[1],1))

        def _smooth(p):
            p_n = p+EPS
            return p_n/(p_n.sum(axis=1)[:,np.newaxis])

        def _add_unk(p,m,N):
            # p: B x V, m- num missing, N- total, p0: 1 x V
            t_unk = T.as_tensor_variable(float(m)/N)
            ps = p*(1.-t_unk)
            return T.concatenate([ps, T.tile(t_unk, (ps.shape[0],1))], axis=1)

        def kl_divergence(p,q):
            p_n = _smooth(p)
            return -T.sum(q*T.log(p_n), axis=1)

        # belief tracking
        l_in = L.InputLayer(shape=(None,None,self.in_size), input_var=input_var)
        p_vars = []
        pu_vars = []
        phi_vars = []
        p_targets = []
        phi_targets = []
        hid_in_vars = []
        hid_out_vars = []
        bt_loss = T.as_tensor_variable(0.)
        kl_loss = []
        x_loss = []
        self.trackers = []
        for i,s in enumerate(dialog_config.inform_slots):
            hid_in = T.fmatrix('h')
            l_rnn = L.GRULayer(l_in, self.r_hid, hid_init=hid_in,  \
                    mask_input=l_mask_in,
                    grad_clipping=10.) # B x H x D
            l_b_in = L.ReshapeLayer(l_rnn, 
                    (input_var.shape[0]*input_var.shape[1], self.r_hid)) # BH x D
            hid_out = L.get_output(l_rnn)[:,-1,:]

            p_targ = T.ftensor3('p_target_'+s)
            p_t = T.reshape(p_targ, 
                    (p_targ.shape[0]*p_targ.shape[1],self.slot_sizes[i]))
            phi_targ = T.fmatrix('phi_target'+s)
            phi_t = T.reshape(phi_targ, (phi_targ.shape[0]*phi_targ.shape[1], 1))

            l_b = L.DenseLayer(l_b_in, self.slot_sizes[i], 
                    nonlinearity=lasagne.nonlinearities.softmax)
            l_phi = L.DenseLayer(l_b_in, 1, 
                    nonlinearity=lasagne.nonlinearities.sigmoid)

            phi = T.clip(L.get_output(l_phi), 0.01, 0.99)
            p = L.get_output(l_b)
            p_u = _add_unk(p, m_unk[i], db.N)
            kl_loss.append(T.sum(flat_mask.flatten()*kl_divergence(p, p_t))/T.sum(flat_mask))
            x_loss.append(T.sum(flat_mask*lasagne.objectives.binary_crossentropy(phi,phi_t))/
                    T.sum(flat_mask))
            bt_loss += kl_loss[-1] + x_loss[-1]

            p_vars.append(p)
            pu_vars.append(p_u)
            phi_vars.append(phi)
            p_targets.append(p_targ)
            phi_targets.append(phi_targ)
            hid_in_vars.append(hid_in)
            hid_out_vars.append(hid_out)
            self.trackers.append(l_b)
            self.trackers.append(l_phi)
        self.bt_params = L.get_all_params(self.trackers)

        def check_db(pv, phi, Tb, N):
            O = T.alloc(0.,pv[0].shape[0],Tb.shape[0]) # BH x T.shape[0]
            for i,p in enumerate(pv):
                p_dc = T.tile(phi[i], (1, Tb.shape[0]))
                O += T.log(p_dc*(1./db.table.shape[0]) + \
                        (1.-p_dc)*(p[:,Tb[:,i]]/N[np.newaxis,:,i]))
            Op = T.exp(O)#+EPS # BH x T.shape[0]
            Os = T.sum(Op, axis=1)[:,np.newaxis] # BH x 1
            return Op/Os

        def entropy(p):
            p = _smooth(p)
            return -T.sum(p*T.log(p), axis=-1)

        def weighted_entropy(p,q,p0,unks,idd):
            w = T.dot(idd,q.transpose()) # Pi x BH
            u = p0[np.newaxis,:]*(q[:,unks].sum(axis=1)[:,np.newaxis]) # BH x Pi
            p_tilde = w.transpose()+u
            return entropy(p_tilde)

        p_db = check_db(pu_vars, phi_vars, T_var, N_var) # BH x T.shape[0]
        
        if inputtype=='entropy':
            H_vars = [weighted_entropy(pv,p_db,prior[i],unknown[i],ids[i]) \
                    for i,pv in enumerate(p_vars)]
            H_db = entropy(p_db)
            phv = [ph[:,0] for ph in phi_vars]
            t_in = T.stacklists(H_vars+phv+[H_db]).transpose() # BH x 2M+1
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x 2M+1
            l_in_pol = L.InputLayer(
                    shape=(None,None,2*len(dialog_config.inform_slots)+1), \
                    input_var=t_in_resh)
        else:
            in_reshaped = T.reshape(input_var, 
                    (input_var.shape[0]*input_var.shape[1], \
                    input_var.shape[2]))
            prev_act = in_reshaped[:,-len(dialog_config.inform_slots):]
            t_in = T.concatenate(pu_vars+phi_vars+[p_db,prev_act], 
                    axis=1) # BH x D-sum+A
            t_in_resh = T.reshape(t_in, (turn_mask.shape[0], turn_mask.shape[1], \
                    t_in.shape[1])) # B x H x D-sum
            l_in_pol = L.InputLayer(shape=(None,None,sum(self.slot_sizes)+ \
                    3*len(dialog_config.inform_slots)+ \
                    table.shape[0]), input_var=t_in_resh)

        pol_in = T.fmatrix('pol-h')
        l_pol_rnn = L.GRULayer(l_in_pol, n_hid, hid_init=pol_in, 
                mask_input=l_mask_in,
                grad_clipping=10.) # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:,-1,:]
        l_den_in = L.ReshapeLayer(l_pol_rnn, 
                (turn_mask.shape[0]*turn_mask.shape[1], n_hid)) # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, \
                nonlinearity=lasagne.nonlinearities.softmax) # BH x A

        self.network = l_out
        self.pol_params = L.get_all_params(self.network)
        self.params = self.bt_params + self.pol_params

        # db loss
        p_db_reshaped = T.reshape(p_db, (turn_mask.shape[0],turn_mask.shape[1],table.shape[0]))
        p_db_final = p_db_reshaped[:,-1,:] # B x T.shape[0]
        p_db_final = _smooth(p_db_final)
        ix = T.tile(T.arange(p_db_final.shape[0]),(db_index_var.shape[1],1)).transpose()
        sample_probs = p_db_final[ix,db_index_var] # B x K
        if dialog_config.SUCCESS_MAX_RANK==1:
            log_db_probs = T.log(sample_probs).sum(axis=1)
        else:
            cum_probs,_ = theano.scan(fn=lambda x, prev: x+prev, \
                    outputs_info=T.zeros_like(sample_probs[:,0]), \
                    sequences=sample_probs[:,:-1].transpose())
            cum_probs = T.clip(cum_probs.transpose(), 0., 1.-1e-5) # B x K-1
            log_db_probs = T.log(sample_probs).sum(axis=1) - T.log(1.-cum_probs).sum(axis=1) # B
        log_db_probs = log_db_probs * db_index_switch

        # rl
        probs = L.get_output(self.network) # BH x A
        probs = _smooth(probs)
        out_probs = T.reshape(probs, (turn_mask.shape[0],turn_mask.shape[1],self.out_size)) # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs*act_mask).sum(axis=2) # B x H
        ep_probs = (act_probs*turn_mask).sum(axis=1) # B
        H_probs = -T.sum(T.sum(out_probs*log_probs,axis=2),axis=1) # B
        self.act_loss = -T.mean(ep_probs*reward_var)
        self.db_loss = -T.mean(log_db_probs*reward_var)
        self.reg_loss = -T.mean(ment*H_probs)
        self.loss = self.act_loss + self.db_loss + self.reg_loss

        self.inps = [input_var, turn_mask, act_mask, reward_var, db_index_var, db_index_switch, \
                pol_in] + hid_in_vars
        self.obj_fn = theano.function(self.inps, self.loss, on_unused_input='warn')
        self.act_fn = theano.function([input_var,turn_mask,pol_in]+hid_in_vars, \
                [out_probs,p_db,pol_out]+pu_vars+phi_vars+hid_out_vars, on_unused_input='warn')
        self.debug_fn = theano.function(self.inps, [probs, p_db, self.loss], on_unused_input='warn')
        self._rl_train_fn(self.learning_rate)

        ## sl
        sl_loss = 0. + bt_loss - T.mean(ep_probs) 

        if self.sl=='e2e':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        elif self.sl=='bel':
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.bt_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)
        else:
            sl_updates = lasagne.updates.rmsprop(sl_loss, self.pol_params, \
                    learning_rate=learning_rate_sl, epsilon=1e-4)
            sl_updates_with_mom = lasagne.updates.apply_momentum(sl_updates)

        sl_inps = [input_var, turn_mask, act_mask, pol_in] + p_targets + phi_targets + hid_in_vars
        self.sl_train_fn = theano.function(sl_inps, [sl_loss]+kl_loss+x_loss, updates=sl_updates, \
                on_unused_input='warn')
        self.sl_obj_fn = theano.function(sl_inps, sl_loss, on_unused_input='warn')
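
The belief-tracking helpers `_smooth` and `_add_unk` above renormalize a slot distribution with a small epsilon and move a fraction m/N of the mass to an extra "unknown" column. A minimal numpy sketch of the same two operations, with an illustrative distribution:

import numpy as np

EPS = 1e-8

def smooth(p):
    p_n = p + EPS
    return p_n / p_n.sum(axis=1, keepdims=True)

def add_unk(p, m, N):
    # p: B x V belief over known slot values, m: DB entries with the value
    # missing, N: total number of DB entries
    t_unk = float(m) / N
    ps = p * (1.0 - t_unk)
    return np.concatenate([ps, np.full((p.shape[0], 1), t_unk)], axis=1)

p = np.array([[0.7, 0.2, 0.1]])
print(add_unk(p, m=2, N=10))   # [[0.56 0.16 0.08 0.2 ]]
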
Beispiel #44
0
    def __init__(self,
                 ne,
                 de,
                 cs,
                 nh,
                 nc,
                 L2_reg=0.0,
                 rng=np.random.RandomState()):
        self.nc = nc
        self.hiddenLayer = Layer(de * cs, nh, rng=rng)
        self.outputLayer = Layer(nh, nc)
        self.emb = theano.shared(
            rng.normal(loc=0.0, scale=0.01,
                       size=(ne, de)).astype(theano.config.floatX))
        A = rng.normal(loc=0.0, scale=0.01,
                       size=(nc, nc)).astype(theano.config.floatX)
        self.A = theano.shared(value=A, name='A', borrow=True)

        self.params = self.hiddenLayer.params + self.outputLayer.params + [
            self.emb, self.A
        ]
        self.names = ['Wh', 'bh', 'w', 'b', 'emb', 'A']

        idxs = T.imatrix('idxs')
        x = self.emb[idxs].reshape((idxs.shape[0], de * cs))
        y = T.bvector('y')
        ans = T.bvector('ans')

        INF = 1e9
        result, updates1 = theano.scan(fn=self.one_step,
                                       sequences=x,
                                       outputs_info=[
                                           theano.shared(0.0),
                                           theano.shared(-INF),
                                           theano.shared(-INF),
                                           theano.shared(-INF), None, None,
                                           None, None
                                       ])
        self.decode = theano.function(inputs=[idxs],
                                      outputs=result,
                                      updates=updates1)

        score, updates2 = theano.scan(fn=self.two_step,
                                      sequences=[
                                          x,
                                          dict(input=y, taps=[-1, 0]),
                                          dict(input=ans, taps=[-1, 0])
                                      ],
                                      outputs_info=theano.shared(0.0))

        cost = score[-1]
        gradients = T.grad(cost, self.params)
        lr = T.scalar('lr')
        for p, g in zip(self.params, gradients):
            updates2[p] = p + lr * g

        self.fit = theano.function(inputs=[idxs, y, ans, lr],
                                   outputs=cost,
                                   updates=updates2)
        self.normalize = theano.function(
            inputs=[],
            updates={
                self.emb:
                self.emb / T.sqrt(
                    (self.emb**2).sum(axis=1)).dimshuffle(0, 'x')
            })
Beispiel #45
0
 def inputs(self):
     return {
         "call_type": tensor.bvector("call_type"),
         "origin_call": tensor.ivector("origin_call"),
         "origin_stand": tensor.bvector("origin_stand"),
         "taxi_id": tensor.wvector("taxi_id"),
         "timestamp": tensor.ivector("timestamp"),
         "day_type": tensor.bvector("day_type"),
         "missing_data": tensor.bvector("missing_data"),
         "latitude": tensor.matrix("latitude"),
         "longitude": tensor.matrix("longitude"),
         "destination_latitude": tensor.vector("destination_latitude"),
         "destination_longitude": tensor.vector("destination_longitude"),
         "travel_time": tensor.ivector("travel_time"),
         "first_k_latitude": tensor.matrix("first_k_latitude"),
         "first_k_longitude": tensor.matrix("first_k_longitude"),
         "last_k_latitude": tensor.matrix("last_k_latitude"),
         "last_k_longitude": tensor.matrix("last_k_longitude"),
         "input_time": tensor.ivector("input_time"),
         "week_of_year": tensor.bvector("week_of_year"),
         "day_of_week": tensor.bvector("day_of_week"),
         "qhour_of_day": tensor.bvector("qhour_of_day"),
         "candidate_call_type": tensor.bvector("candidate_call_type"),
         "candidate_origin_call": tensor.ivector("candidate_origin_call"),
         "candidate_origin_stand": tensor.bvector("candidate_origin_stand"),
         "candidate_taxi_id": tensor.wvector("candidate_taxi_id"),
         "candidate_timestamp": tensor.ivector("candidate_timestamp"),
         "candidate_day_type": tensor.bvector("candidate_day_type"),
         "candidate_missing_data": tensor.bvector("candidate_missing_data"),
         "candidate_latitude": tensor.matrix("candidate_latitude"),
         "candidate_longitude": tensor.matrix("candidate_longitude"),
         "candidate_destination_latitude": tensor.vector("candidate_destination_latitude"),
         "candidate_destination_longitude": tensor.vector("candidate_destination_longitude"),
         "candidate_travel_time": tensor.ivector("candidate_travel_time"),
         "candidate_first_k_latitude": tensor.matrix("candidate_first_k_latitude"),
         "candidate_first_k_longitude": tensor.matrix("candidate_first_k_longitude"),
         "candidate_last_k_latitude": tensor.matrix("candidate_last_k_latitude"),
         "candidate_last_k_longitude": tensor.matrix("candidate_last_k_longitude"),
         "candidate_input_time": tensor.ivector("candidate_input_time"),
         "candidate_week_of_year": tensor.bvector("candidate_week_of_year"),
         "candidate_day_of_week": tensor.bvector("candidate_day_of_week"),
         "candidate_qhour_of_day": tensor.bvector("candidate_qhour_of_day"),
     }
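
The dtypes in this input dictionary follow feature cardinality: bvector is int8 (small categorical features such as day_of_week or qhour_of_day), wvector is int16 (e.g. taxi_id), and ivector is int32 (e.g. unix timestamps). A minimal sketch of declaring and feeding one such int8 input (the toy function is illustrative only):

import numpy as np
import theano
import theano.tensor as T

day_of_week = T.bvector('day_of_week')     # int8 is enough for values 0..6
f = theano.function([day_of_week], day_of_week.sum())
print(f(np.array([0, 1, 2, 6], dtype='int8')))
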
Beispiel #46
0
    def __init__(self,
            input,
            in_layer_shape,
            layer2_in = 1000,
            n_out = 11,
            use_adagrad = True,
            patch_size=64,
            activation=NeuralActivations.Rectifier,
            layer1_nout=11,
            exp_id=1,
            quiet=False,
            n_classes=11,
            save_file=None,
            mem_alloc="CPU",
            momentum=1.,
            enable_standardization=False,
            rng=None):
        """
        Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType

        :param input: symbolic variable that describes the input of the architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in which the datapoints lie.

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type in_layer_shape: list
        :param in_layer_shape: the shape of the first layer - format is :
            (no of patches, no of pixels per patch, no of batches, number of
            hidden units for locally connected hidden layer 1)

        :type layer2_in: list
        :param layer2_in: No of hidden units in the second hidden layer.

        :type shared_weights: use shared weights across the image
        :param shared_weights: boolean parameter to enable/disable the usage of
        shared weights

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in which
        the labels lie.
        """

        self.input = input

        if rng is None:
            rng = numpy.random.RandomState(1234)

        self.monitor = Monitor()

        self.learning_rate = 0.001

        self.exp_id = exp_id

        self.y = T.bvector('y')  # the labels are presented as a 1D vector of int8
        self.mem_alloc = mem_alloc

        self.rng = rng
        self.ds = Dataset()
        self.n_out = n_out
        self.momentum = momentum
        self.save_file = save_file

        self.in_layer_shape = in_layer_shape
        self.layer2_in = layer2_in
        self.patch_size = patch_size
        self.layer1_nout = layer1_nout

        self.locally_connected_layer = None
        self.fully_connected_layer = None
        self.activation = activation
        self.n_hiddens_layer2 = (self.layer1_nout * in_layer_shape[0], layer2_in)
        self.n_classes = n_classes
        self.state = "train"
        self.enable_standardization = enable_standardization

        self.out_dir = "out/"
        self.grads = []
        self.test_scores = []

        #Whether to turn on or off the messages.
        self.quiet = quiet
        self.test_set_x = None
        self.valid_set_x = None
        self.test_set_y = None
        self.valid_set_y = None

        self.setup_hidden_layers(activation, in_layer_shape,
        self.n_hiddens_layer2, n_out)

        self.use_adagrad = use_adagrad

        #Error for patches with object in it:
        self.obj_patch_error_percent = 0
Beispiel #47
0
# the W matrix of the inputVectors as used in [1]
targetEmbeddings = theano.shared(
    np.random.uniform(-1, 1, (vocabSize, embeddingSize)))
# the W' matrix of the outputVectors as used in [1]
contextEmbeddings = theano.shared(
    np.random.normal(scale=1.0 / np.sqrt(vocabSize),
                     size=(embeddingSize, vocabSize)))

# A |batchSize x 2| dimensional matrix holding (target, context) pairs for
# a batch (including -ve samples). This is the input to the training function.
targetContext = T.imatrix()

# the |batchSize x 1| vector of training labels (also an input to the training
# function): whether the context word matches the target word or not
isContext = T.bvector()

batchMatchScores = []

for i in range(batchSize):
    matchScore = T.dot(targetEmbeddings[targetContext[i][0], :],
                       contextEmbeddings[:, targetContext[i][1]])
    batchMatchScores.append(matchScore)

objective = isContext * T.log(T.nnet.sigmoid(batchMatchScores)) + (
    1 - isContext) * T.log(1 - T.nnet.sigmoid(batchMatchScores))

loss = -T.mean(objective)

# TRAINING FUNCTION
from lasagne.updates import nesterov_momentum
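
The per-pair match scores above are built with a Python loop over the batch; an equivalent vectorized formulation gathers the target rows and context columns in one step. A minimal, self-contained sketch of that formulation (sizes and names are illustrative assumptions, not taken from the snippet):

import numpy as np
import theano
import theano.tensor as T

vocab_size, emb_size = 100, 16
W_in = theano.shared(np.random.uniform(-1, 1, (vocab_size, emb_size)))
W_out = theano.shared(np.random.normal(scale=1.0 / np.sqrt(vocab_size),
                                       size=(emb_size, vocab_size)))

pairs = T.imatrix()          # batch x 2, (target, context) word ids
labels = T.bvector()         # 1 for a true context word, 0 for a negative sample

target_vecs = W_in[pairs[:, 0], :]        # batch x emb_size
context_vecs = W_out[:, pairs[:, 1]].T    # batch x emb_size
scores = (target_vecs * context_vecs).sum(axis=1)

objective = labels * T.log(T.nnet.sigmoid(scores)) + \
    (1 - labels) * T.log(1 - T.nnet.sigmoid(scores))
loss = -T.mean(objective)
f = theano.function([pairs, labels], loss)
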
Beispiel #48
0
			inputs  = [],
			outputs = T.mean(T.neq(T.argmax(predictions, axis=1), Y)),
			updates = updates,
			givens  = {
				X: data,
				Y: labels,
			}
		)
	return train_model



if __name__ == '__main__':
	print "Setting up memory..."
	X = T.bmatrix('X')
	Y = T.bvector('Y')
	Ws_char_to_hidden   = [ U.create_shared(U.initial_weights(CHARACTERS,HIDDEN),name='yeah%d'%i) for i in xrange(CONTEXT) ]
	b_hidden            = U.create_shared(U.initial_weights(HIDDEN))
	W_hidden_to_hidden  = U.create_shared(U.initial_weights(HIDDEN,HIDDEN))
	W_hidden_to_predict = U.create_shared(U.initial_weights(HIDDEN,CHARACTERS))
	b_predict           = U.create_shared(U.initial_weights(CHARACTERS))
	tunables = Ws_char_to_hidden + [
			b_hidden, 
			W_hidden_to_hidden,
			W_hidden_to_predict,
			b_predict
		]

	print "Constructing graph..."
	hidden_inputs  = make_hidden_inputs(X,Ws_char_to_hidden,b_hidden)
	hidden_outputs = make_hidden_outputs(hidden_inputs,W_hidden_to_hidden)
Beispiel #49
0

def policy_network(state):
    input_state = InputLayer(input_var=state, shape=(None, n_input))

    dense_1 = DenseLayer(input_state, num_units=n_input, nonlinearity=tanh)

    dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh)

    probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax)

    return probs


X_state = T.fmatrix()
X_action = T.bvector()
X_reward = T.fvector()

X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)

policy_ = get_output(prob_values)
policy = theano.function(inputs=[X_state],
                         outputs=policy_,
                         allow_input_downcast=True)

loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()

params = get_all_params(prob_values)
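
The snippet stops after collecting the parameters. A plausible continuation, written here as an assumption rather than the original code, builds SGD updates from the reward-weighted loss and compiles the training function:

from lasagne.updates import sgd

# continues the symbols defined above (loss, params, X_state, X_action, X_reward)
updates = sgd(loss, params, learning_rate=0.01)

train_fn = theano.function(
    inputs=[X_state, X_action, X_reward],
    outputs=loss,
    updates=updates,
    allow_input_downcast=True)
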
Beispiel #50
0
    def __init__(self, n_hidden, embedding_dimention=50, feature_dimention=61):

        ##n_in: input dimension of the sequence lstm
        ##n_hidden: hidden dimension of the lstm for candi and zp

        self.params = []

        self.w_embedding = init_weight_file(args.embedding,
                                            args.embedding_dimention)
        self.params.append(self.w_embedding)

        self.zp_x_pre_index = T.imatrix("zp_x_pre")
        self.zp_x_post_index = T.imatrix("zp_x_post")

        zp_x_pre_newshape = (T.shape(self.zp_x_pre_index)[0],
                             args.embedding_dimention)
        self.embedding_sub_zp_pre = self.w_embedding[
            self.zp_x_pre_index.flatten()]
        self.zp_x_pre = T.reshape(self.embedding_sub_zp_pre, zp_x_pre_newshape)

        zp_x_post_newshape = (T.shape(self.zp_x_post_index)[0],
                              args.embedding_dimention)
        self.embedding_sub_zp_post = self.w_embedding[
            self.zp_x_post_index.flatten()]
        self.zp_x_post = T.reshape(self.embedding_sub_zp_post,
                                   zp_x_post_newshape)

        zp_nn_pre = LSTM(embedding_dimention, n_hidden, self.zp_x_pre)
        self.params += zp_nn_pre.params

        zp_nn_post = LSTM(embedding_dimention, n_hidden, self.zp_x_post)
        self.params += zp_nn_post.params

        danwei = theano.shared(np.eye(8, dtype=theano.config.floatX))

        H_pre = zp_nn_pre.all_hidden
        H_post = zp_nn_post.all_hidden

        Ws1_pre, heihei = init_weight(n_hidden,
                                      n_hidden,
                                      pre="Ws1_pre_zp",
                                      ones=False)
        Ws2_pre, heihei = init_weight(8,
                                      n_hidden,
                                      pre="Ws2_pre_zp",
                                      ones=False)
        self.params += [Ws1_pre, Ws2_pre]

        A_pre = softmax(T.dot(Ws2_pre, T.dot(Ws1_pre, T.transpose(H_pre))))

        P_pre = T.dot(A_pre, T.transpose(A_pre)) - danwei
        #norm_pre, _ = theano.scan(lambda i, tmp: T.dot(P_pre[i], P_pre[i]) + tmp,
        #          sequences = T.arange(P_pre.shape[0]),
        #          outputs_info = np.asarray(0., dtype=theano.config.floatX))
        #f_norm_pre = T.sum(norm_pre[-1])
        f_norm_pre = (P_pre**2).sum()
        zp_out_pre = T.mean(T.dot(A_pre, H_pre), axis=0)

        Ws1_post, heihei = init_weight(n_hidden,
                                       n_hidden,
                                       pre="Ws1_post_zp",
                                       ones=False)
        Ws2_post, heihei = init_weight(8,
                                       n_hidden,
                                       pre="Ws2_post_zp",
                                       ones=False)
        self.params += [Ws1_post, Ws2_post]
        A_post = softmax(T.dot(Ws2_post, T.dot(Ws1_post, T.transpose(H_post))))

        P_post = T.dot(A_post, T.transpose(A_post)) - danwei
        #norm_post, _ = theano.scan(lambda i, tmp: T.dot(P_post[i], P_post[i]) + tmp,
        #          sequences = T.arange(P_post.shape[0]),
        #          outputs_info = np.asarray(0., dtype=theano.config.floatX))
        #f_norm_post = T.sum(norm_post[-1])
        f_norm_post = (P_post**2).sum()

        zp_out_post = T.mean(T.dot(A_post, H_post), axis=0)

        f_norm = f_norm_pre + f_norm_post

        #self.zp_out = T.concatenate((zp_nn_pre.nn_out,zp_nn_post.nn_out))
        self.zp_out = T.concatenate((zp_out_pre, zp_out_post))

        self.zp_out_output = self.zp_out

        ### get sequence output for NP ###
        self.np_x_post_index = T.itensor3("np_x")
        self.np_x_postc_index = T.itensor3("np_x")
        self.np_x_pre_index = T.itensor3("np_x")
        self.np_x_prec_index = T.itensor3("np_x")

        np_x_post_newshape = (T.shape(self.np_x_post_index)[0],
                              T.shape(self.np_x_post_index)[1],
                              args.embedding_dimention)
        self.embedding_sub_np_x_post = self.w_embedding[
            self.np_x_post_index.flatten()]
        self.np_x_post = T.reshape(self.embedding_sub_np_x_post,
                                   np_x_post_newshape)

        np_x_postc_newshape = (T.shape(self.np_x_postc_index)[0],
                               T.shape(self.np_x_postc_index)[1],
                               args.embedding_dimention)
        self.embedding_sub_np_x_postc = self.w_embedding[
            self.np_x_postc_index.flatten()]
        self.np_x_postc = T.reshape(self.embedding_sub_np_x_postc,
                                    np_x_postc_newshape)

        np_x_pre_newshape = (T.shape(self.np_x_pre_index)[0],
                             T.shape(self.np_x_pre_index)[1],
                             args.embedding_dimention)
        self.embedding_sub_np_x_pre = self.w_embedding[
            self.np_x_pre_index.flatten()]
        self.np_x_pre = T.reshape(self.embedding_sub_np_x_pre,
                                  np_x_pre_newshape)

        np_x_prec_newshape = (T.shape(self.np_x_prec_index)[0],
                              T.shape(self.np_x_prec_index)[1],
                              args.embedding_dimention)
        self.embedding_sub_np_x_prec = self.w_embedding[
            self.np_x_prec_index.flatten()]
        self.np_x_prec = T.reshape(self.embedding_sub_np_x_prec,
                                   np_x_prec_newshape)

        self.mask_pre = T.matrix("mask")
        self.mask_prec = T.matrix("mask")

        self.mask_post = T.matrix("mask")
        self.mask_postc = T.matrix("mask")

        self.np_nn_pre = sub_LSTM_batch(embedding_dimention, n_hidden,
                                        self.np_x_pre, self.np_x_prec,
                                        self.mask_pre, self.mask_prec)
        self.params += self.np_nn_pre.params
        self.np_nn_post = sub_LSTM_batch(embedding_dimention, n_hidden,
                                         self.np_x_post, self.np_x_postc,
                                         self.mask_post, self.mask_postc)
        self.params += self.np_nn_post.params

        self.np_nn_post_output = self.np_nn_post.nn_out
        self.np_nn_pre_output = self.np_nn_pre.nn_out

        self.np_out = T.concatenate(
            (self.np_nn_post_output, self.np_nn_pre_output), axis=1)

        np_nn_f = LSTM(n_hidden * 2, n_hidden * 2, self.np_out)
        self.params += np_nn_f.params
        np_nn_b = LSTM(n_hidden * 2, n_hidden * 2, self.np_out[::-1])
        self.params += np_nn_b.params

        self.bi_np_out = T.concatenate(
            (np_nn_f.all_hidden, np_nn_b.all_hidden[::-1]), axis=1)

        self.np_out_output = self.bi_np_out
        #self.get_np_out = theano.function(inputs=[self.np_x_pre,self.np_x_prec,self.np_x_post,self.np_x_postc,self.mask_pre,self.mask_prec,self.mask_post,self.mask_postc],outputs=[self.np_out_output])

        #self.feature = T.matrix("feature")
        #self.feature_layer = Layer(feature_dimention,n_hidden,self.feature,repre_active)
        #self.params += self.feature_layer.params

        w_attention_zp, b_attention = init_weight(n_hidden * 2,
                                                  1,
                                                  pre="attention_zp",
                                                  ones=False)
        self.params += [w_attention_zp, b_attention]

        w_attention_np, b_u = init_weight(n_hidden * 2,
                                          1,
                                          pre="attention_np",
                                          ones=False)
        #self.params += [w_attention_np]

        w_attention_np_rnn, b_u = init_weight(n_hidden * 4,
                                              1,
                                              pre="attention_np_rnn",
                                              ones=False)
        self.params += [w_attention_np_rnn]

        #np_out_dropout = _dropout_from_layer(self.np_out_output)
        #zp_out_dropout = _dropout_from_layer(self.zp_out_output)
        #np_dropout = _dropout_from_layer(self.np_out)

        #self.calcu_attention_dropout = tanh(T.dot(np_out_dropout,w_attention_np_rnn) + T.dot(zp_out_dropout,w_attention_zp) + T.dot(np_dropout,w_attention_np) + b_attention)

        #self.calcu_attention = tanh(T.dot(self.np_out_output,w_attention_np_rnn) + T.dot(self.zp_out_output,w_attention_zp) + T.dot(self.np_out,w_attention_np) + b_attention)
        self.calcu_attention = tanh(
            T.dot(self.np_out_output, w_attention_np_rnn) +
            T.dot(self.zp_out_output, w_attention_zp) + b_attention)

        self.attention = softmax(T.transpose(self.calcu_attention,
                                             axes=(1, 0)))[0]
        #self.attention_dropout = softmax(T.transpose(self.calcu_attention_dropout,axes=(1,0)))[0]

        self.out = self.attention
        #self.out_dropout = self.attention_dropout

        self.get_out = theano.function(inputs=[
            self.zp_x_pre_index, self.zp_x_post_index, self.np_x_pre_index,
            self.np_x_prec_index, self.np_x_post_index, self.np_x_postc_index,
            self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc
        ],
                                       outputs=[self.out],
                                       on_unused_input='warn')

        l1_norm = sum([(abs(w)).sum() for w in self.params])
        l2_norm_squared = sum([(w**2).sum() for w in self.params])

        lmbda_l1 = 0.0
        #lmbda_l2 = 0.001
        lmbda_l2 = 0.0

        t = T.bvector()
        cost = -(T.log((self.out * t).sum())) + f_norm
        #cost = -(T.log((self.out_dropout*t).sum()))

        lr = T.scalar()

        updates = lasagne.updates.sgd(cost, self.params, lr)
        #updates = lasagne.updates.adadelta(cost, self.params)

        self.train_step = theano.function(inputs=[
            self.zp_x_pre_index, self.zp_x_post_index, self.np_x_pre_index,
            self.np_x_prec_index, self.np_x_post_index, self.np_x_postc_index,
            self.mask_pre, self.mask_prec, self.mask_post, self.mask_postc, t,
            lr
        ],
                                          outputs=[cost],
                                          on_unused_input='warn',
                                          updates=updates)
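
The f_norm term added to the cost above is the penalty produced by the structured self-attention construction earlier in this example: with A the (hops x length) attention matrix, ||A·A^T - I||_F^2 encourages the hops to attend to different tokens. A minimal numpy sketch with illustrative sizes (softmax is applied row-wise, matching Theano's behaviour on matrices):

import numpy as np

def softmax_rows(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

hops, length, n_hidden = 8, 12, 10
H = np.random.randn(length, n_hidden)        # LSTM hidden states
Ws1 = np.random.randn(n_hidden, n_hidden)
Ws2 = np.random.randn(hops, n_hidden)

A = softmax_rows(Ws2.dot(Ws1.dot(H.T)))      # hops x length attention matrix
P = A.dot(A.T) - np.eye(hops)
f_norm = (P ** 2).sum()                      # Frobenius-norm penalty added to the cost
zp_out = A.dot(H).mean(axis=0)               # attention-pooled representation
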