Example #1
    # Builds a state-value baseline (Embedding -> LSTM -> linear head) that is
    # trained with SGD to regress the observed rewards `r`. Assumes `theano`
    # and the project's Embedding/LstmRecurrent/MLP layer classes are in scope.
    def _build_baseline(self, emb_size, n_observations, lstm_n_cells, o, r, lr):
        b_input = Embedding(name="bemb",
                            size=emb_size,
                            n_features=n_observations,
                            input=o)

        b_lstm = LstmRecurrent(name="blstm",
                               size=lstm_n_cells,
                               seq_output=True,
                               out_cells=False,
                               peepholes=False,
                               output_initial_state=False,
                               p_drop=0.0)
        b_lstm.connect(b_input)

        b_out = MLP([1],
                    ['linear'],
                    [0.0],
                    name="mlpb")
        b_out.connect(b_lstm)

        b = b_out.output()

        # Drop the trailing singleton dimension: (time, seq_id, 1) -> (time, seq_id).
        b = b.reshape((b.shape[0], b.shape[1]))

        params = b_out.get_params()
        # Squared-error loss between the predicted baseline and the rewards.
        loss = ((b - r)**2).sum()

        d_loss = theano.grad(loss, params)

        # Plain SGD: step each parameter against its gradient.
        upd = []
        for p, dp in zip(params, d_loss):
            upd.append((p, p - lr * dp))

        # Compiled training step: applies the updates and returns the loss.
        self.blearn = theano.function([o, r, lr], loss, updates=upd)

        return b
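
For reference, a minimal sketch of how the compiled baseline step might be driven; the `model` instance, the vocabulary size of 50, and the float32 cast (assuming floatX=float32) are illustrative assumptions, not part of the listing.

import numpy as np

time_steps, n_seqs = 7, 4
# Integer observation ids, (time, seq_id), and per-timestep rewards.
obs = np.random.randint(0, 50, size=(time_steps, n_seqs)).astype('int32')
rewards = np.random.randn(time_steps, n_seqs).astype('float32')

# One SGD step on the baseline; returns the squared-error loss.
loss = model.blearn(obs, rewards, 0.1)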
Example #2
    # Builds the policy network (Embedding -> LSTM -> softmax MLP) and compiles
    # a REINFORCE update with a learned baseline. Assumes `theano`,
    # `theano.tensor as tt`, `numpy as np`, the project's layer classes, and
    # `clip_norms` are in scope.
    def _build_model_o(self, emb_size, n_observations, lstm_n_cells, oclf_n_hidden, oclf_n_layers, n_actions, oclf_activation):
        o = tt.imatrix(name='o')   # observation ids, (time, seq_id)
        a = tt.imatrix(name='a')   # actions taken, (time, seq_id)
        r = tt.matrix(name='r')    # rewards, (time, seq_id)
        lr = tt.scalar(name='lr')  # learning rate

        # Baseline value network; its LSTM size is fixed at 10 cells here.
        b = self._build_baseline(emb_size, n_observations, 10, o, r, lr)

        l_input = Embedding(name="emb",
                            size=emb_size,
                            n_features=n_observations,
                            input=o)
        prev_layer = l_input

        l_lstm = LstmRecurrent(name="lstm",
                               size=lstm_n_cells,
                               seq_output=True,
                               out_cells=False,
                               peepholes=False,
                               output_initial_state=False,
                               p_drop=0.0)
        l_lstm.connect(prev_layer)
        prev_layer = l_lstm

        l_action = MLP([oclf_n_hidden  ] * oclf_n_layers + [n_actions],
                       [oclf_activation] * oclf_n_layers + ['softmax'],
                       [0.0            ] * oclf_n_layers + [0.0      ],
                       name="mlp")
        l_action.connect(prev_layer)
        prev_layer = l_action

        pi = prev_layer.output()

        # Flatten the policy output so all timesteps are stacked in one matrix:
        # (time, seq_id, n_actions) -> (time * seq_id, n_actions).
        orig_shape = pi.shape
        pi = tt.reshape(pi, (pi.shape[0] * pi.shape[1], pi.shape[2]))

        # Flatten the actions, rewards and baseline predictions the same way.
        col_actions = tt.reshape(a, (pi.shape[0], ))
        col_rewards = tt.reshape(r, (pi.shape[0], ))
        col_b = tt.reshape(b, (pi.shape[0], ))

        # Parameters to optimise (a disabled name filter is kept for reference).
        params = [x for x in l_action.get_params()]  # if not x.name.startswith('mlp_0')]
        print(params)  # debug: list the trainable parameters

        # Probability the policy assigned to each action actually taken
        # (a small epsilon, e.g. + 1e-7, could guard the log below).
        lin_actions_p = pi[tt.arange(pi.shape[0]), col_actions]
        # REINFORCE objective: log-likelihood weighted by the baseline-corrected reward.
        objective = tt.sum(tt.log(lin_actions_p) * (col_rewards - col_b))  # optionally * 1.0 / orig_shape[1]

        # Restore the (time, seq_id, n_actions) shape for callers.
        pi = tt.reshape(pi, orig_shape)

        d_objective = theano.grad(objective, params)
        d_objective = clip_norms(d_objective, 5.0)

        # Gradient ascent (note the plus sign): the objective is maximised.
        upd = []
        for p, dp in zip(params, d_objective):
            upd.append((p, p + lr * dp))

        # Compiled step: applies the update and also returns the policy,
        # objective, baseline and clipped gradients for inspection.
        self.learn = theano.function([o, a, r, lr], [pi, objective, b] + d_objective, updates=upd)
        self.pi = theano.function([o], pi)

        self.params = params

        # Snapshot the initial parameter values (e.g. so they can be restored later).
        self.orig_values = [np.copy(p.get_value()) for p in params]
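
A rough sketch of how the compiled functions might be driven in a training loop; `model`, `collect_episodes`, and the learning rates below are illustrative assumptions, not part of the listing.

# Hypothetical REINFORCE loop (rollout collection is assumed, not from the source):
for step in range(1000):
    # obs, actions: int32 matrices; rewards: float matrix; all (time, seq_id).
    obs, actions, rewards = collect_episodes(model.pi)  # assumed rollout helper
    # Regress the baseline towards the observed rewards.
    model.blearn(obs, rewards, 0.1)
    # One baseline-corrected policy-gradient ascent step; the first three
    # returned values are the action distribution, objective and baseline.
    pi, objective, b = model.learn(obs, actions, rewards, 0.01)[:3]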