Example #1
 def get_loss_sarsa_function(self):
     #args
     self.states = T.matrix('state')
     self.actions = T.icol('action')
     self.next_states = T.matrix('next_state')
     self.next_actions = T.icol('next_action')
     self.rewards = T.col('reward')
     #q(s,a)
     actionmask = T.eq(
         T.arange(self.nactions).reshape((1, -1)),
         self.actions.reshape((-1, 1))).astype(theano.config.floatX)
     q_action = (get_output(self.network, self.states) *
                 actionmask).sum(axis=1).reshape((-1, 1))
     #q(s_next,a_next)
     next_actionmask = T.eq(
         T.arange(self.nactions).reshape((1, -1)),
         self.next_actions.reshape((-1, 1))).astype(theano.config.floatX)
     next_q_action = (get_output(self.network, self.next_states) *
                      next_actionmask).sum(axis=1).reshape((-1, 1))
     # TD error: target - Q(s, a)
     loss = (self.rewards + self.discount * next_q_action - q_action)
     # squared TD error (the 0.5 factor makes the gradient equal to the TD error)
     mse = 0.5 * loss**2
     # sum over the batch
     return T.sum(mse)
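A note on the pattern above: comparing `T.arange(self.nactions)` (shape (1, nactions)) against the (batch, 1) int32 column from `T.icol` broadcasts into a one-hot action mask, so multiplying it with the Q-value matrix and summing over axis 1 selects Q(s, a) row by row. A minimal NumPy sketch of that selection step (all names here are illustrative, not from the snippet):

    import numpy as np

    nactions = 4
    q_values = np.arange(12, dtype='float32').reshape(3, nactions)  # fake Q(s, .) for 3 states
    actions = np.array([[1], [3], [0]], dtype='int32')              # (batch, 1), as T.icol expects

    # broadcast a (1, nactions) range against the (batch, 1) action column -> one-hot mask
    mask = (np.arange(nactions).reshape(1, -1) == actions).astype('float32')
    q_selected = (q_values * mask).sum(axis=1, keepdims=True)       # Q(s, a), shape (batch, 1)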
Example #2
	def __init__(self, args):
		reward = T.col('r')
		action = T.icol('a')
		terminal = T.icol('t')
		discount = T.scalar('gamma')
		learningRate = T.scalar('lr')
		rho = T.scalar('rho')
		epsilon = T.scalar('eps')
		rng = np.random.RandomState(42)
		
		self.batchNb = args.batchSize
		
		#convLayers = [[(8,8),(4,4),64],
		#			  [(4,4),(2,2),128],
		#			  [(3,3),(1,1),256],
		#			  [(3,3),(1,1),512]]
		#fcl = [1024, 6]
		
		convLayers = [[(8,8),(4,4),64],
					  [(4,4),(2,2),128],
					  [(3,3),(1,1),256],
					  [(3,3),(1,1),256]]
		fcl = [1024, args.actionNb]
		self.q1 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
		self.q2 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
		self.q2.setParams(self.q1)
		
		self.states = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
		self.states2 = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
		self.actions = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
		self.rewards = theano.shared(np.zeros((args.batchSize,1), dtype='float32'), broadcastable=(False,True))
		self.terminals = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
		
		self.learningRate = theano.shared(np.array(args.learningRate, dtype='float32'))
		self.rho = theano.shared(np.array(args.rmsPropRho, dtype='float32'))
		self.epsilon = theano.shared(np.array(args.rmsPropEpsilon, dtype='float32'))
		self.discount = theano.shared(np.array(args.discountFactor, dtype='float32'))
		
		loss = self.QLoss(self.q1.output, self.q2.output, action, reward, terminal, discount)
		
		params = self.q1.getParams()
		
		updates = self.rmsProp(loss, params, rho, epsilon, learningRate)
		self.train_model = theano.function(
			[],
			loss,
			updates=updates,
			givens = { 
					   self.q1.input: self.states,
					   self.q2.input: self.states2,
					   action: self.actions,
					   reward: self.rewards,
					   terminal: self.terminals,
					   discount: self.discount,
					   learningRate: self.learningRate,
					   rho: self.rho,
					   epsilon: self.epsilon
					 }
		)
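For context on the training step above: `self.rmsProp(loss, params, rho, epsilon, learningRate)` must return a list of Theano update pairs, but its body is not shown here. A standard RMSProp update of the kind it presumably implements looks like the following sketch (assumed, not taken from this project):

    import theano
    import theano.tensor as T

    def rmsprop_updates(loss, params, rho, epsilon, learning_rate):
        # Track a running average of squared gradients and scale each step by its root.
        updates = []
        for p in params:
            g = T.grad(loss, p)
            acc = theano.shared(p.get_value() * 0.)
            acc_new = rho * acc + (1 - rho) * g ** 2
            updates.append((acc, acc_new))
            updates.append((p, p - learning_rate * g / T.sqrt(acc_new + epsilon)))
        return updates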
Example #3
 def __init__(self, lenW, dimW, dimS):
     self.W = th.shared(np.random.randn(lenW, dimW))
     self.Uw = th.shared(np.random.randn(dimW, dimS))
     self.Us = th.shared(np.random.randn(dimS, dimS))
     self.V = th.shared(np.random.randn(dimS, lenW))
     self.S0 = th.shared(np.random.randn(dimS,))
     self.idx = T.icol()
     self.w = self.W[self.idx].reshape((self.idx.shape[0], self.W.shape[1]))
     def recurrence(w, s):
         # import ipdb; ipdb.set_trace()
         s1 = T.nnet.sigmoid(T.dot(w, self.Uw))
         s2 = T.nnet.sigmoid(T.dot(s, self.Us))
         ss = s1 + s2
         pp = T.dot(s, self.V)
         return [ss, pp]
     [self.S, self.PP], _ = th.scan(fn=recurrence, sequences=self.w, outputs_info=[self.S0, None], n_steps=self.w.shape[0])
     self.P = T.nnet.softmax(self.PP)
     self.RP = self.P[T.arange(self.w.shape[0]), self.idx[:,0]]
     self.cost = -T.sum(T.log(self.RP))
     self.params = [self.W, self.Uw, self.Us, self.V, self.S0]
     self.grads = T.grad(self.cost, self.params)
     self.lr = T.scalar()
     self.updates = [(param, param - self.lr * grad)
                     for param, grad in zip(self.params, self.grads)]
     self.train_fn = th.function([self.idx, self.lr], [self.cost], updates=self.updates, allow_input_downcast=True)
     self.fprop = th.function([self.idx], [self.S, self.P, self.cost], allow_input_downcast=True)
Example #4
    def __init__(self, lenW, dimW, dimS):
        self.W = th.shared(np.random.randn(lenW, dimW))
        self.Uw = th.shared(np.random.randn(dimW, dimS))
        self.Us = th.shared(np.random.randn(dimS, dimS))
        self.V = th.shared(np.random.randn(dimS, lenW))
        self.S0 = th.shared(np.random.randn(dimS, ))
        self.idx = T.icol()
        self.w = self.W[self.idx].reshape((self.idx.shape[0], self.W.shape[1]))

        def recurrence(w, s):
            # import ipdb; ipdb.set_trace()
            s1 = T.nnet.sigmoid(T.dot(w, self.Uw))
            s2 = T.nnet.sigmoid(T.dot(s, self.Us))
            ss = s1 + s2
            pp = T.dot(s, self.V)
            return [ss, pp]

        [self.S, self.PP], _ = th.scan(fn=recurrence,
                                       sequences=self.w,
                                       outputs_info=[self.S0, None],
                                       n_steps=self.w.shape[0])
        self.P = T.nnet.softmax(self.PP)
        self.RP = self.P[T.arange(self.w.shape[0]), self.idx[:, 0]]
        self.cost = -T.sum(T.log(self.RP))
        self.params = [self.W, self.Uw, self.Us, self.V, self.S0]
        self.grads = T.grad(self.cost, self.params)
        self.lr = T.scalar()
        self.updates = [(param, param - self.lr * grad)
                        for param, grad in zip(self.params, self.grads)]
        self.train_fn = th.function([self.idx, self.lr], [self.cost],
                                    updates=self.updates,
                                    allow_input_downcast=True)
        self.fprop = th.function([self.idx], [self.S, self.P, self.cost],
                                 allow_input_downcast=True)
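Because `self.idx` is declared with `T.icol()`, the compiled functions above expect an int32 column of word indices with shape (n, 1); the `W[self.idx].reshape(...)` line then turns it into an (n, dimW) matrix of embeddings. A self-contained sketch of just that lookup step (sizes are arbitrary):

    import numpy as np
    import theano
    import theano.tensor as T

    idx = T.icol('idx')                               # int32 column of indices, shape (n, 1)
    W = theano.shared(np.random.randn(1000, 50))      # toy embedding table
    w = W[idx].reshape((idx.shape[0], W.shape[1]))    # same indexing trick as above
    lookup = theano.function([idx], w)

    tokens = np.array([[3], [17], [256]], dtype='int32')
    print(lookup(tokens).shape)                       # (3, 50)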
Example #5
    def __init__(self,
                 state_shape,
                 num_actions,
                 epsilon=1.0,
                 epsilon_min=0.1,
                 epsilon_iter=100000,
                 discount=0.99,
                 lrate=1e-3,
                 batch_size=100,
                 q_update_iter=1000,
                 capacity=50000):

        if not isinstance(state_shape, tuple):
            raise AssertionError('state_shape must be of type <tuple>.')
        if len(state_shape) == 0:
            raise AssertionError('No state space dimensions provided.')
        if num_actions == 0:
            raise ValueError('Number of actions must be > 0.')
        if epsilon_min is not None:
            assert epsilon_min < epsilon, 'Epsilon(min) must be < epsilon(max).'
        if capacity < batch_size:
            raise ValueError('Replay capacity must be > batch_size.')

        self.state_shape = state_shape
        self.num_actions = num_actions
        self.q_network = build_network(state_shape, num_actions)
        self.q_targets = build_network(state_shape, num_actions)
        self.epsilon = epsilon
        self.epsilon_max = epsilon  # How greedy the policy is
        self.epsilon_min = epsilon_min
        self.epsilon_iter = float(epsilon_iter)
        self.discount = discount
        self.lr = lrate
        self.batch_size = batch_size  # How many samples to draw from buffer
        self.q_update_iter = q_update_iter  # Update the q_target every C iter
        self.step = 0
        self.replay_buffer = ReplayBuffer(capacity, state_shape)

        # Build training and sampling functions
        s0_sym = nn.get_all_layers(self.q_network)[0].input_var
        s1_sym = nn.get_all_layers(self.q_targets)[0].input_var
        a_sym = T.icol('actions')  #(n, 1)
        r_sym = T.col('rewards')
        t_sym = T.col('terminal_state')
        sym_vars = [s0_sym, a_sym, r_sym, s1_sym, t_sym]

        # Training phase uses non-deterministic mapping
        loss = T.sum(self._build_loss(*sym_vars, deterministic=False))
        params = nn.get_all_params(self.q_network, trainable=True)
        updates = lasagne.updates.adam(loss, params, self.lr, beta1=0.9)

        self.train_fn = theano.function(sym_vars, loss, updates=updates)

        # Build function for sampling from DQN
        pred = nn.get_output(self.q_network, deterministic=True)
        self.pred_fn = theano.function([s0_sym], pred)
Example #6
    def __init__(self, **kwargs):
        # assign default values that must be present, or else the network will not work
        self.options = {
            "networktype": "CNN-BLSTM",
            "NUMBER_OF_CLASSES": 1,
            "N_L1": 200,
            "N_L2": 200,
            "DROPOUT_IN": 0.,
            "DROPOUT_LSTM": 0.1,
            "DROPOUT_OUT": 0.5,
            "DENSELAYER_NODES": 100,
            "L2": 0.00,
            "early_stopping": 10,
        }

        # load user supplied options
        for k in kwargs.keys():
            self.options[k] = kwargs[k]

        # define some variables
        self.options["BS_PR_SEQ"] = self.options[
            "SEQ_SIZE"]  # bases per sequence - actual sequence length
        self.options["FS"] = [
            self.options["BS_PR_SEQ"] -
            (self.options["FILTER_SIZES"][i] / len(self.options["VOCAB"])) + 1
            for i in range(len(self.options["FILTER_SIZES"]))
        ]
        self.options["ALL_F"] = sum(self.options["FS"])
        self.options["NUMBER_OF_CONV_LAYERS"] = len(
            self.options["FILTER_SIZES"])

        # temporary compatibility fix
        self.type = self.options["networktype"]
        self.VOCAB = self.options["VOCAB"]
        self.FS = self.options["FS"]
        self.ALL_F = self.options["ALL_F"]
        self.BS_PR_SEQ = self.options["BS_PR_SEQ"]
        #self.DROPOUT_LSTM = self.options["DROPOUT_LSTM"]
        self.GRAD_CLIP = self.options["GRAD_CLIP"]
        self.FILTER_SIZES = self.options["FILTER_SIZES"]

        #######################################################
        # symbolic variables                                  #
        #######################################################
        # Theano defines its computations using symbolic variables. A symbolic
        # variable declares a matrix, vector, or 3-D tensor together with its data
        # type, but unlike a MATLAB matrix or an np.array it holds no data itself.
        # Note that the mask (built elsewhere in the model) is constructed with a
        # broadcastable argument so that it can be broadcast along the third dimension.
        self.sym_input = T.tensor3('inputs')
        self.sym_target = T.icol('targets')

        # finally, build the model layers
        self.build_model()
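As the comment block above says, `self.sym_input` and `self.sym_target` only declare a data type and a number of dimensions; values are supplied when a compiled function is called. A minimal sketch of that idea, independent of this class:

    import numpy as np
    import theano
    import theano.tensor as T

    inputs = T.tensor3('inputs')    # 3-D floatX tensor, holds no data yet
    targets = T.icol('targets')     # int32 column vector, holds no data yet

    f = theano.function([inputs, targets], [inputs.mean(), targets.sum()])

    # Data only enters the graph when the compiled function is called:
    x = np.zeros((2, 3, 4), dtype=theano.config.floatX)
    y = np.array([[1], [0], [3]], dtype='int32')
    print(f(x, y))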
Example #7
    def __init__(self, input_width, input_height, output_dim, num_frames, batch_size):
        self.input_width = input_width
        self.input_height = input_height
        self.output_dim = output_dim
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.gamma = 0.99 # discount factor
        self.rho = 0.99
        self.lr = 0.00025 # learning rate
        self.momentum = 0.95
        self.freeze_targets = True

        self.l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
        if self.freeze_targets:
            self.next_l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
#        terminals = T.icol('terminals')

        self.states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False,True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))
#        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))

        q_vals = self.l_out.get_output(states / 255.0)
        if self.freeze_targets:
            next_q_vals = self.next_l_out.get_output(next_states / 255.0)
        else:
            next_q_vals = self.l_out.get_output(next_states / 255.0)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + self.gamma * T.max(next_q_vals, axis=1, keepdims=True)
        diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1,1))
        loss = T.mean(diff ** 2)

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
#            terminals: self.terminals_shared
        }
        if self.momentum > 0:
            updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2)
        else:
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6)
        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals, givens={ states: self.states_shared })
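The constructor above follows the usual Theano minibatch pattern: the batch lives in shared variables, `givens` binds them to the symbolic inputs, and the compiled `self._train` is then called with no arguments. A stripped-down, self-contained sketch of that pattern (toy model, not this network):

    import numpy as np
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.col('y')
    w = theano.shared(np.zeros((3, 1), dtype=theano.config.floatX))

    x_shared = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX))
    y_shared = theano.shared(np.zeros((4, 1), dtype=theano.config.floatX),
                             broadcastable=(False, True))

    loss = ((T.dot(x, w) - y) ** 2).mean()
    train = theano.function([], loss,
                            updates=[(w, w - 0.1 * T.grad(loss, w))],
                            givens={x: x_shared, y: y_shared})

    # Refresh the minibatch by writing into the shared variables, then call with no arguments.
    x_shared.set_value(np.random.randn(4, 3).astype(theano.config.floatX))
    y_shared.set_value(np.random.randn(4, 1).astype(theano.config.floatX))
    print(train())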
Example #8
    def _create_network(self):
        logger.info("Building network ...")
        net, input_var = self._build_network()
        target_values = T.matrix('target_output')
        actions = T.icol('actions')

        # Create masks
        # mask = theano.shared(np.zeros((self.batch_size, self.num_actions)).astype(np.int32))
        mask = T.zeros_like(target_values)
        mask = T.set_subtensor(
            mask[T.arange(self.batch_size),
                 actions.reshape((-1, ))], 1)

        # feed-forward path
        network_output = lasagne.layers.get_output(net, input_var / 255.0)

        # Add regularization penalty
        loss = squared_error(network_output * mask, target_values).mean()
        if self.weight_decay > 0.0:
            loss += regularize_network_params(net, l2) * self.weight_decay

        # Retrieve all parameters from the network
        all_params = lasagne.layers.get_all_params(net, trainable=True)

        # Compute updates for training
        if self.clip_error:
            grads = theano.gradient.grad(loss, all_params)
            grads = [
                lasagne.updates.norm_constraint(grad, self.clip_error,
                                                range(grad.ndim))
                for grad in grads
            ]
            updates = self.optimizer(grads,
                                     all_params,
                                     learning_rate=self.learning_rate,
                                     rho=self.decay_rate)
        else:
            updates = self.optimizer(loss,
                                     all_params,
                                     learning_rate=self.learning_rate,
                                     rho=self.decay_rate)

        # Theano functions for training and computing cost
        logger.info("Compiling functions ...")
        train = theano.function([input_var, target_values, actions],
                                [loss, network_output, target_values, mask],
                                updates=updates)
        predict = theano.function([input_var], network_output)

        return net, train, predict
Example #9
    def _create_network(self):
        logger.info("Building network ...")
        net, input_var = self._build_network()
        target_values = T.matrix('target_output')
        actions = T.icol('actions')

        # Create masks
        # mask = theano.shared(np.zeros((self.batch_size, self.num_actions)).astype(np.int32))
        mask = T.zeros_like(target_values)
        mask = T.set_subtensor(mask[T.arange(self.batch_size), actions.reshape((-1,))], 1)

        # feed-forward path
        network_output = lasagne.layers.get_output(net, input_var / 255.0)

        # Add regularization penalty
        loss = squared_error(network_output * mask, target_values).mean()
        if self.weight_decay > 0.0:
            loss += regularize_network_params(net, l2) * self.weight_decay

        # Retrieve all parameters from the network
        all_params = lasagne.layers.get_all_params(net, trainable=True)

        # Compute updates for training
        if self.clip_error:
            grads = theano.gradient.grad(loss, all_params)
            grads = [lasagne.updates.norm_constraint(grad, self.clip_error, range(grad.ndim)) for grad in grads]
            updates = self.optimizer(grads, all_params, learning_rate=self.learning_rate, rho=self.decay_rate)
        else:
            updates = self.optimizer(loss, all_params, learning_rate=self.learning_rate, rho=self.decay_rate)

        # Theano functions for training and computing cost
        logger.info("Compiling functions ...")
        train = theano.function([input_var, target_values, actions], [loss, network_output, target_values, mask], updates=updates)
        predict = theano.function([input_var], network_output)

        return net, train, predict
Example #10
def test_git_on_gip(hyper_params=None, rng_seed=1234):
    assert hyper_params is not None
    # Initialize a source of randomness
    rng = np.random.RandomState(rng_seed)

    sup_count = 100
    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False)
    Xtr_su = datasets[0][0].get_value(borrow=False)
    Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32)
    Xtr_un = datasets[1][0].get_value(borrow=False)
    Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32)
    # get the joint labeled and unlabeled data
    Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX)
    Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]])
    # get the labeled data
    Xtr_su = Xtr_su.astype(theano.config.floatX)
    Ytr_su = Ytr_su[:,np.newaxis]
    # get observations and labels for the validation set
    Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX)
    Yva = datasets[2][1].get_value(borrow=False).astype(np.int32)
    Yva = Yva[:,np.newaxis] # numpy is dumb
    # get size information for the data
    un_samples = Xtr_un.shape[0]
    su_samples = Xtr_su.shape[0]
    va_samples = Xva.shape[0]

    # set up some symbolic variables for input/output
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    Yd = T.icol('Yd_base')

    # set some "shape" parameters for the networks
    data_dim = Xtr_un.shape[1]
    label_dim = 10
    prior_1_dim = 50
    prior_2_dim = 50
    prior_sigma = 1.0
    batch_size = 100

    ##################
    # SETUP A GIPAIR #
    ##################
    gn1_params = {}
    gn1_config = [prior_1_dim, 600, 600, data_dim]
    gn1_params['mlp_config'] = gn1_config
    gn1_params['activation'] = softplus_actfun
    gn1_params['out_type'] = 'bernoulli'
    gn1_params['lam_l2a'] = 1e-3
    gn1_params['vis_drop'] = 0.0
    gn1_params['hid_drop'] = 0.0
    gn1_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in1_params = {}
    shared_config = [data_dim, 600, 600]
    top_config = [shared_config[-1], prior_1_dim]
    in1_params['shared_config'] = shared_config
    in1_params['mu_config'] = top_config
    in1_params['sigma_config'] = top_config
    in1_params['activation'] = softplus_actfun
    in1_params['lam_l2a'] = 1e-3
    in1_params['vis_drop'] = 0.0
    in1_params['hid_drop'] = 0.0
    in1_params['bias_noise'] = 0.1
    in1_params['input_noise'] = 0.0
    # Initialize the base networks for this GIPair
    IN1 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
            params=in1_params, shared_param_dicts=None)
    GN1 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
            params=gn1_params, shared_param_dicts=None)
    # Initialize biases in IN and GN
    IN1.init_biases(0.0)
    GN1.init_biases(0.0)
    # Initialize the GIPair
    GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN1, i_net=IN1, \
            data_dim=data_dim, prior_dim=prior_1_dim, \
            params=None, shared_param_dicts=None)
    # Set cost weighting parameters
    GIP.set_lam_nll(1.0)
    GIP.set_lam_kld(1.0)
    GIP.set_lam_l2w(1e-4)

    ##################
    # SETUP A GITRIP #
    ##################
    # set parameters for the generator network
    gn2_params = {}
    gn2_config = [(prior_2_dim + label_dim), 300, prior_1_dim]
    gn2_params['mlp_config'] = gn2_config
    gn2_params['activation'] = softplus_actfun
    gn2_params['out_type'] = 'gaussian'
    gn2_params['lam_l2a'] = 1e-3
    gn2_params['vis_drop'] = 0.0
    gn2_params['hid_drop'] = 0.0
    gn2_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in2_params = {}
    shared_config = [prior_1_dim, 300]
    top_config = [shared_config[-1], prior_2_dim]
    in2_params['shared_config'] = shared_config
    in2_params['mu_config'] = top_config
    in2_params['sigma_config'] = top_config
    in2_params['activation'] = softplus_actfun
    in2_params['lam_l2a'] = 1e-3
    in2_params['vis_drop'] = 0.0
    in2_params['hid_drop'] = 0.0
    in2_params['bias_noise'] = 0.1
    in2_params['input_noise'] = 0.0
    # choose some parameters for the categorical inferencer
    pn2_params = {}
    pc0 = [prior_1_dim, 300, label_dim]
    pn2_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': False}
    #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    pn2_params['spawn_configs'] = [sc0] #[sc0, sc1]
    pn2_params['spawn_weights'] = [1.0] #[0.5, 0.5]
    # Set remaining params
    pn2_params['activation'] = softplus_actfun
    pn2_params['ear_type'] = 6
    pn2_params['lam_l2a'] = 1e-3
    pn2_params['vis_drop'] = 0.0
    pn2_params['hid_drop'] = 0.0

    # Initialize the base networks for this GITrip
    GN2 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
            params=gn2_params, shared_param_dicts=None)
    IN2 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
            params=in2_params, shared_param_dicts=None)
    PN2 = PeaNet(rng=rng, Xd=Xd, params=pn2_params)
    # Initialize biases in GN, IN, and PN
    GN2.init_biases(0.0)
    IN2.init_biases(0.0)
    PN2.init_biases(0.0)

    # Initialize the GITrip
    GIT = GITrip(rng=rng, \
            Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \
            g_net=GN2, i_net=IN2, p_net=PN2, \
            data_dim=prior_1_dim, prior_dim=prior_2_dim, \
            label_dim=label_dim, batch_size=batch_size, \
            params=None, shared_param_dicts=None)
    # Set cost weighting parameters
    GIT.set_lam_nll(1.0)
    GIT.set_lam_kld(1.0)
    GIT.set_lam_cat(0.0)
    GIT.set_lam_pea(0.0)
    GIT.set_lam_ent(0.0)
    GIT.set_lam_l2w(1e-4)

    #####################################################
    # CONSTRUCT A GITonGIP STACKED, SEMI-SUPERVISED VAE #
    #####################################################
    GOG = GITonGIP(rng=rng, \
            Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \
            gip_vae=GIP, git_vae=GIT, \
            data_dim=data_dim, prior_1_dim=prior_1_dim, \
            prior_2_dim=prior_2_dim, label_dim=label_dim, \
            batch_size=batch_size, \
            params=None, shared_param_dicts=None)

    #################################
    # WRITE SOME INFO TO "LOG" FILE #
    #################################
    learn_rate_git = hyper_params['learn_rate_git']
    lam_pea_git = hyper_params['lam_pea_git']
    lam_cat_git = hyper_params['lam_cat_git']
    lam_ent_git = hyper_params['lam_ent_git']
    lam_l2w_git = hyper_params['lam_l2w_git']
    out_name = hyper_params['out_name']

    out_file = open(out_name, 'wb')
    out_file.write("**TODO: More informative output, and maybe a real log**\n")
    out_file.write("learn_rate_git: {0:.4f}\n".format(learn_rate_git))
    out_file.write("lam_pea_git: {0:.4f}\n".format(lam_pea_git))
    out_file.write("lam_cat_git: {0:.4f}\n".format(lam_cat_git))
    out_file.write("lam_ent_git: {0:.4f}\n".format(lam_ent_git))
    out_file.write("lam_l2w_git: {0:.4f}\n".format(lam_l2w_git))
    out_file.flush()

    ##################################################
    # TRAIN THE GIPair FOR SOME NUMBER OF ITERATIONS #
    ##################################################
    learn_rate = 0.002
    for i in range(250000):
        if (((i + 1) % 100000) == 0):
            learn_rate = learn_rate * 0.8
        scale = min(1.0, (float(i+1) / 50000.0))
        GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98)
        GIP.set_lam_nll(lam_nll=1.0)
        GIP.set_lam_kld(lam_kld=scale)
        # sample some unlabeled data to train with
        tr_idx = npr.randint(low=0,high=un_samples,size=(batch_size,))
        Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0))
        Xc_batch = 0.0 * Xd_batch
        Xm_batch = 0.0 * Xd_batch
        # do a minibatch update of the model, and compute some costs
        outputs = GOG.train_gip(Xd_batch, Xc_batch, Xm_batch)
        joint_cost = 1.0 * outputs[0]
        data_nll_cost = 1.0 * outputs[1]
        post_kld_cost = 1.0 * outputs[2]
        other_reg_cost = 1.0 * outputs[3]
        if ((i % 1000) == 0):
            o_str = "batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \
                    i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost)
            print(o_str)
            out_file.write("{}\n".format(o_str))
            out_file.flush()
        if ((i % 5000) == 0):
            file_name = "GOG_GIP_SAMPLES_b{0:d}.png".format(i)
            Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0)
            sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=10)
            Xs = np.vstack(sample_lists["data samples"])
            utils.visualize_samples(Xs, file_name)

    ########################################################
    # REMOVE (SORT OF) UNUSED DIMENSIONS FROM LATENT SPACE #
    ########################################################
    #tr_idx = npr.randint(low=0,high=un_samples,size=(10000,))
    #Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0))
    #Xp_batch = GIP.IN.mean_posterior(Xd_batch, 0.0*Xd_batch, 0.0*Xd_batch)
    #Xp_std = np.std(Xp_batch, axis=0, keepdims=True)
    #dim_mask = 1.0 * (Xp_std > 0.1)
    #GIT.set_input_mask(dim_mask)
    #print("MASK NNZ: {0:.4f}".format(np.sum(dim_mask)))

    ##################################################
    # TRAIN THE GITrip FOR SOME NUMBER OF ITERATIONS #
    ##################################################
    GIT.set_lam_l2w(lam_l2w=lam_l2w_git)
    learn_rate = learn_rate_git
    GIT.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98)
    for i in range(250000):
        scale = 1.0
        if (i < 25000):
            scale = float(i+1) / 25000.0
        if (((i + 1) % 50000) == 0):
            learn_rate = learn_rate * 0.8
        # do a minibatch update using unlabeled data
        if True:
            # get some data to train with
            un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,))
            Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0))
            Yd_un = Ytr_un.take(un_idx, axis=0)
            Xc_un = 0.0 * Xd_un
            Xm_un = 0.0 * Xd_un
            # do a minibatch update of the model, and compute some costs
            GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98)
            GIT.set_lam_nll(1.0)
            GIT.set_lam_kld(scale * 1.0)
            GIT.set_lam_cat(0.0)
            GIT.set_lam_pea(scale * lam_pea_git)
            GIT.set_lam_ent(scale * lam_ent_git)
            outputs = GOG.train_git(Xd_un, Xc_un, Xm_un, Yd_un)
            joint_cost = 1.0 * outputs[0]
            data_nll_cost = 1.0 * outputs[1]
            post_kld_cost = 1.0 * outputs[2]
            post_cat_cost = 1.0 * outputs[3]
            post_pea_cost = 1.0 * outputs[4]
            post_ent_cost = 1.0 * outputs[5]
            other_reg_cost = 1.0 * outputs[6]
        if True:
            # get some data to train with
            su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,))
            Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0))
            Yd_su = Ytr_su.take(su_idx, axis=0)
            Xc_su = 0.0 * Xd_su
            Xm_su = 0.0 * Xd_su
            # update only based on the label-based classification cost
            GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98)
            GIT.set_lam_nll(0.0)
            GIT.set_lam_kld(0.0)
            GIT.set_lam_cat(scale * lam_cat_git)
            GIT.set_lam_pea(scale * lam_pea_git)
            GIT.set_lam_ent(0.0)
            outputs = GOG.train_git(Xd_su, Xc_su, Xm_su, Yd_su)
            joint_2 = 1.0 * outputs[0]
            data_nll_2 = 1.0 * outputs[1]
            post_kld_2 = 1.0 * outputs[2]
            post_cat_cost = 1.0 * outputs[3]
            post_pea_2 = 1.0 * outputs[4]
            post_ent_2 = 1.0 * outputs[5]
            other_reg_cost = 1.0 * outputs[6]
        if ((i % 500) == 0):
            o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \
                    i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, other_reg_cost)
            print(o_str)
            out_file.write("{}\n".format(o_str))
            out_file.flush()
            if ((i % 2500) == 0):
                # check classification error on training and validation set
                train_err = GOG.classification_error(Xtr_su, Ytr_su)
                va_err = GOG.classification_error(Xva, Yva)
                o_str = "    tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err)
                print(o_str)
                out_file.write("{}\n".format(o_str))
                out_file.flush()
        if ((i % 5000) == 0):
            file_name = "GoG_GIT_SAMPLES_b{0:d}.png".format(i)
            va_idx = npr.randint(low=0,high=va_samples,size=(5,))
            Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])])
            Xd_samps = np.repeat(Xd_samps, 3, axis=0)
            sample_lists = GOG.sample_git_from_data(Xd_samps, loop_iters=10)
            Xs = np.vstack(sample_lists["data samples"])
            Ys = GOG.class_probs(Xs)
            Xs = mnist_prob_embed(Xs, Ys)
            utils.visualize_samples(Xs, file_name)
Example #11
    def __init__(self, input, n_in, n_out):
        """ Initialize the parameters of the logistic regression

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # self._w = init_weights((n_in, n_out))
        # self._w_old = init_weights((n_in, n_out))
        self._w = init_tanh(n_in, n_out, 1234)
        self._w_old = init_tanh(n_in, n_out, 2235)
        print "Initial W " + str(self._w.get_value())
        # (n_out,) is used so that it can be added as a row or a column
        self._b = init_b_weights((n_out, ))
        self._b_old = init_b_weights((n_out, ))

        # learning rate for gradient descent updates.
        self._learning_rate = 0.005
        # future discount
        self._discount_factor = 0.8
        self._weight_update_steps = 5000
        self._updates = 0

        # data types for model
        State = T.fmatrix("State")
        ResultState = T.fmatrix("ResultState")
        Reward = T.col("Reward")
        Action = T.icol("Action")
        # Q_val = T.fmatrix()

        model = T.tanh(T.dot(State, self._w) + self._b)
        self._model = theano.function(inputs=[State],
                                      outputs=model,
                                      allow_input_downcast=True)

        q_val = self.model(State, self._w, self._b)
        action_pred = T.argmax(q_val, axis=1)

        # bellman error, delta error
        delta = ((Reward +
                  (self._discount_factor *
                   T.max(self.model(ResultState, self._w_old, self._b_old),
                         axis=1,
                         keepdims=True))) -
                 (self.model(State, self._w, self._b))[Action])
        # delta = ((Reward + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - T.max(self.model(State), axis=1,  keepdims=True))

        self._L2_reg = 0.01
        # L2 norm ; one regularization option is to enforce
        # L2 norm to be small
        self._L2 = ((self._w**2).sum())
        # total bellman cost
        # Squaring is important so errors do not cancel each other out.
        # mean is used instead of sum as it is more independent of parameter scale
        bellman_cost = T.mean(0.5 * ((delta)**2)) + (self._L2 * self._L2_reg)

        # Compute gradients w.r.t. model parameters
        gradient = T.grad(cost=bellman_cost, wrt=self._w)
        gradient_b = T.grad(cost=bellman_cost, wrt=self._b)
        """
            Updates to apply to parameters
            Performing gradient descent, want to add steps in the negative direction of 
            gradient.
        """
        update = [[self._w, self._w + (-gradient * self._learning_rate)],
                  [self._b, self._b + (-gradient_b * self._learning_rate)]]

        # This function performs one training step and update
        self._train = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_cost,
            updates=update,
            allow_input_downcast=True)
        # Used to get to predicted actions to select
        self._predict = theano.function(inputs=[State],
                                        outputs=action_pred,
                                        allow_input_downcast=True)
        self._q_values = theano.function(inputs=[State],
                                         outputs=q_val,
                                         allow_input_downcast=True)
        self._bellman_error = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=delta,
            allow_input_downcast=True)
Example #12
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, 
                 update_rule, batch_accumulator, randomState, DoubleQ=False, TheQNet=NN):
        """ Initialize environment
        
        """
        QNetwork.__init__(self,environment, batch_size)

        
        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._DoubleQ = DoubleQ
        self._randomState = randomState
        
        QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState)

        self.update_counter = 0
        
        states=[]   # list of symbolic variables, one for each of the k elements in the belief state
                    # --> [ T.tensor4 if the element's observation is a matrix, T.tensor3 if it is a vector, T.matrix if it is a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables, one for each of the k elements in the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._input_dimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState)
        self.q_vals, self.params, shape_after_conv = QNet._buildDQN(states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_q_vals, self.next_params, shape_after_conv = QNet._buildDQN(next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        
        
        if(self._DoubleQ==True):
            givens_next={}
            for i, x in enumerate(self.next_states_shared):
                givens_next[ states[i] ] = x

            self.next_q_vals_current_qnet=theano.function([], self.q_vals,
                                          givens=givens_next)

            next_q_curr_qnet = theano.clone(self.next_q_vals)

            argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True)

            max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1))


        else:
            max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True)


        not_terminals = T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))
        # Note: strangely, (target - q_val) led to problems with python 3.5, theano 0.8.0rc and floatX=float32...
        diff = - q_val + target 

        if self.clip_delta > 0:
            # This loss function implementation is taken from
            # https://github.com/spragunr/deep_q_rl
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss_ind = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss_ind = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss_ind)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss_ind)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared, ## actions not needed!
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
                
        gparams=[]
        for p in self.params:
            gparam =  T.grad(loss, p)
            gparams.append(gparam)

        updates = []
        
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            for i,(p, g) in enumerate(zip(self.params, gparams)):                
                acc = theano.shared(p.get_value() * 0.)
                acc_new = rho * acc + (1 - self.rho) * g ** 2
                gradient_scaling = T.sqrt(acc_new + self.rms_epsilon)
                g = g / gradient_scaling
                updates.append((acc, acc_new))
                updates.append((p, p - thelr * g))

        elif update_rule == 'sgd':
            for i, (param, gparam) in enumerate(zip(self.params, gparams)):
                updates.append((param, param - thelr * gparam))
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))
    
        
        if(self._DoubleQ==True):
            self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        else:
            self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], self.q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
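The comment in the `clip_delta` branch above describes the standard trick of extending the squared loss linearly outside the clip bounds, so that its gradient with respect to `diff` saturates at +/- `clip_delta` instead of dropping to zero. A small NumPy restatement of that piecewise loss and its derivative (illustration only, not project code):

    import numpy as np

    def clipped_loss(diff, clip_delta):
        # Quadratic inside [-clip_delta, clip_delta], linear outside.
        quadratic_part = np.minimum(np.abs(diff), clip_delta)
        linear_part = np.abs(diff) - quadratic_part
        return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

    def clipped_loss_grad(diff, clip_delta):
        # d(loss)/d(diff): equals diff in the quadratic region and is
        # clipped to +/- clip_delta in the linear region.
        return np.clip(diff, -clip_delta, clip_delta)

    d = np.array([-3.0, -0.5, 0.5, 3.0])
    print(clipped_loss(d, 1.0))       # [2.5, 0.125, 0.125, 2.5]
    print(clipped_loss_grad(d, 1.0))  # [-1., -0.5, 0.5, 1.]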
Example #13
    def setup(self):
        lasagne.random.set_rng(self.rng)
        
        self.update_counter = 0

        self.l_out = self.build_q_network()              
               
        states = T.tensor3('states')
        next_states = T.tensor3('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        
        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of an observation,
        # along with the chosen action and resulting
        # reward and terminal status.
        self.states_shared = theano.shared(
            np.zeros((self.batch_size, self.input_height, self.input_width), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        
        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((self.input_height, self.input_width), dtype=theano.config.floatX))

        # Formulas
        q_vals = lasagne.layers.get_output(self.l_out, states / self.input_scale)
        
        next_q_vals = lasagne.layers.get_output(self.l_out, next_states / self.input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)
        
        terminalsX = terminals.astype(theano.config.floatX)
        action_mask = T.eq(T.arange(self.num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * action_mask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        loss = 0.5 * diff ** 2
        loss = T.sum(loss)
        #loss = T.mean(loss)

        # Params and givens            
        params = lasagne.layers.helper.get_all_params(self.l_out)  
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
        train_givens = {
            states: self.states_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
Example #14
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, input_scale=255.0, reward_bias=0.):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + reward_bias +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
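In equation form, the `target` and `diff` above implement the usual one-step Q-learning target with terminal masking (here b is `reward_bias`, t_i the terminal flag, the clip is applied only when `clip_delta` > 0, and theta^- denotes the parameters of `next_l_out` when `freeze_interval` > 0):

    y_i = r_i + b + (1 - t_i)\,\gamma \max_{a'} Q(s'_i, a'; \theta^-)
    \delta_i = \operatorname{clip}\big(y_i - Q(s_i, a_i; \theta),\, -\delta_{\max},\, \delta_{\max}\big)
    \mathcal{L} = \sum_i \delta_i^2 \quad\text{or}\quad \tfrac{1}{N}\sum_i \delta_i^2 \ \ \text{(depending on batch\_accumulator)}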
Example #15
    def __init__(self, batch_size, input_dim, num_frames, action_dim, discount, lr_policy, lr_Q_val_f, memory_capability, defrozen_number, cliff_delta=0):
        self.input_dim = input_dim
        self.num_frames = num_frames
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.discount = discount
        self.lr_policy = lr_policy
        self.lr_Q_val_f = lr_Q_val_f
        self.policy_out, _ = self.build_policy()
        self.Q_val_f_out, l_in = self.build_Q_function()
        self.state_mem= np.zeros((memory_capability, num_frames, input_dim), dtype=theano.config.floatX)
        self.action_mem= np.zeros((memory_capability, action_dim), dtype=theano.config.floatX)
        self.reward_mem= np.zeros((memory_capability, 1), dtype='int32')
        self.next_states_mem=np.zeros((memory_capability, num_frames, input_dim), dtype=theano.config.floatX)
        self.curr_idx=0
        self.train_flag = False
        self.mem_full=False
        self.defrozen_number = defrozen_number
        self.cliff_delta= cliff_delta
        self.target_policy, _ = self.build_policy()
        self.target_q_val_f, _ = self.build_Q_function()

        states = T.tensor3('states')
        next_states = T.tensor3('next_states')
        rewards = T.col('rewards')
        action = T.fmatrix('action')
        next_action = T.fmatrix('next_action')
        terminals = T.icol('terminals')

        lasagne.random.set_rng(self.rng)

        self.input_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_dim), 
            dtype=theano.config.floatX)
        )
        self.rewards_shared=theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True)
        )
        self.action_shared=theano.shared(
            np.zeros((batch_size, action_dim), dtype=theano.config.floatX),
            broadcastable=(False, True)
        )
        self.terminals_shared=theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True)
        )
        self.states_shared=theano.shared(
            np.zeros((batch_size ,num_frames, input_dim),
            dtype=theano.config.floatX)
        )
        self.next_state_shared=theano.shared(
            np.zeros((batch_size, num_frames, input_dim),
            dtype=theano.config.floatX)
        )
        self.next_action_shared=theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX)
        )

        policy_action = lasagne.layers.get_output(self.policy_out, states)

        target_policy_action = lasagne.layers.get_output(self.target_policy, states)
        
        q_vals = lasagne.layers.get_output(self.Q_val_f_out, 
            {
                l_in[0]: states, 
                l_in[2]: action
            })
        
        target_q_val = lasagne.layers.get_output( 
            self.target_q_val_f,
            { 
                l_in[0]: next_states,
                l_in[2]: next_action
            })

        
        terminalsX=terminals.astype(theano.config.floatX)
        yi = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * target_q_val)

        diff = q_vals - yi
        if self.cliff_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.cliff_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.cliff_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        loss = T.mean(loss)

        train_Q_params = lasagne.layers.get_all_params(self.Q_val_f_out)
        train_Q_givens={
            states: self.states_shared,
            rewards: self.rewards_shared,
            action: self.action_shared,
            terminals: self.terminals_shared,
        }
        Q_updates = lasagne.updates.adam(loss, train_Q_params, self.lr_Q_val_f)
        self._train_Q = theano.function([], [loss], updates=Q_updates, givens=train_Q_givens)
        
        train_policy_params = lasagne.layers.get_all_params(self.policy_out)
        # Deterministic policy gradient: ascend Q(s, policy(s)) w.r.t. the policy parameters.
        policy_loss = -T.mean(lasagne.layers.get_output(
            self.Q_val_f_out, {l_in[0]: states, l_in[2]: policy_action}))
        d_train_policy_params = theano.gradient.grad(policy_loss, train_policy_params)
        policy_updates = lasagne.updates.adam(d_train_policy_params, train_policy_params, self.lr_policy)
        self._q_vals = theano.function([], q_vals)
Example #16
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network( network_type, input_width, input_height,
                                         num_actions, num_frames, batch_size )
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network( network_type, input_width, input_height,
                                                  num_actions, num_frames, batch_size )
            self.reset_q_hat( )

        states, next_states = T.tensor4( 'states' ), T.tensor4( 'next_states' )
        actions, rewards = T.icol( 'actions' ), T.col( 'rewards' )
        terminals = T.icol( 'terminals' )

        self.states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ),
                                                      dtype = theano.config.floatX ) )
        self.next_states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ),
                                                           dtype = theano.config.floatX ) )
        self.rewards_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = theano.config.floatX ),
                                             broadcastable = ( False, True ) )
        self.actions_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ),
                                             broadcastable = ( False, True ) )
        self.terminals_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ),
                                               broadcastable = ( False, True ) )
## Get learned Q-values
        q_vals_test = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = True )
        # q_vals_test = theano.gradient.disconnected_grad( q_vals_test )

        q_vals_train = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = False )
        
        if self.freeze_interval > 0:
            target_q_vals = lasagne.layers.get_output( self.next_l_out,
                                                       next_states / input_scale, deterministic = True)
        else:
            target_q_vals = lasagne.layers.get_output( self.l_out,
                                                       next_states / input_scale, deterministic = True)
            target_q_vals = theano.gradient.disconnected_grad( target_q_vals )
## The target depends on the received reward and the discounted future
##   reward stream for the given action in the current state.
        target = ( rewards + ( T.ones_like( terminals ) - terminals ) *
                             self.discount * T.max( target_q_vals, axis = 1, keepdims = True ) )
##  target - b x 1, where b is batch size.
##  q_vals - b x A, where A is the number of outputs of the Q-net
## Theano differentiates indexed (and reduced) arrays in a clever manner:
##  it sets all left out gradients to zero. THIS IS CORRECT!
## \nabla_\theta diff = - 1_{a = a_j} \nabla Q( s, a_j, \theta) \,.
        diff = target - q_vals_train[ T.arange( batch_size ), actions.reshape( ( -1, ) ) ].reshape( ( -1, 1 ) )
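## Illustration (not part of the original snippet): with batch_size = 3,
##   q_vals_train = [[1., 2.], [3., 4.], [5., 6.]] and actions = [[1], [0], [1]],
##   the indexing above picks [2., 3., 6.], and reshape((-1, 1)) turns it back
##   into a b x 1 column so it matches the shape of `target`.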

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)  
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        self._train = theano.function([], loss, updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals_test, givens={states: self.states_shared})
Ejemplo n.º 17
0
	def __init__(self, input_width, input_height, num_actions,
				 num_frames, discount, learning_rate, rho,
				 rms_epsilon, momentum, clip_delta, freeze_interval,
				 batch_size, network_type, update_rule,
				 batch_accumulator, rng, input_scale=255.0):

		self.input_width = input_width
		self.input_height = input_height
		self.num_actions = num_actions
		self.num_frames = num_frames
		self.batch_size = batch_size
		self.discount = discount
		self.rho = rho
		self.lr = learning_rate
		self.rms_epsilon = rms_epsilon
		self.momentum = momentum
		self.clip_delta = clip_delta
		self.freeze_interval = freeze_interval
		self.rng = rng

		self.callback = None

		lasagne.random.set_rng(self.rng) #set the seed

		self.update_counter = 0

		self.l_out = self.build_network(network_type, input_width, input_height,
										num_actions, num_frames, batch_size)

		# 4-dimensional ndarray (similar to prestates in memory_store)
		states = T.tensor4('states')
		# 4-dimensional ndarray (similar to poststates in memory_store)
		next_states = T.tensor4('next_states') 
		rewards = T.col('rewards')
		actions = T.icol('actions')
		terminals = T.icol('terminals')

		# a shared variable is like a global: it is shared between all the compiled functions in which it appears.
		# similar to prestates matrix construction in memory_store
		self.states_shared = theano.shared(
			np.zeros((batch_size, num_frames, input_height, input_width),
					 dtype=theano.config.floatX))

		self.next_states_shared = theano.shared(
			np.zeros((batch_size, num_frames, input_height, input_width),
					 dtype=theano.config.floatX))

		self.rewards_shared = theano.shared(
			np.zeros((batch_size, 1), dtype=theano.config.floatX),
			broadcastable=(False, True))

		self.actions_shared = theano.shared(
			np.zeros((batch_size, 1), dtype='int32'),
			broadcastable=(False, True))

		self.terminals_shared = theano.shared(
			np.zeros((batch_size, 1), dtype='int32'),
			broadcastable=(False, True))

		# compute a symbolic expression for the network's output given its input
		# dividing by input_scale (255) rescales raw pixel intensities into [0, 1]

		q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
		
		next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale)

		next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

		
		# Bellman target: Q(s_t, a_t) = r_t + gamma * max_{a} Q(s_{t+1}, a)
		# (1 - terminals) zeroes the bootstrap term for terminal transitions
		target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True))
		
		# flatten the action column, pick each row's chosen-action Q-value, then reshape back to a column
		diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))
		
		# Huber-style loss: quadratic within clip_delta of zero, linear beyond it, so the gradient stays bounded
		if self.clip_delta > 0: 
			quadratic_part = T.minimum(abs(diff), self.clip_delta)
			linear_part = abs(diff) - quadratic_part
			loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
		else:
			loss = 0.5 * diff ** 2

		loss = T.mean(loss)

		params = lasagne.layers.helper.get_all_params(self.l_out)  
		givens = {
			states: self.states_shared,
			next_states: self.next_states_shared,
			rewards: self.rewards_shared,
			actions: self.actions_shared,
			terminals: self.terminals_shared
		}

		if update_rule == 'rmsprop':
			updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
		elif update_rule == 'sgd':
			# param := param - learning_rate * gradient
			updates = lasagne.updates.sgd(loss, params, self.lr)
		else:
			print "Unrecognized update rule"
			sys.exit(1)

		if self.momentum > 0:
			updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

		#							inputs,outputs
		self._train = theano.function([], [loss, q_vals], updates=updates,
									  givens=givens)
		self._q_vals = theano.function([], q_vals,
									   givens={states: self.states_shared})
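		# Usage sketch (an assumption, not part of the original snippet): a
		# training step would typically copy a sampled minibatch into the
		# shared buffers and then call the compiled function, e.g.
		#   self.states_shared.set_value(states_batch)
		#   self.next_states_shared.set_value(next_states_batch)
		#   self.rewards_shared.set_value(rewards_batch)
		#   self.actions_shared.set_value(actions_batch)
		#   self.terminals_shared.set_value(terminals_batch)
		#   loss, q_vals = self._train()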
Ejemplo n.º 18
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate,momentum,
                 batch_size, ):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.lr = learning_rate
        self.momentum = momentum

        self.update_counter = 0

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        from nn import network
        n, layers = network(n_channels=num_frames,
                            img_size=input_width, n_actions=num_actions)
        self.n = n
        q_vals = n.output(data_layer=states)
        next_q_vals = n.output(data_layer=next_states)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)
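        # The line below additionally clamps the bootstrap values at zero from
        # above, so next-state values are never allowed to be positive.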
        next_q_vals = T.minimum(0, next_q_vals)

        layers_samples = [l.output(data_layer=states) for l in layers]
        layers_batchstd = [T.mean(T.std(s, axis=0)) for s in layers_samples]
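        # The two lists above collect, for monitoring, each layer's activations
        # on the current batch and the mean (over units) of their across-batch
        # standard deviation.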
        w, b = n.weight(), n.bias()
        params = w + b

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        loss = T.mean(diff ** 2)

        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        updates = lasagne.updates.rmsprop(loss, params, self.lr)

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._batchstd = theano.function([], layers_batchstd,
                                         givens={states: self.states_shared})
        self._sample = theano.function([], layers_samples,
                                       givens={states: self.states_shared})
        self._q_vals = theano.function([states], q_vals,)
Ejemplo n.º 19
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, eta,  
                 params_share=True, double_learning=False,
                 annealing=False, temp=1.0, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.eta = eta
        self.params_share = params_share
        self.double_learning = double_learning
        self.annealing = annealing
        self.temp0 = temp

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out, self.l_feature, self.l_init = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)

        if self.freeze_interval > 0:
            self.next_l_out, self.next_l_feature, self.next_l_init = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat_share()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        exp_temp = T.scalar('exploration tuning')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames + 1 (due to
        # overlap) images, along with the chosen action and resulting
        # reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames + 1, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.exp_temp_shared = theano.shared(np.float32(self.temp0))  # default without annealing

        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        feature_vals = lasagne.layers.get_output(self.l_feature, states / input_scale)
        q_params = lasagne.layers.get_all_params(self.l_out)
        q_params_vals = lasagne.layers.get_all_param_values(self.l_out)
        if self.params_share:
            w_pi = q_params[-2]
            b_pi = q_params[-1]
        else:
            params_init = lasagne.layers.get_all_param_values(self.l_init)
            w_pi = theano.shared(params_init[-2])
            b_pi = theano.shared(params_init[-1])

        pi_vals = T.nnet.softmax(exp_temp * (T.dot(feature_vals, w_pi) + b_pi))
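        # pi_vals above is a Boltzmann (softmax) policy over the feature head:
        # pi(a | s) is proportional to exp(exp_temp * (phi(s).w_pi + b_pi)), so a
        # larger exp_temp makes the policy greedier and exp_temp -> 0 makes it uniform.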
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
            if self.double_learning:
                next_feature_vals = lasagne.layers.get_output(self.l_feature,
                                                    next_states / input_scale)
                next_q_params = lasagne.layers.get_all_params(self.l_out)
                next_q_params_vals = lasagne.layers.get_all_param_values(self.l_out)
                if self.params_share:
                    next_w_pi = next_q_params[-2]
                    next_b_pi = next_q_params[-1]
                else:
                    next_params_init = lasagne.layers.get_all_param_values(self.l_init)
                    next_w_pi = theano.shared(next_params_init[-2])
                    next_b_pi = theano.shared(next_params_init[-1])
                next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi))
                next_pi_vals = theano.gradient.disconnected_grad(next_pi_vals)
            else:
                next_feature_vals = lasagne.layers.get_output(self.next_l_feature,
                                                    next_states / input_scale)
                next_q_params = lasagne.layers.get_all_params(self.next_l_out)
                next_q_params_vals = lasagne.layers.get_all_param_values(self.next_l_out)
                if self.params_share:
                    next_w_pi = next_q_params[-2]
                    next_b_pi = next_q_params[-1]
                else:
                    next_params_init = lasagne.layers.get_all_param_values(self.next_l_init)
                    next_w_pi = theano.shared(next_params_init[-2])
                    next_b_pi = theano.shared(next_params_init[-1])

                next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi))
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards + (T.ones_like(terminalsX) - terminalsX) *
                 self.discount * T.sum(next_q_vals * next_pi_vals, axis=1, keepdims=True))
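        # Unlike the max-backup of Q-learning, this target bootstraps on the
        # expectation of the next Q-values under the softmax policy:
        #   y_i = r_i + (1 - terminal_i) * gamma * sum_a pi(a | s'_i) * Q(s'_i, a)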
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        if self.params_share:
            params = lasagne.layers.helper.get_all_params(self.l_out)
        else:
            params = lasagne.layers.helper.get_all_params(self.l_out)
            params.append(next_w_pi)
            params.append(next_b_pi)

        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared,
            exp_temp: self.exp_temp_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width))
        }

        pi_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width)),
            exp_temp: self.exp_temp_shared
        }

        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
        self._pi_vals = theano.function([], pi_vals[0], givens=pi_givens)

        grad_fc_w = T.grad(loss, self.l_out.W)
        self._grad = theano.function([], outputs=grad_fc_w,
                                     givens=train_givens)
Ejemplo n.º 20
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 use_double, batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.use_double = use_double
        self.rng = rng

        # Using Double DQN is pointless without periodic freezing
        if self.use_double:
            assert self.freeze_interval > 0
            # pass

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        
        if self.freeze_interval > 0:
            # Nature. If using periodic freezing
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            # NIPS
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        if self.use_double:
            # Double DQN: the online network selects the greedy next action, the frozen target network evaluates it
            next_q_vals_online = lasagne.layers.get_output(self.l_out, next_states / input_scale)
            maxaction = T.argmax(next_q_vals_online, axis=1)
            temptargets = next_q_vals[T.arange(batch_size), maxaction].reshape((-1, 1))
            target = (rewards +
                      (T.ones_like(terminals) - terminals) *
                      self.discount * temptargets)
        else:
            target = (rewards +
                      (T.ones_like(terminals) - terminals) *
                      self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))
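        # The double branch above decouples greedy-action selection (online
        # network) from evaluation (frozen target network); the standard branch
        # takes max_a over next_q_vals, so a single network both selects and
        # evaluates, which tends to overestimate values.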

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)  
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        def inspect_inputs(i, node, fn):
            if ('maxand' not in str(node).lower() and '12345' not in str(node)):
                return
            print i, node, "input(s) value(s):", [input[0] for input in fn.inputs],
            raw_input('press enter')

        def inspect_outputs(i, node, fn):
            if ('maxand' not in str(node).lower() and '12345' not in str(node)):
                return
            if '12345' in str(node):
                print "output(s) value(s):", [np.asarray(output[0]) for output in fn.outputs]
            else:
                print "output(s) value(s):", [output[0] for output in fn.outputs]
            raw_input('press enter')

        if False:
            self._train = theano.function([], [loss, q_vals], updates=updates,
                                          givens=givens, mode=theano.compile.MonitorMode(
                            pre_func=inspect_inputs,
                            post_func=inspect_outputs))
            theano.printing.debugprint(target)
        else:
            self._train = theano.function([], [loss, q_vals], updates=updates,
                                          givens=givens)
        if False:
            self._q_vals = theano.function([], q_vals,
                                           givens={states: self.states_shared}, mode=theano.compile.MonitorMode(
                            pre_func=inspect_inputs,
                            post_func=inspect_outputs))
        else:
            self._q_vals = theano.function([], q_vals,
                                           givens={states: self.states_shared})
Ejemplo n.º 21
0
rewards_shared = theano.shared(np.zeros((mini_batch_size, 1),
                                        dtype=theano.config.floatX),
                               broadcastable=(False, True))

actions_shared = theano.shared(np.zeros((mini_batch_size, 1), dtype='int32'),
                               broadcastable=(False, True))

terminals_shared = theano.shared(np.zeros((mini_batch_size, 1), dtype='int32'),
                                 broadcastable=(False, True))
# 4-dimensional ndarray (similar to prestates in memory_store)
states = T.tensor4('states')
# 4-dimensional ndarray (similar to poststates in memory_store)
post_states = T.tensor4('post_states')
rewards = T.col('rewards')
actions = T.icol('actions')
terminals = T.icol('terminals')

givens = {
    states: states_shared,
    post_states: post_states_shared,
    rewards: rewards_shared,
    actions: actions_shared,
    terminals: terminals_shared
}
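# Each `givens` entry substitutes a symbolic variable with its shared buffer
# when the compiled function is called, so the training function itself can be
# compiled with an empty input list.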


def build_net():
    """Build deeep q network, exactly as described in the deep mind paper  """
    input_layer = lasagne.layers.InputLayer(shape=(mini_batch_size,
                                                   history_length,
Ejemplo n.º 22
0
    def __init__(self, input_width, input_height, n_actions, discount, learn_rate, batch_size, rng):

        self.input_width = input_width
        self.input_height = input_height
        self.n_actions = n_actions
        self.discount = discount
        self.lr = learn_rate
        self.batch_size = batch_size
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.l_out = self.build_network(batch_size, input_width, input_height, n_actions)

        states = t.tensor4("states")
        next_states = t.tensor4("next_states")
        rewards = t.col("rewards")
        actions = t.icol("actions")
        terminals = t.icol("terminals")

        self.states_shared = theano.shared(
            np.zeros((batch_size, 1, input_height, input_width), dtype=theano.config.floatX)
        )

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, 1, input_height, input_width), dtype=theano.config.floatX)
        )

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)
        )

        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True))

        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states)

        next_q_vals = lasagne.layers.get_output(self.l_out, next_states)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + (t.ones_like(terminals) - terminals) * self.discount * t.max(
            next_q_vals, axis=1, keepdims=True
        )
        diff = target - q_vals[t.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))

        loss = t.sum(0.5 * diff ** 2)

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared,
        }

        updates = lasagne.updates.sgd(loss, params, self.lr)

        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
Ejemplo n.º 23
0
    def __init__(self,
                 batchSize,
                 numFrames,
                 inputHeight,
                 inputWidth,
                 numActions,
                 discountRate,
                 learningRate,
                 rho,
                 rms_epsilon,
                 momentum,
                 networkUpdateDelay,
                 useSARSAUpdate,
                 kReturnLength,
                 networkType="conv",
                 updateRule="deepmind_rmsprop",
                 batchAccumulator="sum",
                 clipDelta=1.0,
                 inputScale=255.0):

        self.batchSize = batchSize
        self.numFrames = numFrames
        self.inputWidth = inputWidth
        self.inputHeight = inputHeight
        self.inputScale = inputScale
        self.numActions = numActions
        self.discountRate = discountRate
        self.learningRate = learningRate
        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate = useSARSAUpdate
        self.kReturnLength = kReturnLength
        self.networkType = networkType
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.clipDelta = clipDelta
        self.updateCounter = 0

        states = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards = T.col("rewards")
        actions = T.icol("actions")
        nextActions = T.icol("nextActions")
        terminals = T.icol("terminals")

        self.statesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.nextStatesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.rewardsShared = theano.shared(np.zeros(
            (self.batchSize, 1), dtype=theano.config.floatX),
                                           broadcastable=(False, True))
        self.actionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                    dtype='int32'),
                                           broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                        dtype='int32'),
                                               broadcastable=(False, True))
        self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                      dtype='int32'),
                                             broadcastable=(False, True))

        self.qValueNetwork = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth,
            self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork,
                                            states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight,
                self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(
                self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(
                self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)

        if self.useSARSAUpdate:
            target = rewards + terminals * (
                self.discountRate**
                self.kReturnLength) * nextQValues[T.arange(self.batchSize),
                                                  nextActions.reshape(
                                                      (-1, ))].reshape((-1, 1))
        else:
            target = rewards + terminals * (
                self.discountRate**self.kReturnLength) * T.max(
                    nextQValues, axis=1, keepdims=True)
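        # With useSARSAUpdate the bootstrap uses the Q-value of the action that
        # was actually taken next (on-policy SARSA); otherwise it uses
        # max_a Q(nextState, a) (off-policy Q-learning). Both are discounted by
        # discountRate ** kReturnLength, i.e. a k-step return.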

        targetDifference = target - qValues[T.arange(self.batchSize),
                                            actions.reshape((-1, ))].reshape(
                                                (-1, 1))

        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")

        networkParameters = lasagne.layers.helper.get_all_params(
            self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters,
                                                    self.learningRate,
                                                    self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters,
                                              self.learningRate, self.rho,
                                              self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters,
                                          self.learningRate)
        else:
            raise ValueError(
                "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards: self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues],
                                              updates=updates,
                                              givens=lossGivens,
                                              on_unused_input='warn')
        self.__computeQValues = theano.function(
            [], qValues, givens={states: self.statesShared})
Ejemplo n.º 24
0
    def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound):

        super(DeepRLNet3, self).__init__(n_in, n_out, state_bounds,
                                         action_bounds, reward_bound)

        batch_size = 32
        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, self._state_length)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size,
                                                    self._state_length)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))
        # create a small convolutional neural network
        inputLayerA = lasagne.layers.InputLayer((None, self._state_length),
                                                State)

        l_hid1A = lasagne.layers.DenseLayer(
            inputLayerA,
            num_units=256,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        l_hid2A = lasagne.layers.DenseLayer(
            l_hid1A,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3A = lasagne.layers.DenseLayer(
            l_hid2A,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outA = lasagne.layers.DenseLayer(
            l_hid3A,
            num_units=n_out,
            nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))

        # self.updateTargetModel()
        inputLayerB = lasagne.layers.InputLayer((None, self._state_length),
                                                State)

        l_hid1B = lasagne.layers.DenseLayer(
            inputLayerB,
            num_units=256,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2B = lasagne.layers.DenseLayer(
            l_hid1B,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3B = lasagne.layers.DenseLayer(
            l_hid2B,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outB = lasagne.layers.DenseLayer(
            l_hid3B,
            num_units=n_out,
            nonlinearity=lasagne.nonlinearities.linear)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.0002
        self._discount_factor = 0.8
        self._rho = 0.95
        self._rms_epsilon = 0.001

        self._weight_update_steps = 8000
        self._updates = 0

        self._states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                      dtype='int32'),
                                             broadcastable=(False, True))

        self._q_valsA = lasagne.layers.get_output(self._l_outA, State)
        self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState)

        self._q_func = self._q_valsA[T.arange(batch_size),
                                     Action.reshape((-1, ))].reshape((-1, 1))

        target = (
            Reward +
            #(T.ones_like(terminals) - terminals) *
            self._discount_factor *
            T.max(self._q_valsB, axis=1, keepdims=True))
        diff = target - self._q_valsA[
            T.arange(batch_size), Action.reshape((-1, ))].reshape(
                (-1,
                 1))  # Does some fancy indexing to get the column of interest

        loss = 0.5 * diff**2 + (
            1e-6 * lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2))
        loss = T.mean(loss)
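        # The objective is the mean squared TD error plus a small (1e-6) L2
        # penalty on the network weights.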

        params = lasagne.layers.helper.get_all_params(self._l_outA)

        givens = {
            State: self._states_shared,
            ResultState: self._next_states_shared,
            Reward: self._rewards_shared,
            Action: self._actions_shared,
        }

        # SGD update

        updates = lasagne.updates.rmsprop(loss, params, self._learning_rate,
                                          self._rho, self._rms_epsilon)
        # TD update
        # updates = lasagne.updates.rmsprop(T.mean(self._q_func) + (1e-5 * lasagne.regularization.regularize_network_params(
        # self._l_outA, lasagne.regularization.l2)), params,
        #              self._learning_rate * -T.mean(diff), self._rho, self._rms_epsilon)

        self._train = theano.function([], [loss, self._q_valsA],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       self._q_valsA,
                                       givens={State: self._states_shared})

        self._bellman_error = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=diff,
            allow_input_downcast=True)
Ejemplo n.º 25
0
    def __init__(self, input, n_in, n_out):

        hidden_size = 36
        batch_size = 32
        self._w_h = init_weights((n_in, hidden_size))
        self._b_h = init_b_weights((1, hidden_size))
        # self._b_h = init_b_weights((hidden_size,))
        self._w_h2 = init_weights((hidden_size, hidden_size))
        self._b_h2 = init_b_weights((1, hidden_size))
        # self._b_h2 = init_b_weights((hidden_size,))
        # self._w_o = init_tanh(hidden_size, n_out)
        self._w_o = init_weights((hidden_size, n_out))
        self._b_o = init_b_weights((1, n_out))
        # self._b_o = init_b_weights((n_out,))

        self.updateTargetModel()
        self._w_h_old = init_weights((n_in, hidden_size))
        self._w_h2_old = init_weights((hidden_size, hidden_size))
        self._w_o_old = init_tanh(hidden_size, n_out)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.00025
        self._discount_factor = 0.99

        self._weight_update_steps = 5000
        self._updates = 0

        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, 2)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, 2)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))
        # Q_val = T.fmatrix()

        # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1)))
        # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True)
        _py_xA = self.model(State, self._w_h, self._b_h, self._w_h2,
                            self._b_h2, self._w_o, self._b_o, 0.0, 0.0)
        _py_xB = self.model(State, self._w_h_old, self._b_h_old,
                            self._w_h2_old, self._b_h2_old, self._w_o_old,
                            self._b_o_old, 0.0, 0.0)
        self._y_predA = T.argmax(_py_xA, axis=1)
        self._y_predB = T.argmax(_py_xB, axis=1)
        self._q_funcA = T.mean(
            (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2,
                        self._w_o, self._b_o, 0.0,
                        0.0))[T.arange(batch_size),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        self._q_funcB = T.mean(
            (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old,
                        self._b_h2_old, self._w_o_old, self._b_o_old, 0.0,
                        0.0))[T.arange(batch_size),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        # q_val = py_x
        # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self._L1_A = (abs(self._w_h).sum() + abs(self._w_h2).sum() +
                      abs(self._w_o).sum())
        self._L1_B = (abs(self._w_h_old).sum() + abs(self._w_h2_old).sum() +
                      abs(self._w_o_old).sum())
        self._L1_reg = 0.0
        self._L2_reg = 0.001
        # L2 norm ; one regularization option is to enforce
        # L2 norm to be small
        self._L2_A = ((self._w_h**2).sum() + (self._w_h2**2).sum() +
                      (self._w_o**2).sum())
        self._L2_B = ((self._w_h_old**2).sum() + (self._w_h2_old**2).sum() +
                      (self._w_o_old**2).sum())

        # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
        # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State))
        deltaA = ((Reward + (self._discount_factor * T.max(self.model(
            ResultState, self._w_h_old, self._b_h_old, self._w_h2_old,
            self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5),
                                                           axis=1,
                                                           keepdims=True))) -
                  (self.model(State, self._w_h, self._b_h, self._w_h2,
                              self._b_h2, self._w_o, self._b_o, 0.2,
                              0.5))[T.arange(Action.shape[0]),
                                    Action.reshape((-1, ))].reshape((-1, 1)))
        deltaB = (
            (Reward +
             (self._discount_factor *
              T.max(self.model(ResultState, self._w_h, self._b_h, self._w_h2,
                               self._b_h2, self._w_o, self._b_o, 0.2, 0.5),
                    axis=1,
                    keepdims=True))) -
            (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old,
                        self._b_h2_old, self._w_o_old, self._b_o_old, 0.2,
                        0.5))[T.arange(Action.shape[0]),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        # bellman_cost = T.mean( 0.5 * ((delta) ** 2 ))
        bellman_costA = T.mean(0.5 * ((deltaA)**2)) + (
            self._L2_reg * self._L2_A) + (self._L1_reg * self._L1_A)
        bellman_costB = T.mean(0.5 * ((deltaB)**2)) + (
            self._L2_reg * self._L2_B) + (self._L1_reg * self._L1_B)
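        # Double-estimator TD errors: deltaA bootstraps on the max over the
        # "old" (B) weights while evaluating the taken action with the current
        # (A) weights, and deltaB does the reverse, so each set of weights uses
        # the other as the source of its bootstrap target.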

        paramsA = [
            self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o
        ]
        paramsB = [
            self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old,
            self._w_o_old, self._b_o_old
        ]
        # updates = sgd(bellman_cost, params, lr=self._learning_rate)
        updatesA = rlTDSGD(self._q_funcA,
                           T.mean(deltaA),
                           paramsA,
                           lr=self._learning_rate)
        updatesB = rlTDSGD(self._q_funcB,
                           T.mean(deltaB),
                           paramsB,
                           lr=self._learning_rate)
        # updates = RMSprop(bellman_cost, params, lr=self._learning_rate)
        # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01)
        # updatesA = lasagne.updates.rmsprop(self._q_funcA, paramsA, self._learning_rate * -T.mean(deltaA), 0.95, 0.01)
        # updatesB = lasagne.updates.rmsprop(self._q_funcB, paramsB, self._learning_rate * -T.mean(deltaB), 0.95, 0.01)

        self._trainA = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_costA,
            updates=updatesA,
            allow_input_downcast=True)
        self._trainB = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_costB,
            updates=updatesB,
            allow_input_downcast=True)
        self._bellman_errorA = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=deltaA,
            allow_input_downcast=True)
        self._bellman_errorB = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=deltaB,
            allow_input_downcast=True)
        self._q_valuesA = theano.function(inputs=[State],
                                          outputs=_py_xA,
                                          allow_input_downcast=True)
        self._q_valuesB = theano.function(inputs=[State],
                                          outputs=_py_xB,
                                          allow_input_downcast=True)
        self._py_xA = theano.function(inputs=[State],
                                      outputs=_py_xA,
                                      allow_input_downcast=True)
        self._py_xB = theano.function(inputs=[State],
                                      outputs=_py_xB,
                                      allow_input_downcast=True)

        x, y = T.matrices('x', 'y')
        z_lazy = ifelse(T.gt(T.max(x, axis=1)[0],
                             T.max(y, axis=1)[0]), T.argmax(x, axis=1),
                        T.argmax(y, axis=1))
        self._f_lazyifelse = theano.function([x, y],
                                             z_lazy,
                                             mode=theano.Mode(linker='vm'))
Ejemplo n.º 26
0
    l_in = lasagne.layers.InputLayer(
        shape=(None, num_frames, input_width, input_height)
    )

    l_conv = conv_layer(
        l_in,
        num_filters=16,
        filter_size=(8,8),
        stride=(4,4),
    )
    return l_conv

l_out = build_network()

rewards = T.col('rewards')
actions = T.icol('actions')
terminals = T.icol('terminals')

rewards_shared = theano.shared(
    np.zeros((batch_size, 1), dtype=theano.config.floatX),
    broadcastable=(False, True), name='rewards')

actions_shared = theano.shared(
    np.zeros((batch_size, 1), dtype='int32'),
    broadcastable=(False, True), name='actions')

givens = {
    rewards: rewards_shared,
    actions: actions_shared,
}
Ejemplo n.º 27
0
    def __init__(self, input_width, input_height, avail_actions, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, train_all, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.avail_actions = avail_actions
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.train_all = train_all

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        print "num_actions: " + str(num_actions)
        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
Ejemplo n.º 28
0
    def __init__(self, input_width, input_height, output_dim, num_frames,
                 batch_size):
        self.input_width = input_width
        self.input_height = input_height
        self.output_dim = output_dim
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.gamma = 0.95  # discount factor
        self.rho = 0.99
        self.lr = 0.00020  # learning rate
        self.momentum = 0.0
        self.freeze_targets = False

        self.l_out = self.build_small_network(input_width, input_height,
                                              output_dim, num_frames,
                                              batch_size)
        if self.freeze_targets:
            self.next_l_out = self.build_small_network(input_width,
                                                       input_height,
                                                       output_dim, num_frames,
                                                       batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        #        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        #        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))

        q_vals = self.l_out.get_output(states / 255.0)
        if self.freeze_targets:
            next_q_vals = self.next_l_out.get_output(next_states / 255.0)
        else:
            next_q_vals = self.l_out.get_output(next_states / 255.0)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + self.gamma * T.max(
            next_q_vals, axis=1, keepdims=True)
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))
        loss = T.mean(diff**2)

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            #            terminals: self.terminals_shared
        }
        if self.momentum > 0:
            updates = rmsprop_nesterov(loss, params, self.lr, self.rho,
                                       self.momentum, 1e-2)
        else:
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              1e-6)
        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
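reset_q_hat is not shown in this example; when freeze_targets is enabled it presumably copies the online network's weights into next_l_out. A minimal sketch of such a copy with the standard Lasagne helpers (the attribute names are taken from the code above, the method body itself is an assumption):

import lasagne

def reset_q_hat(self):
    # snapshot the online network's parameter values ...
    all_params = lasagne.layers.get_all_param_values(self.l_out)
    # ... and load them into the frozen target network
    lasagne.layers.set_all_param_values(self.next_l_out, all_params)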
Ejemplo n.º 29
0
def test_gi_stack(hyper_params=None, sup_count=600, rng_seed=1234):
    assert(not (hyper_params is None))
    # Initialize a source of randomness
    rng = np.random.RandomState(rng_seed)

    # Load some data to train/validate/test with
    dataset = 'data/mnist.pkl.gz'
    datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False)
    Xtr_su = datasets[0][0].get_value(borrow=False)
    Ytr_su = datasets[0][1].get_value(borrow=False)
    Xtr_un = datasets[1][0].get_value(borrow=False)
    Ytr_un = datasets[1][1].get_value(borrow=False)
    # get the unlabeled data
    Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX)
    Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]).astype(np.int32)
    Ytr_un = 0 * Ytr_un
    # get the labeled data
    Xtr_su = Xtr_su.astype(theano.config.floatX)
    Ytr_su = Ytr_su[:,np.newaxis].astype(np.int32)
    # get observations and labels for the validation set
    Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX)
    Yva = datasets[2][1].get_value(borrow=False).astype(np.int32)
    Yva = Yva[:,np.newaxis] # numpy is dumb
    # get size information for the data
    un_samples = Xtr_un.shape[0]
    su_samples = Xtr_su.shape[0]
    va_samples = Xva.shape[0]

    # Construct a GenNet and an InfNet, then test constructor for GIPair.
    # Do basic testing, to make sure classes aren't completely broken.
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    Yd = T.icol('Yd_base')
    data_dim = Xtr_un.shape[1]
    label_dim = 10
    prior_dim = 50
    prior_sigma = 1.0
    batch_size = 150
    # Choose some parameters for the generator network
    gn_params = {}
    gn_config = [prior_dim, 600, 600, data_dim]
    gn_params['mlp_config'] = gn_config
    gn_params['activation'] = softplus_actfun
    gn_params['lam_l2a'] = 1e-3
    gn_params['vis_drop'] = 0.0
    gn_params['hid_drop'] = 0.0
    gn_params['bias_noise'] = 0.1
    # choose some parameters for the continuous inferencer
    in_params = {}
    shared_config = [data_dim, 600, 600]
    top_config = [shared_config[-1], prior_dim]
    in_params['shared_config'] = shared_config
    in_params['mu_config'] = top_config
    in_params['sigma_config'] = top_config
    in_params['activation'] = softplus_actfun
    in_params['init_scale'] = 2.0
    in_params['lam_l2a'] = 1e-3
    in_params['vis_drop'] = 0.0
    in_params['hid_drop'] = 0.0
    in_params['bias_noise'] = 0.1
    in_params['input_noise'] = 0.1
    # choose some parameters for the categorical inferencer
    pn_params = {}
    pc0 = [prior_dim, 800, 800, label_dim]
    pn_params['proto_configs'] = [pc0]
    # Set up some spawn networks
    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
    pn_params['spawn_configs'] = [sc0, sc1]
    pn_params['spawn_weights'] = [0.5, 0.5]
    # Set remaining params
    pn_params['activation'] = relu_actfun
    pn_params['init_scale'] = 2.0
    pn_params['ear_type'] = 6
    pn_params['lam_l2a'] = 1e-3
    pn_params['vis_drop'] = 0.0
    pn_params['hid_drop'] = 0.5

    # Initialize the base networks for this GIPair
    GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \
            params=gn_params, shared_param_dicts=None)
    IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \
            params=in_params, shared_param_dicts=None)
    PN = PeaNet(rng=rng, Xd=Xd, params=pn_params)
    # Initialize biases in GN, IN, and PN
    GN.init_biases(0.0)
    IN.init_biases(0.0)
    PN.init_biases(0.1)
    # Initialize the GIStack
    GIS = GIStack(rng=rng, \
            Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \
            g_net=GN, i_net=IN, p_net=PN, \
            data_dim=data_dim, prior_dim=prior_dim, \
            label_dim=label_dim, batch_size=batch_size, \
            params=None, shared_param_dicts=None)
    # set weighting parameters for the various costs...
    GIS.set_lam_nll(1.0)
    GIS.set_lam_kld(1.0)
    GIS.set_lam_cat(0.0)
    GIS.set_lam_pea(0.0)
    GIS.set_lam_ent(0.0)

    # Set initial learning rate and basic SGD hyper parameters
    num_updates = hyper_params['num_updates']
    learn_rate = hyper_params['learn_rate']
    lam_pea = hyper_params['lam_pea']
    lam_cat = hyper_params['lam_cat']
    lam_ent = hyper_params['lam_ent']
    lam_l2w = hyper_params['lam_l2w']
    out_name = hyper_params['out_name']

    out_file = open(out_name, 'wb')
    out_file.write("**TODO: More informative output, and maybe a real log**\n")
    out_file.write("sup_count: {0:d}\n".format(sup_count))
    out_file.write("learn_rate: {0:.4f}\n".format(learn_rate))
    out_file.write("lam_pea: {0:.4f}\n".format(lam_pea))
    out_file.write("lam_cat: {0:.4f}\n".format(lam_cat))
    out_file.write("lam_ent: {0:.4f}\n".format(lam_ent))
    out_file.write("lam_l2w: {0:.4f}\n".format(lam_l2w))
    out_file.flush()

    GIS.set_lam_l2w(lam_l2w)
    GIS.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98)
    for i in range(num_updates):
        if (i < 100000):
            # start with some updates only for the VAE (InfNet and GenNet)
            scale = float(min(i+1, 50000)) / 50000.0
            lam_cat = 0.0
            lam_pea = 0.0
            lam_ent = 0.0
            learn_rate_pn = 0.0
        else:
            # move on to updates that include loss from the PeaNet
            scale = 1.0
            lam_cat = hyper_params['lam_cat']
            lam_pea = hyper_params['lam_pea']
            if i < 150000:
                lam_ent = float(i - 99999) * hyper_params['lam_ent']
            else:
                lam_ent = hyper_params['lam_ent']
            learn_rate_pn = learn_rate
        if (((i + 1) % 100000) == 0):
            learn_rate = learn_rate * 0.7
        # do a minibatch update using unlabeled data
        if True:
            # get some data to train with
            un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,))
            Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0))
            Yd_un = Ytr_un.take(un_idx, axis=0)
            Xc_un = 0.0 * Xd_un
            Xm_un = 0.0 * Xd_un
            # do a minibatch update of the model, and compute some costs
            GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98)
            GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98)
            GIS.set_lam_nll(1.0)
            GIS.set_lam_kld(0.01 + (0.99*scale))
            GIS.set_lam_cat(0.0)
            GIS.set_lam_pea(lam_pea)
            GIS.set_lam_ent(lam_ent)
            outputs = GIS.train_joint(Xd_un, Xc_un, Xm_un, Yd_un)
            joint_cost = 1.0 * outputs[0]
            data_nll_cost = 1.0 * outputs[1]
            post_kld_cost = 1.0 * outputs[2]
            post_cat_cost = 1.0 * outputs[3]
            post_pea_cost = 1.0 * outputs[4]
            post_ent_cost = 1.0 * outputs[5]
            other_reg_cost = 1.0 * outputs[6]
        # do another minibatch update incorporating label information
        if (i >= 100000):
            # get some data to train with
            su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,))
            Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0))
            Yd_su = Ytr_su.take(su_idx, axis=0)
            Xc_su = 0.0 * Xd_su
            Xm_su = 0.0 * Xd_su
            # update only based on the label-based classification cost
            GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98)
            GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98)
            GIS.set_lam_nll(0.0)
            GIS.set_lam_kld(0.0)
            GIS.set_lam_cat(lam_cat)
            GIS.set_lam_pea(lam_pea)
            GIS.set_lam_ent(0.0)
            outputs = GIS.train_joint(Xd_su, Xc_su, Xm_su, Yd_su)
            post_cat_cost = 1.0 * outputs[3]
        assert(not (np.isnan(joint_cost)))
        if ((i % 500) == 0):
            o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \
                    i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, other_reg_cost)
            print(o_str)
            out_file.write("{}\n".format(o_str))
            if ((i % 1000) == 0):
                # check classification error on training and validation set
                train_err = GIS.classification_error(Xtr_su, Ytr_su)
                va_err = GIS.classification_error(Xva, Yva)
                o_str = "    tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err)
                print(o_str)
                out_file.write("{}\n".format(o_str))
            out_file.flush()
        if ((i % 5000) == 0):
            file_name = "GIS_SAMPLES_b{0:d}.png".format(i)
            va_idx = npr.randint(low=0,high=va_samples,size=(5,))
            Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])])
            Xd_samps = np.repeat(Xd_samps, 3, axis=0)
            sample_lists = GIS.sample_gis_from_data(Xd_samps, loop_iters=10)
            Xs = np.vstack(sample_lists["data samples"])
            Ys = GIS.class_probs(Xs)
            Xs = mnist_prob_embed(Xs, Ys)
            utils.visualize_samples(Xs, file_name)
    print("TESTING COMPLETE!")
    out_file.close()
    return
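The schedule buried in the loop above is easier to read as standalone functions of the update index. A compact restatement using the same thresholds (50000-step warm-up ramp, PeaNet costs switched on at update 100000, entropy weight ramped until 150000); this only rewords the loop logic, it is not new behaviour:

def warmup_scale(i, ramp_steps=50000):
    # learning-rate / KLd scale ramps linearly from ~0 to 1 over the first ramp_steps updates
    return float(min(i + 1, ramp_steps)) / ramp_steps

def ent_weight(i, lam_ent, start=100000, ramp_end=150000):
    # entropy cost is off during the VAE-only phase, then ramps in linearly
    if i < start:
        return 0.0
    if i < ramp_end:
        return float(i - (start - 1)) * lam_ent
    return lam_ent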
Ejemplo n.º 30
0
    def initialize_network(self):
        """
        :description: this method initializes the network, updates, and theano functions for training and 
            retrieving q values. Here's an outline: 

            1. build the q network and target q network
            2. initialize theano symbolic variables used for compiling functions
            3. initialize the theano numeric variables used as input to functions
            4. formulate the symbolic loss 
            5. formulate the symbolic updates 
            6. compile theano functions for training and for getting q_values
        """
        batch_size, input_shape = self.batch_size, self.input_shape
        lasagne.random.set_rng(self.rng)

        # 1. build the q network and target q network
        self.l_out = self.build_network(input_shape, self.num_actions, batch_size)
        self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size)
        self.reset_target_network()

        # 2. initialize theano symbolic variables used for compiling functions
        states = T.tensor4('states')
        actions = T.icol('actions')
        rewards = T.col('rewards')
        next_states = T.tensor4('next_states')
        # terminals are used to indicate a terminal state in the episode and hence a mask over the future
        # q values i.e., Q(s',a')
        terminals = T.icol('terminals')

        # 3. initialize the theano numeric variables used as input to functions
        self.states_shape = (batch_size,) + (1,) + input_shape
        self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 
            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # 4. formulate the symbolic loss 
        q_vals = lasagne.layers.get_output(self.l_out, states)
        next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)
        target = (rewards +
                 (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        # reshape((-1,)) == 'make a row vector', reshape((-1, 1)) == 'make a column vector'
        diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))


        # a lot of the deepmind work clips the td error at 1 so we do that here
        # the problem is that gradient backpropagating through this minimum node
        # will be zero if diff is larger than 1.0 (because changing params before
        # the minimum does not impact the output of the minimum). To account for 
        # this we take the part of the td error (magnitude) greater than 1.0 and simply
        # add it to the loss, which allows gradient to backprop but just linearly
        # in the td error rather than quadratically
        quadratic_part = T.minimum(abs(diff), 1.0)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + linear_part
        loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2)

        # 5. formulate the symbolic updates 
        params = lasagne.layers.helper.get_all_params(self.l_out)  
        updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate)

        # 6. compile theano functions for training and for getting q_values
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared})
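The reshape comment in step 4 is easiest to see with a plain NumPy analogue of the q_vals indexing: given a (batch_size, num_actions) table and a column of chosen actions, pick Q(s_i, a_i) per row and shape the result back into a column. The values below are made up for illustration:

import numpy as np

batch_size, num_actions = 3, 4
q_vals = np.arange(batch_size * num_actions, dtype=float).reshape(batch_size, num_actions)
actions = np.array([[2], [0], [3]], dtype='int32')   # (batch_size, 1), as in actions_shared

chosen = q_vals[np.arange(batch_size), actions.reshape(-1)].reshape(-1, 1)
print(chosen)   # [[ 2.] [ 4.] [11.]]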
Ejemplo n.º 31
0
    def __init__(self, env, args, rng, name = "DQNLasagne"):
        """ Initializes a network based on the Lasagne Theano framework.

        Args:
            env (AtariEnv): The environment in which the agent actuates.
            args (argparse.Namespace): All settings either with a default value or set via command line arguments.
            rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator.
            name (str): The name of the network object.

        Note:
            This function should always call the base class first to initialize
            the common values for the networks.
        """
        _logger.info("Initialize object of type " + str(type(self).__name__))
        super(DQNLasagne, self).__init__(env, args, rng, name)
        self.input_shape = (self.batch_size, self.sequence_length, args.frame_width, args.frame_height)
        self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8)
        lasagne.random.set_rng(self.rng)

        self.network = self._create_layer()

        # TODO: Load weights from pretrained network?!
        if self.args.load_weights is not None:
            self.load_weights(self.args.load_weights)

        if self.target_update_frequency > 0:
            self.target_network = self._create_layer()
            self._copy_theta()

        states = T.tensor4('states')
        followup_states = T.tensor4('followup_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
                np.zeros(self.input_shape, dtype=theano.config.floatX)
        )
        self.followup_states_shared = theano.shared(
                np.zeros(self.input_shape, dtype=theano.config.floatX)
        )
        self.rewards_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
                broadcastable=(False, True)
        )
        self.actions_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype='int32'),
                broadcastable=(False, True)
        )
        self.terminals_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype='int32'),
                broadcastable=(False, True)
        )

        qvalues = lasagne.layers.get_output(
                self.network,
                self._prepare_network_input(states)
        )

        if self.target_update_frequency > 0:
            qvalues_followup_states = lasagne.layers.get_output(
                    self.target_network,
                    self._prepare_network_input(followup_states)
            )
        else:
            qvalues_followup_states = lasagne.layers.get_output(
                    self.network,
                    self._prepare_network_input(followup_states)
            )
            qvalues_followup_states = theano.gradient.disconnected_grad(qvalues_followup_states)

        targets = (rewards +
                (T.ones_like(terminals) - terminals) *
                self.discount_rate *
                T.max(qvalues_followup_states, axis=1, keepdims=True)
        )
        errors = targets - qvalues[
                T.arange(self.batch_size),
                actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_error > 0:
            quadratic_part = T.minimum(abs(errors), self.clip_error)
            linear_part = abs(errors) - quadratic_part
            cost_function = T.sum(0.5 * quadratic_part ** 2 + self.clip_error * linear_part)
        else:
            cost_function = T.sum(0.5 * errors ** 2)

        self.params = lasagne.layers.helper.get_all_params(self.network)
        self.observations = {
            states: self.states_shared,
            followup_states: self.followup_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        self._set_optimizer(cost_function)

        if self.momentum > 0:
            self.optimizer = lasagne.updates.apply_momentum(
                    self.optimizer,
                    None,
                    self.momentum
            )
        _logger.debug("Compiling _theano_train")
        self._theano_train = theano.function(
                [],
                [cost_function, qvalues],
                updates=self.optimizer,
                givens=self.observations)
        _logger.debug("Compiling _theano_get_Q")
        self._theano_get_Q = theano.function(
                [],
                qvalues,
                givens={states: self.states_shared})

        self.callback = None
        _logger.debug("%s" % self)
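A usage sketch for the compiled Q-value function, assuming an instance of this class called agent; _theano_get_Q takes no arguments and evaluates the network on whatever is currently stored in states_shared (the zero batch below is a stand-in for real frames):

import numpy as np
import theano

batch = np.zeros(agent.input_shape, dtype=theano.config.floatX)
agent.states_shared.set_value(batch)

q = agent._theano_get_Q()              # shape: (batch_size, number of actions)
greedy_action = int(np.argmax(q[0]))   # greedy action for the first sample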
Ejemplo n.º 32
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, update_rule,
                 batch_accumulator, state_count, input_scale=255.0):
                     
        self.state_count=state_count
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0
        
        self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                        num_actions, num_frames, batch_size)
        
        if self.freeze_interval > 0:
            self.next_l_out = self.build_nature_network_dnn(input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.matrix('states')
        next_states = T.matrix('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # buffer holding the inputs for the whole batch
        self.states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

        # buffer for the states each sample ends up in
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

        # one reward per episode -- but what about individual actions?
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        # one chosen action per episode
        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # ?? probably 0 or 1, indicating whether this is the last value or not
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # takes q_vals and next q_vals and returns the differences for the batch, all only for the first pass

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        # unclear
        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))


#
        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)

        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss, params, self.lr, self.rho,
                                           self.rho, self.rms_epsilon)
        elif update_rule == 'adagrad':
            updates = lasagne.updates.adagrad(loss, params, self.lr,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
            
        elif update_rule == 'momentum':
            updates = lasagne.updates.momentum(loss, params, self.lr, self.momentum)

        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
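The "# unclear" comment above marks the spot where this example differs from the clipped-loss variant used elsewhere in this file: clipping diff before squaring makes the gradient vanish once |diff| exceeds clip_delta, whereas the quadratic-plus-linear form keeps it constant. A NumPy illustration (not part of the original code):

import numpy as np

def loss_clip_then_square(d, delta=1.0):
    return 0.5 * np.clip(d, -delta, delta) ** 2

def loss_quadratic_plus_linear(d, delta=1.0):
    quad = np.minimum(np.abs(d), delta)
    return 0.5 * quad ** 2 + delta * (np.abs(d) - quad)

d, eps = 3.0, 1e-6
grad_clipped = (loss_clip_then_square(d + eps) - loss_clip_then_square(d - eps)) / (2 * eps)
grad_linear = (loss_quadratic_plus_linear(d + eps) - loss_quadratic_plus_linear(d - eps)) / (2 * eps)
print(grad_clipped, grad_linear)   # ~0.0 vs ~1.0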
Ejemplo n.º 33
0
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 clip_delta,
                 freeze_interval,
                 batch_size,
                 update_rule,
                 batch_accumulator,
                 state_count,
                 input_scale=255.0):

        self.state_count = state_count
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                                   num_actions, num_frames,
                                                   batch_size)

        if self.freeze_interval > 0:
            self.next_l_out = self.build_nature_network_dnn(
                input_width, input_height, num_actions, num_frames, batch_size)
            self.reset_q_hat()

        states = T.matrix('states')
        next_states = T.matrix('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # buffer holding the inputs for the whole batch
        self.states_shared = theano.shared(
            np.zeros((batch_size, state_count), dtype=theano.config.floatX))

        # buffer for the states each sample ends up in
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, state_count), dtype=theano.config.floatX))

        # one reward per episode -- but what about individual actions?
        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        # one chosen action per episode
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        # ?? probably 0 or 1, indicating whether this is the last value or not
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        # takes q_vals and next q_vals and returns the differences for the batch, all only for the first pass

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        # unclear
        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff**2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff**2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

#
        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)

        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss, params, self.lr, self.rho,
                                           self.rho, self.rms_epsilon)

        elif update_rule == 'adagrad':
            updates = lasagne.updates.adagrad(loss, params, self.lr,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)

        elif update_rule == 'momentum':
            updates = lasagne.updates.momentum(loss, params, self.lr,
                                               self.momentum)

        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
Ejemplo n.º 34
0
def icol(name):
    return T.icol(name)
Ejemplo n.º 35
0
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, 
                 update_rule, batch_accumulator, randomState, frame_scale=255.0):
        """ Initialize environment

        Arguments:
            environment - the environment (class Env) 
            num_elements_in_batch - list of k integers for the number of each element kept as belief state
            num_actions - int
            discount - float
            learning_rate - float
            rho, rms_epsilon, momentum - float, float, float
            ...
            network_type - string 
            ...           
        """

        self._environment = environment
        
        self._batchSize = batchSize
        self._inputDimensions = self._environment.inputDimensions()
        self._nActions = self._environment.nActions()
        self._df = 0
        self.rho = rho
        self._lr = 0
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState
        
        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0
        
        states=[]   # list of symbolic variables for each of the k element in the belief state
                    # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.matrix if scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variable for each of the k element in the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batchSize, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))


        q_vals = lasagne.layers.get_output(self.l_out)        
        
        next_q_vals = lasagne.layers.get_output(self.next_l_out)
        
        max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True)
        
        T_ones_like=T.ones_like(T.ones_like(terminals) - terminals)
        
        target = rewards + T_ones_like * thediscount * max_next_q_vals

        q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)
        
            
        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared, ## actions not needed!
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
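Unlike most of the other examples here, _train takes the discount factor and learning rate as explicit inputs rather than baking them into the graph. A usage sketch, assuming an instance qnet of this class and minibatch arrays (states_batch, next_states_batch, rewards_batch, actions_batch, terminals_batch) shaped like the corresponding shared buffers; all of those names are placeholders:

import theano

# one shared buffer per observation stream
for buf, obs in zip(qnet.states_shared, states_batch):
    buf.set_value(obs.astype(theano.config.floatX))
for buf, obs in zip(qnet.next_states_shared, next_states_batch):
    buf.set_value(obs.astype(theano.config.floatX))
qnet.rewards_shared.set_value(rewards_batch.astype(theano.config.floatX))
qnet.actions_shared.set_value(actions_batch.astype('int32'))
qnet.terminals_shared.set_value(terminals_batch.astype('int32'))

loss, q_vals = qnet._train(0.99, 0.0002)   # (discount, learning rate)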
Ejemplo n.º 36
0
    def __init__(self, env, args, rng, name="DQNLasagne"):
        """ Initializes a network based on the Lasagne Theano framework.

        Args:
            env (AtariEnv): The environment in which the agent actuates.
            args (argparse.Namespace): All settings either with a default value or set via command line arguments.
            rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator.
            name (str): The name of the network object.

        Note:
            This function should always call the base class first to initialize
            the common values for the networks.
        """
        _logger.info("Initialize object of type " + str(type(self).__name__))
        super(DQNLasagne, self).__init__(env, args, rng, name)
        self.input_shape = (self.batch_size, self.sequence_length,
                            args.frame_width, args.frame_height)
        self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8)
        lasagne.random.set_rng(self.rng)

        self.network = self._create_layer()

        # TODO: Load weights from pretrained network?!
        if self.args.load_weights is not None:
            self.load_weights(self.args.load_weights)

        if self.target_update_frequency > 0:
            self.target_network = self._create_layer()
            self._copy_theta()

        states = T.tensor4('states')
        followup_states = T.tensor4('followup_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros(self.input_shape, dtype=theano.config.floatX))
        self.followup_states_shared = theano.shared(
            np.zeros(self.input_shape, dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (self.batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((self.batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        self.terminals_shared = theano.shared(np.zeros((self.batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        qvalues = lasagne.layers.get_output(
            self.network, self._prepare_network_input(states))

        if self.target_update_frequency > 0:
            qvalues_followup_states = lasagne.layers.get_output(
                self.target_network,
                self._prepare_network_input(followup_states))
        else:
            qvalues_followup_states = lasagne.layers.get_output(
                self.network, self._prepare_network_input(followup_states))
            qvalues_followup_states = theano.gradient.disconnected_grad(
                qvalues_followup_states)

        targets = (rewards +
                   (T.ones_like(terminals) - terminals) * self.discount_rate *
                   T.max(qvalues_followup_states, axis=1, keepdims=True))
        errors = targets - qvalues[T.arange(self.batch_size),
                                   actions.reshape((-1, ))].reshape((-1, 1))

        if self.clip_error > 0:
            quadratic_part = T.minimum(abs(errors), self.clip_error)
            linear_part = abs(errors) - quadratic_part
            cost_function = T.sum(0.5 * quadratic_part**2 +
                                  self.clip_error * linear_part)
        else:
            cost_function = T.sum(0.5 * errors**2)

        self.params = lasagne.layers.helper.get_all_params(self.network)
        self.observations = {
            states: self.states_shared,
            followup_states: self.followup_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        self._set_optimizer(cost_function)

        if self.momentum > 0:
            self.optimizer = lasagne.updates.apply_momentum(
                self.optimizer, None, self.momentum)
        _logger.debug("Compiling _theano_train")
        self._theano_train = theano.function([], [cost_function, qvalues],
                                             updates=self.optimizer,
                                             givens=self.observations)
        _logger.debug("Compiling _theano_get_Q")
        self._theano_get_Q = theano.function(
            [], qvalues, givens={states: self.states_shared})

        self.callback = None
        _logger.debug("%s" % self)
Ejemplo n.º 37
0
    def __init__(self, num_actions, phi_length, width, height,
                 discount, learning_rate, decay, momentum=0,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.scale_input_by = 255.0

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(layers.Input2DLayer(self._batch_size,
                                               self._num_input_features,
                                               self._img_height,
                                               self._img_width,
                                               self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(cc_layers.ShuffleBC01ToC01BLayer(
                    self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(cc_layers.ShuffleC01BToBC01Layer(
                    self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1],
                                                         n_filters=16,
                                                         filter_width=8,
                                                         filter_height=8,
                                                         stride_x=4,
                                                         stride_y=4,
                                                         weights_std=.01,
                                                         init_bias_value=0.01))

            self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1],
                                                         n_filters=32,
                                                         filter_width=4,
                                                         filter_height=4,
                                                         stride_x=2,
                                                         stride_y=2,
                                                         weights_std=.01,
                                                         init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(layers.DenseLayer(self.q_layers[-1],
                                                   n_outputs=256,
                                                   weights_std=0.01,
                                                   init_bias_value=0.1,
                                                   dropout=0,
                                                   nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))


        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))


        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers)-1):
            print self.q_layers[i].get_output_shape()


        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_width,
                                                  self._img_height,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked ** 2)
        self._loss = error * diff_masked.shape[1] #

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((1, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((1, 1), dtype='int32'), broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        if self.momentum != 0:
            self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
                self._loss, self._parameters, learning_rate=self.learning_rate,
                rho=self.decay, momentum=self.momentum, epsilon=1e-6)
        else:
            self._updates = layers.gen_updates_rmsprop(self._loss,
                self._parameters, learning_rate=self.learning_rate,
                rho=self.decay, epsilon=1e-6)

        self._train = theano.function([self._idx], self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
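build_mask is not shown in this example; its effect is to zero out the diff entries for actions that were not taken, and the trailing scale by diff_masked.shape[1] undoes the division by the number of actions introduced by the mean. A NumPy illustration of that masking (values made up):

import numpy as np

batch_size, num_actions = 3, 4
diff = np.ones((batch_size, num_actions))
actions = np.array([[1], [3], [0]], dtype='int32')

mask = np.zeros_like(diff)
mask[np.arange(batch_size), actions.reshape(-1)] = 1.0   # one-hot row per sample

diff_masked = diff * mask
# mean over all batch*num_actions entries, rescaled so only chosen-action errors count
loss = np.mean(diff_masked ** 2) * diff_masked.shape[1]
print(loss)   # 1.0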
Ejemplo n.º 38
0
def train():

    # initialize game
    direction, launchBubble, newBubble, arrow, bubbleArray, nextBubble, score, alive, shots, getout, loss_game = restartGame(
    )

    # hyperparameters
    epsilon = 0.9

    # counters
    moves = 0
    wins = 0
    gameover = 0
    games = 0
    average_loss = 0
    average_reward = 0

    # with or without display
    display = False
    delay = 0

    # Tensor types
    STATE = T.tensor4()
    NEWSTATE = T.tensor4()
    REWARD = T.icol()
    DISCOUNT = T.col()
    ACTION = T.icol()

    # building network
    network = build_network()
    target_network = build_network()

    # get parameters from trained network
    """
	with np.load('5_colours_20shots.npz') as f:
		param_values = [f['arr_%d' % i] for i in range(len(f.files))]
	lasagne.layers.set_all_param_values(network, param_values)"""

    params = lasagne.layers.get_all_params(network)

    all_params = lasagne.layers.helper.get_all_param_values(network)
    lasagne.layers.helper.set_all_param_values(target_network, all_params)

    # get maximum q_value and particular action
    qvals = lasagne.layers.get_output(network, STATE)
    bestAction = qvals.argmax(-1)
    qval = qvals[0][ACTION]

    # get max Q_value of next state
    next_q_vals = lasagne.layers.get_output(target_network, NEWSTATE)
    maxNextValue = next_q_vals.max()

    # loss function with Stochastic Gradient Descent
    target = (REWARD + DISCOUNT * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - qvals[T.arange(BATCHSIZE),
                          ACTION.reshape((-1, ))].reshape((-1, 1))
    loss = 0.5 * diff**2
    loss = T.mean(loss)
    grad = T.grad(loss, params)
    updates = lasagne.updates.rmsprop(grad, params, learning_rate)
    updates = lasagne.updates.apply_momentum(updates, params, 0.9)

    # theano function for training and predicting q_values
    f_train = theano.function([STATE, ACTION, REWARD, NEWSTATE, DISCOUNT],
                              loss,
                              updates=updates,
                              allow_input_downcast=True)
    f_predict = theano.function([STATE], bestAction, allow_input_downcast=True)
    f_qvals = theano.function([STATE], qvals, allow_input_downcast=True)
    f_max = theano.function([NEWSTATE],
                            maxNextValue,
                            allow_input_downcast=True)

    # get state
    state = gameState(bubbleArray, newBubble.color)
    while moves < ITERATIONS:

        if display == True:
            DISPLAYSURF.fill(BGCOLOR)
        # act random or greedy
        chance = random.uniform(0, 1)
        launchBubble = True
        if chance < epsilon:
            action = random.randint(0, NUMBEROFACTIONS - 1)
        else:
            predict_state = np.reshape(state,
                                       (1, 8, GRIDSIZE * 2, ARRAYWIDTH * 2))
            action = int(f_predict(predict_state))
        direction = (action * 8) + 10
        newBubble.angle = direction

        # process game
        bubbleArray, alive, deleteList, nextBubble = processGame(
            launchBubble, newBubble, bubbleArray, score, arrow, direction,
            alive, display, delay)

        # get reward for the action
        getout, wins, reward, gameover = getReward(alive, getout, wins,
                                                   deleteList, gameover)

        # getting new bubble for shooting
        newBubble = Bubble(nextBubble.color)
        newBubble.angle = arrow.angle

        # get the newstate
        newState = gameState(bubbleArray, newBubble.color)

        # storage of replay memory
        if getout == True:
            REPLAYMEMORY.append((state, action, reward, newState, 0))
        else:
            REPLAYMEMORY.append((state, action, reward, newState, discount))

        # delete one tuple if the replay memory becomes too big
        if len(REPLAYMEMORY) > size_RM:
            REPLAYMEMORY.pop(0)

        # training the network
        states, actions, rewards, newstates, discounts = get_batch()
        loss = f_train(states, actions, rewards, newstates, discounts)

        average_loss = average_loss + loss
        average_reward = average_reward + reward

        if moves % 1000 == 0 and moves > 0:
            print("Amount of actions taken: ", moves)
            print("Average loss: ", average_loss / 1000.0)
            print("Average Reward: ", average_reward / 1000.0)
            print("Amount of wins: ", wins)
            average_reward = 0
            average_loss = 0
            if epsilon > 0.1:
                epsilon = epsilon - 0.01

        # updating the target network
        if moves % 2500 == 0:
            target_network = build_network()
            all_param_values = lasagne.layers.get_all_param_values(network)
            lasagne.layers.set_all_param_values(target_network,
                                                all_param_values)

        # change the state to newState
        state = newState

        moves = moves + 1
        shots = shots + 1

        if getout == True or shots == AMOUNTOFSHOTS:
            games = games + 1
            direction, launchBubble, newBubble, arrow, bubbleArray, nextBubble, score, alive, shots, getout, loss_game = restartGame(
            )
        state = gameState(bubbleArray, newBubble.color)

    # saving parameters of the network
    np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    return network
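
# --- Illustrative sketch (not from the original example) ---
# The exploration policy in train() above is epsilon-greedy with a linear
# decay: epsilon starts at 0.9 and drops by 0.01 every 1000 moves, down to 0.1.
# A self-contained sketch of just that schedule (random Q-values stand in for
# the network output):
import random
import numpy as np

def epsilon_greedy(q_values, epsilon):
    """Random action with probability epsilon, otherwise the greedy one."""
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, len(q_values) - 1)
    return int(np.argmax(q_values))

epsilon = 0.9
for move in range(1, 5001):
    action = epsilon_greedy(np.random.rand(8), epsilon)
    if move % 1000 == 0 and epsilon > 0.1:
        epsilon -= 0.01  # same linear decay as in train()
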
Ejemplo n.º 39
0
    def __init__(self, stateSize, actionSize, numFrames, batchSize, discount,
                 rho, momentum, learningRate, rmsEpsilon, rng, updateRule,
                 batchAccumulator, freezeInterval):
        self.stateSize = stateSize
        self.actionSize = actionSize
        self.numFrames = numFrames
        self.batchSize = batchSize
        self.discount = discount
        self.rho = rho
        self.momentum = momentum
        self.learningRate = learningRate
        self.rmsEpsilon = rmsEpsilon
        self.rng = rng
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.freezeInterval = freezeInterval

        lasagne.random.set_rng(self.rng)

        self.updateCounter = 0

        self.lOut = self.buildNetwork(self.stateSize, self.actionSize,
                                      self.numFrames, self.batchSize)

        if self.freezeInterval > 0:
            self.nextLOut = self.buildNetwork(self.stateSize, self.actionSize,
                                              self.numFrames, self.batchSize)
            self.resetQHat()

        states = T.ftensor3('states')
        nextStates = T.ftensor3('nextStates')
        rewards = T.fcol('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames + 1 (due to
        # overlap) states, along with the chosen action and resulting
        # reward and terminal status.
        self.states_shared = theano.shared(
            numpy.zeros((self.batchSize, self.numFrames + 1, self.stateSize),
                        dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(numpy.zeros(
            (self.batchSize, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(numpy.zeros((self.batchSize, 1),
                                                        dtype='int32'),
                                            broadcastable=(False, True))
        self.terminals_shared = theano.shared(numpy.zeros((self.batchSize, 1),
                                                          dtype='int32'),
                                              broadcastable=(False, True))

        # Shared variable for a single state, to calculate qVals
        self.state_shared = theano.shared(
            numpy.zeros((self.numFrames, self.stateSize),
                        dtype=theano.config.floatX))

        qVals = lasagne.layers.get_output(self.lOut, states)

        if self.freezeInterval > 0:
            nextQVals = lasagne.layers.get_output(self.nextLOut, nextStates)
        else:
            nextQVals = lasagne.layers.get_output(self.lOut, nextStates)
            nextQVals = theano.gradient.disconnected_grad(nextQVals)

        # Cast terminals to floatX
        terminalsX = terminals.astype(theano.config.floatX)
        # T.eq(a, b) returns a variable representing the logical
        # equality (a == b)
        actionmask = T.eq(
            T.arange(self.actionSize).reshape((1, -1)), actions.reshape(
                (-1, 1))).astype(theano.config.floatX)

        target = (rewards + (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(nextQVals, axis=1, keepdims=True))
        output = (qVals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        # no if clip delta, since clip-delta=0

        loss = (diff**2)

        if self.batchAccumulator == 'sum':
            loss = T.sum(loss)
        elif self.batchAccumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError('Bad accumulator: {}'.format(self.batchAccumulator))

        params = lasagne.layers.helper.get_all_params(self.lOut)
        train_givens = {
            states: self.states_shared[:, :-1],
            nextStates: self.states_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        if self.updateRule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.learningRate,
                                              self.rho, self.rmsEpsilon)

        elif self.updateRule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.learningRate,
                                       self.rho, self.rmsEpsilon)
        else:
            raise ValueError('Unrecognized update: {}'.format(updateRule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss],
                                      updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape(
                (1, self.numFrames, self.stateSize))
        }

        # self._q_vals=theano.function([],qVals[0], givens=q_givens)
        self._q_vals = theano.function([], qVals[0], givens=q_givens)
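
# --- Illustrative sketch (not from the original example) ---
# The `actionmask` above selects Q(s, a) by broadcasting a (1, A) row of
# action indices against a (B, 1) column of chosen actions; the target then
# zeroes the bootstrap term on terminal transitions. The same arithmetic in
# NumPy, with made-up numbers:
import numpy as np

num_actions, batch = 4, 3
q_vals = np.arange(batch * num_actions, dtype=np.float32).reshape(batch, num_actions)
actions = np.array([[1], [3], [0]], dtype=np.int32)

mask = (np.arange(num_actions).reshape(1, -1) == actions).astype(np.float32)
q_chosen = (q_vals * mask).sum(axis=1, keepdims=True)  # Q(s, a), shape (B, 1)

rewards = np.array([[1.0], [0.0], [0.5]], dtype=np.float32)
terminals = np.array([[0.0], [1.0], [0.0]], dtype=np.float32)  # 1 = terminal
next_q_max = np.array([[2.0], [3.0], [1.0]], dtype=np.float32)
discount = 0.99

target = rewards + (1.0 - terminals) * discount * next_q_max
td_error = target - q_chosen  # shape (B, 1)
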
    def __init__(self,
                 input_size,
                 output_size,
                 hidden_units,
                 train_iterations=50000,
                 eps=1.0,
                 batch_size=10,
                 discount_factor=0.0,
                 reg_factor=0.0,
                 lr=0.001,
                 train=True):

        self.batch_size = batch_size
        self.init_eps = eps
        self.epsilon = eps
        self.iterations = train_iterations
        self.num_experiences = 0
        self.train = train

        state_length = 1
        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, state_length)

        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, state_length)

        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)

        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))

        # create 2 separate neural network
        l_inA = lasagne.layers.InputLayer((None, state_length), State)
        l_inB = lasagne.layers.InputLayer((None, state_length), State)
        for units in hidden_units:
            l_hiddenA = lasagne.layers.DenseLayer(
                l_inA,
                num_units=units,
                nonlinearity=lasagne.nonlinearities.rectify)
            l_hiddenB = lasagne.layers.DenseLayer(
                l_inB,
                num_units=units,
                nonlinearity=lasagne.nonlinearities.rectify)
            l_inA = l_hiddenA
            l_inB = l_hiddenB
        self._l_outA = lasagne.layers.DenseLayer(
            l_inA,
            num_units=output_size,
            nonlinearity=lasagne.nonlinearities.linear)
        self._l_outB = lasagne.layers.DenseLayer(
            l_inB,
            num_units=output_size,
            nonlinearity=lasagne.nonlinearities.linear)

        self._learning_rate = lr
        self._discount_factor = discount_factor
        self._rho = 0.95
        self._rms_epsilon = 0.005

        self._weight_update_steps = 100
        self._updates = 0

        self._states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                      dtype='int32'),
                                             broadcastable=(False, True),
                                             allow_downcast=True)

        self._q_valsA = lasagne.layers.get_output(self._l_outA, State)
        self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState)

        self._q_func = self._q_valsA[T.arange(batch_size),
                                     Action.reshape((-1, ))].reshape((-1, 1))

        target = (
            Reward +
            # (T.ones_like(terminals) - terminals) *
            self._discount_factor *
            T.max(self._q_valsB, axis=1, keepdims=True))
        diff = target - self._q_valsA[T.arange(batch_size),
                                      Action.reshape((-1, ))].reshape((-1, 1))

        loss = 0.5 * diff**2
        loss = T.mean(loss)

        params = lasagne.layers.helper.get_all_params(self._l_outA)

        givens = {
            State: self._states_shared,
            ResultState: self._next_states_shared,
            Reward: self._rewards_shared,
            Action: self._actions_shared,
        }

        # SGD update
        updates = lasagne.updates.rmsprop(loss, params, self._learning_rate,
                                          self._rho, self._rms_epsilon)
        # TD update
        # updates = lasagne.updates.rmsprop(T.mean(self._q_func), params, self._learning_rate * -T.mean(diff), self._rho,
        #                                      self._rms_epsilon)

        self._train = theano.function([], [loss, self._q_valsA],
                                      updates=updates,
                                      givens=givens)

        self._q_vals = theano.function([],
                                       self._q_valsA,
                                       givens={State: self._states_shared})

        self._bellman_error = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=diff,
            allow_input_downcast=True)
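
# --- Illustrative sketch (not from the original example) ---
# This class picks Q(s, a) with fancy indexing, q_vals[arange(batch), actions],
# instead of an action mask, and trains on the mean squared TD error.
# NumPy equivalent with made-up values:
import numpy as np

batch = 4
q_vals = np.random.rand(batch, 3).astype(np.float32)
actions = np.array([[0], [2], [1], [2]], dtype=np.int32)

# One Q-value per row, reshaped back into a column vector.
q_s_a = q_vals[np.arange(batch), actions.reshape(-1)].reshape(-1, 1)

rewards = np.random.rand(batch, 1).astype(np.float32)
next_q_max = np.random.rand(batch, 1).astype(np.float32)
discount = 0.9

diff = rewards + discount * next_q_max - q_s_a  # Bellman error, shape (B, 1)
loss = np.mean(0.5 * diff ** 2)
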
Ejemplo n.º 41
0
    def initialize_network(self):
        """
        :description: this method initializes the network, updates, and theano functions for training and 
            retrieving q values. Here's an outline: 

            1. build the q network and target q network
            2. initialize theano symbolic variables used for compiling functions
            3. initialize the theano numeric variables used as input to functions
            4. formulate the symbolic loss 
            5. formulate the symbolic updates 
            6. compile theano functions for training and for getting q_values
        """
        batch_size, input_shape = self.batch_size, self.input_shape
        lasagne.random.set_rng(self.rng)

        # 1. build the q network and target q network
        self.l_out = self.build_network(input_shape, self.num_actions,
                                        batch_size)
        self.next_l_out = self.build_network(input_shape, self.num_actions,
                                             batch_size)
        self.reset_target_network()

        # 2. initialize theano symbolic variables used for compiling functions
        states = T.tensor4('states')
        actions = T.icol('actions')
        rewards = T.col('rewards')
        next_states = T.tensor4('next_states')
        # terminals are used to indicate a terminal state in the episode and hence a mask over the future
        # q values i.e., Q(s',a')
        terminals = T.icol('terminals')

        # 3. initialize the theano numeric variables used as input to functions
        self.states_shape = (batch_size, ) + (1, ) + input_shape
        self.states_shared = theano.shared(
            np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(
            np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        # 4. formulate the symbolic loss
        q_vals = lasagne.layers.get_output(self.l_out, states)
        next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)
        target = (rewards + (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        # reshape((-1,)) == 'make a row vector', reshape((-1, 1)) == 'make a column vector'
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        # A lot of the DeepMind work clips the TD error at 1, so we do that here.
        # The problem is that the gradient backpropagating through this minimum
        # node will be zero if diff is larger than 1.0 (because changing params
        # before the minimum does not impact the output of the minimum). To
        # account for this we take the part of the TD error (magnitude) greater
        # than 1.0 and simply add it to the loss, which lets the gradient
        # backprop, but only linearly in the TD error rather than quadratically.
        quadratic_part = T.minimum(abs(diff), 1.0)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part**2 + linear_part
        loss = T.mean(loss) + self.regularization * regularize_network_params(
            self.l_out, l2)

        # 5. formulate the symbolic updates
        params = lasagne.layers.helper.get_all_params(self.l_out)
        updates = self.initialize_updates(self.update_rule, loss, params,
                                          self.learning_rate)

        # 6. compile theano functions for training and for getting q_values
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._get_q_values = theano.function(
            [], q_vals, givens={states: self.states_shared})
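
# --- Illustrative sketch (not from the original example) ---
# The comment block above describes the standard trick of clipping the TD
# error: quadratic inside the clip bound, linear outside it, so gradients
# never vanish for large errors (a Huber-style loss). Stand-alone NumPy
# version; with clip_delta = 1.0 the linear term matches the code above.
import numpy as np

def clipped_td_loss(diff, clip_delta=1.0):
    """0.5*d**2 for |d| <= clip_delta, linear growth beyond the clip point."""
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

print(clipped_td_loss(np.array([-3.0, -0.5, 0.2, 2.0])))
# -> [2.5    0.125  0.02   1.5  ]
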
Ejemplo n.º 42
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        # print "NETWORK---------------------------"
        # print "input width ", self.input_width
        # print "input height", self.input_height
        # print "num actiuons", self.num_actions
        # print "num frames", self.num_frames
        # print "batch size", self.batch_size
        # print "discount", self.discount
        # print "rho", self.rho
        # print "lr", self.lr
        # print "rms_epsilon", self.rms_epsilon
        # print "momentum", self.momentum
        # print "clip_delta", self.clip_delta
        # print "freeze_ intercal", self.freeze_interval
        # print "rng", self.rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed state transitions,
        # each consisting of num_frames + 1 (due to overlap) images, along with
        # the chosen action and resulting reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames + 1, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # Shared variable for a single state, to calculate q_vals
        self.state_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)

        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
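
# --- Illustrative sketch (not from the original example) ---
# reset_q_hat() is not shown here; one common Lasagne implementation simply
# copies the online network's weights into the frozen target network every
# freeze_interval updates. A minimal sketch with tiny stand-in networks (the
# real ones come from build_network):
import lasagne

def make_tiny_net():
    l_in = lasagne.layers.InputLayer((None, 8))
    return lasagne.layers.DenseLayer(l_in, num_units=4,
                                     nonlinearity=lasagne.nonlinearities.linear)

l_out, next_l_out = make_tiny_net(), make_tiny_net()

def reset_q_hat_sketch():
    """Copy online-network parameters into the target network."""
    values = lasagne.layers.get_all_param_values(l_out)
    lasagne.layers.set_all_param_values(next_l_out, values)

reset_q_hat_sketch()
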
Ejemplo n.º 43
0
    def __init__(self,
                 n_time,
                 input_width,
                 input_height,
                 num_hidden,
                 num_LSTM_units,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 batch_size,
                 update_rule,
                 actions,
                 file='',
                 clip_delta=0,
                 input_scale=1.0):
        CompleteLearner.__init__(self, actions, file)
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = len(actions)

        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta

        self.rng = lasagne.random.get_rng()

        self.cycles = 0
        self.batch_size = batch_size

        self.n_time = n_time
        #lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.network = self.build_network(
            (n_time, batch_size, input_width, input_height), num_hidden,
            num_LSTM_units, self.num_actions)

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames + 1 (due to
        # overlap) images, along with the chosen action and resulting
        # reward (no terminal state)
        self.obss_shared = theano.shared(
            np.zeros((batch_size, n_time, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        # no terminal states

        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((input_height, input_width), dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.network, states / input_scale)

        next_q_vals = lasagne.layers.get_output(self.network,
                                                next_states / input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        actionmask = T.eq(
            T.arange(self.num_actions).reshape((1, -1)),
            actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff**2

        batch_accumulator = 'mean'

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.network)
        train_givens = {
            states: self.obss_shared[:, :-1],  #get all except the last
            next_states: self.obss_shared[:, 1:],  #get all except the first
            rewards: self.rewards_shared,
            actions: self.actions_shared,
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rms_prop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss,
                                           params,
                                           self.lr,
                                           epsilon=self.rms_epsilon)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss],
                                      updates=updates,
                                      givens=train_givens)
        q_givens = {
            states:
            self.state_shared.reshape((self.input_height, self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
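
# --- Illustrative sketch (not from the original example) ---
# Storing each transition as a single block of frames and slicing it with
# [:, :-1] and [:, 1:] (as in obss_shared above) gives states and next_states
# that overlap by all but one frame, without duplicating memory. NumPy sketch:
import numpy as np

batch, num_frames, h, w = 2, 4, 3, 3
frames = np.random.rand(batch, num_frames + 1, h, w).astype(np.float32)

states = frames[:, :-1]      # all frames except the last
next_states = frames[:, 1:]  # all frames except the first (shifted by one)

assert states.shape == next_states.shape == (batch, num_frames, h, w)
assert np.allclose(states[:, 1:], next_states[:, :-1])  # they overlap
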
Ejemplo n.º 44
0
    Ytr_su = Ytr_su[:,np.newaxis]
    # get observations and labels for the validation set
    Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX)
    Yva = datasets[2][1].get_value(borrow=False).astype(np.int32)
    Yva = Yva[:,np.newaxis] # reshape labels into a column vector
    # get size information for the data
    un_samples = Xtr_un.shape[0]
    su_samples = Xtr_su.shape[0]
    va_samples = Xva.shape[0]

    # set up some symbolic variables for input to the PeaNetSeq
    Xp = T.matrix('Xp_base')
    Xd = T.matrix('Xd_base')
    Xc = T.matrix('Xc_base')
    Xm = T.matrix('Xm_base')
    Yd = T.icol('Yd_base')
    # set some "shape" parameters for the networks
    data_dim = Xtr_un.shape[1]
    label_dim = 10
    prior_dim = 25
    prior_sigma = 1.0
    batch_size = 100 # we'll take 2x this per batch, for sup and unsup

    #################################################################
    # Construct the generator and inferencer to use for conditional #
    # generation of adversarial examples.                           #
    #################################################################
    # Choose some parameters for the generator network
    gn_params = {}
    gn_config = [prior_dim, 800, 800, data_dim]
    gn_params['mlp_config'] = gn_config
Ejemplo n.º 45
0
    def __init__(self, input_width, input_height, num_channels, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, network_params, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_channels = num_channels
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        self.lstm = None
        self.next_lstm = None

        logging.debug('network parameters: %s', network_params)
        self.network_params = network_params

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        networks = self.build_network(network_type, num_channels, input_width, input_height,
                                        num_actions, num_frames, None)
        if isinstance(networks, tuple):
            self.l_out = networks[0]
            self.lstm = networks[1]
        else:
            self.l_out = networks

        # theano.compile.function_dump('network.dump', self.l_out)
        if self.freeze_interval > 0:
            next_networks = self.build_network(network_type, num_channels, input_width,
                                                 input_height, num_actions,
                                                 num_frames, None)

            if isinstance(next_networks, tuple):
                self.next_l_out = next_networks[0]
                self.next_lstm = next_networks[1]
            else:
                self.next_l_out = next_networks

            self.reset_q_hat()

        # The states need to be floats (theano.config.floatX) for now,
        # since they feed directly into floating-point computations.
        btensor5 = T.TensorType(theano.config.floatX, (False,) * 5)
        states = btensor5('states')
        next_states = btensor5('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Apparently needed for some layers with a variable input size
        # Weird, because the others just allow a None batch size,
        # but let's just play safe for now
        # For now, it should always look exactly like states
        # (n_batch, n_time_steps)
        # mask = T.imatrix('mask')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='states')

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='next_states')

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True), name='rewards')

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True), name='actions')

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # self.mask_shared = theano.shared(np.ones((batch_size, num_frames),
        #     dtype='int32'))

        # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale)

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
                # mask_input=mask)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(target.shape[0]),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        # print params
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        updates = update_for(params)

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        # An update for the LSTM state variables 'cell' and 'hid' sneaks into
        # the updates dict; remove those entries so they are not trained.
        if self.lstm:
            delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']]
            # print delete_keys
            for key in delete_keys:
                del updates[key]

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
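
# --- Illustrative sketch (not from the original example) ---
# The "delete_keys" step above removes the LSTM's recurrent state ('cell',
# 'hid') from the updates dictionary so the optimizer does not treat it as a
# trainable parameter. The same filtering on a stand-in OrderedDict:
from collections import OrderedDict

class FakeShared(object):
    """Stand-in for a Theano shared variable that only has a .name."""
    def __init__(self, name):
        self.name = name

cell, hid, W = FakeShared('cell'), FakeShared('hid'), FakeShared('W')
updates = OrderedDict([(cell, 'new_cell'), (hid, 'new_hid'), (W, 'new_W')])

delete_keys = [k for k in updates if k.name in ('cell', 'hid')]
for key in delete_keys:
    del updates[key]

assert [k.name for k in updates] == ['W']
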
Ejemplo n.º 46
0
    def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, 
        discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength,
        networkType = "conv", updateRule = "deepmind_rmsprop", batchAccumulator = "sum", clipDelta = 1.0, inputScale = 255.0):
        
        self.batchSize          = batchSize
        self.numFrames          = numFrames
        self.inputWidth         = inputWidth
        self.inputHeight        = inputHeight
        self.inputScale         = inputScale
        self.numActions         = numActions
        self.discountRate       = discountRate
        self.learningRate       = learningRate
        self.rho                = rho
        self.rms_epsilon        = rms_epsilon
        self.momentum           = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate     = useSARSAUpdate
        self.kReturnLength      = kReturnLength
        self.networkType        = networkType
        self.updateRule         = updateRule
        self.batchAccumulator   = batchAccumulator
        self.clipDelta          = clipDelta
        self.updateCounter      = 0

        states     = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards    = T.col("rewards")
        actions    = T.icol("actions")
        nextActions= T.icol("nextActions")
        terminals  = T.icol("terminals")

        self.statesShared      = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.nextStatesShared  = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.rewardsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True))
        self.actionsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.terminalsShared   = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))

        self.qValueNetwork  = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)


        if self.useSARSAUpdate:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape((-1,))].reshape((-1, 1))
        else:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * T.max(nextQValues, axis = 1, keepdims = True)

        targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1))


        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")


        networkParameters = lasagne.layers.helper.get_all_params(self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate)
        else:
            raise ValueError("Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards:self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn')
        self.__computeQValues = theano.function([], qValues, givens={states: self.statesShared})
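
# --- Illustrative sketch (not from the original example) ---
# useSARSAUpdate switches between bootstrapping on the action actually taken
# next (SARSA) and on the greedy action (Q-learning). Note that `terminals`
# above is multiplied in directly, i.e. it acts as a continuation mask
# (1 = keep the bootstrap term). NumPy version with made-up numbers:
import numpy as np

batch, num_actions, k_return_length = 3, 4, 1
discount = 0.99
rewards = np.random.rand(batch, 1).astype(np.float32)
continues = np.array([[1.0], [0.0], [1.0]], dtype=np.float32)  # 0 cuts the bootstrap
next_q = np.random.rand(batch, num_actions).astype(np.float32)
next_actions = np.array([[0], [2], [1]], dtype=np.int32)

# SARSA: value of the next action that was actually taken.
sarsa_target = rewards + continues * (discount ** k_return_length) * \
    next_q[np.arange(batch), next_actions.reshape(-1)].reshape(-1, 1)

# Q-learning: value of the greedy next action.
q_target = rewards + continues * (discount ** k_return_length) * \
    next_q.max(axis=1, keepdims=True)
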
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 clip_delta,
                 freeze_interval,
                 batch_size,
                 network_type,
                 update_rule,
                 batch_accumulator,
                 rng,
                 input_scale=255.0,
                 double=False,
                 transition_length=4):

        if double:
            print('USING DOUBLE DQN')
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width,
                                        input_height, num_actions, num_frames,
                                        batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states_t')
        actions = T.icol('actions_t')
        target = T.col('evaluation_t')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        self.target_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                           broadcastable=(False, True))

        self.states_transition_shared = theano.shared(
            np.zeros((batch_size, transition_length * 2, num_frames,
                      input_height, input_width),
                     dtype=theano.config.floatX))
        self.states_one_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        """get Q(s)   batch_size = 1 """
        q1_givens = {
            states:
            self.states_one_shared.reshape(
                (1, self.num_frames, self.input_height, self.input_width))
        }
        self._q1_vals = theano.function([], q_vals[0], givens=q1_givens)
        """get Q(s)   batch_size = batch size """
        q_batch_givens = {
            states:
            self.states_shared.reshape((self.batch_size, self.num_frames,
                                        self.input_height, self.input_width))
        }
        self._q_batch_vals = theano.function([], q_vals, givens=q_batch_givens)

        action_mask = T.eq(
            T.arange(num_actions).reshape((1, -1)), actions.reshape(
                (-1, 1))).astype(theano.config.floatX)

        q_s_a = (q_vals * action_mask).sum(axis=1).reshape((-1, 1))
        """ get Q(s,a)   batch_size = batch size """
        q_s_a_givens = {
            states:
            self.states_shared.reshape((self.batch_size, self.num_frames,
                                        self.input_height, self.input_width)),
            actions:
            self.actions_shared
        }
        self._q_s_a_vals = theano.function([], q_s_a, givens=q_s_a_givens)

        if self.freeze_interval > 0:
            q_target_vals = lasagne.layers.get_output(self.next_l_out,
                                                      states / input_scale)
        else:
            q_target_vals = lasagne.layers.get_output(self.l_out,
                                                      states / input_scale)
            q_target_vals = theano.gradient.disconnected_grad(q_target_vals)

        if not double:
            q_target = T.max(q_target_vals, axis=1)
        else:
            greedy_actions = T.argmax(q_vals, axis=1)
            q_target_mask = T.eq(
                T.arange(num_actions).reshape((1, -1)),
                greedy_actions.reshape((-1, 1))).astype(theano.config.floatX)
            q_target = (q_target_vals * q_target_mask).sum(axis=1).reshape(
                (-1, 1))
        """get Q target Q'(s,a') for a batch of transitions  batch size = batch_size * transition length"""
        q_target_transition_givens = {
            states:
            self.states_transition_shared.reshape(
                (batch_size * transition_length * 2, self.num_frames,
                 self.input_height, self.input_width))
        }
        self._q_target = theano.function([],
                                         q_target.reshape(
                                             (batch_size,
                                              transition_length * 2)),
                                         givens=q_target_transition_givens)
        """get Q target_vals Q'(s) for a batch of transitions  batch size = batch_size * transition length"""
        self._q_target_vals = theano.function(
            [],
            q_target_vals.reshape(
                (batch_size, transition_length * 2, num_actions)),
            givens=q_target_transition_givens)

        diff = q_s_a - target

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff**2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)
        """Q(s,a) target train()"""
        train_givens = {
            states: self.states_shared,
            actions: self.actions_shared,
            target: self.target_shared
        }
        self._train = theano.function([], [loss],
                                      updates=updates,
                                      givens=train_givens,
                                      on_unused_input='warn')

        self._train2 = theano.function([], [loss],
                                       updates=updates,
                                       givens=train_givens,
                                       on_unused_input='warn')
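
# --- Illustrative sketch (not from the original example) ---
# With double=True the greedy action is chosen by the online network but its
# value is read from the frozen target network, which reduces the
# overestimation bias of plain DQN. NumPy version of the two targets:
import numpy as np

batch, num_actions = 3, 5
q_online = np.random.rand(batch, num_actions)   # online net,  Q(s', .)
q_frozen = np.random.rand(batch, num_actions)   # target net, Q'(s', .)

# Plain DQN target: max over the target network's own estimates.
dqn_target = q_frozen.max(axis=1, keepdims=True)

# Double DQN target: argmax with the online net, value from the target net.
greedy = q_online.argmax(axis=1)
double_dqn_target = q_frozen[np.arange(batch), greedy].reshape(-1, 1)
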
Ejemplo n.º 48
0
    def __init__(self, rng=None, Xd=None, \
            g_net=None, i_net=None, pn_seq=None, \
            data_dim=None, prior_dim=None, \
            params=None):
        # setup a rng for this AEDPair
        self.rng = RandStream(rng.randint(100000))

        if (params is None):
            self.params = {}
        else:
            self.params = params
        if 'match_type' in params:
            self.match_type = params['match_type']
        else:
            self.match_type = 'grad_sign'
        # we can only try to match sign or direction...
        assert((self.match_type == 'grad_dir') or \
                (self.match_type == 'grad_sign'))
        if self.match_type == 'grad_dir':
            # we match the direction of the gradient under the assumption
            # of gaussian observation noise
            self.mean_transform = lambda x: max_normalize(x, axis=1)
            assert(g_net.out_type == 'gaussian')
        else:
            # we match the sign of the gradient as if it were a collection
            # of independent binary variables
            self.mean_transform = lambda x: 2.0 * (x - 0.5)
            assert(g_net.out_type == 'bernoulli')

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this AEDPair
        self.Xd = Xd
        self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq
        self.Xc = 0.0 * self.Xd
        self.Xm = 0.0 * self.Xd
        self.obs_count = T.cast(Xd.shape[0], 'floatX')

        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        self.policy_mean = self.IN.output_mean
        self.policy_logvar = self.IN.output_logvar
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        # set up a var for controlling the max-norm bound on perturbations
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lam_mnb = theano.shared(value=zero_ary, \
                name='adp_lam_mnb')
        self.set_lam_mnb(lam_mnb=0.1)

        # get the perturbations output by the generator network
        self.Pg = self.mean_transform(self.GN.output)
        if self.match_type == 'grad_dir':
            # samples because we're matching gradient via squared error
            self.Pg_samples = self.mean_transform(self.GN.output_samples)
        else:
            # no samples, because we're matching gradient sign
            self.Pg_samples = self.mean_transform(self.GN.output)

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # make a clone of the target PeaNetSeq that takes perturbed inputs
        self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \
                seq_Xd=[self.Xd, self.Xd], seq_Yd=[self.Yd, self.Yd], \
                no_funcs=True)
        self.grad_pea_Xd = T.grad(self.PNS.joint_cost, self.Xd)
        if self.match_type == 'grad_dir':
            # turn gradient into a unit max-normalized vector
            self.match_target = max_normalize(self.grad_pea_Xd)
        else:
            # transform gradient into binary indicators of sign
            self.match_target = (self.grad_pea_Xd > 0.0)
        # get the symbolic vars for passing inputs to self.PNS
        self.Xd_seq = self.PNS.Xd_seq
        self.Yd_seq = self.PNS.Yd_seq
        self.seq_inputs = self.Xd_seq + self.Yd_seq

        # shared var learning rate for generator and inferencer
        self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
        self.set_lam_adv(lam_adv=1.0)
        # init shared vars for weighting a penalty on the norms of our learned
        # policies and a reward to encourage maximizing their entropy.
        self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
        self.set_lam_kld(lam_kld=0.1)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
        self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.adv_cost + self.kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self.GN.sample_from_model
        self.sample_from_Xd = self._construct_sample_from_Xd()
        return
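
For reference, a small NumPy sketch of the two match targets this class can construct. It assumes, going only by the name, that max_normalize rescales each row by its largest absolute entry; 'grad_sign' simply keeps binary indicators of the gradient's sign:

import numpy as np

def max_normalize(x, axis=1):
    # hypothetical stand-in for the max_normalize helper used above,
    # assumed to scale each row so its largest-magnitude entry becomes +/-1
    return x / (np.max(np.abs(x), axis=axis, keepdims=True) + 1e-8)

grad = np.array([[ 0.2, -0.8, 0.1],
                 [-0.05, 0.0, 0.4]])

target_dir = max_normalize(grad)           # 'grad_dir': unit max-normalized gradient
target_sign = (grad > 0.0).astype(float)   # 'grad_sign': binary sign indicators

print(target_dir)    # approx [[ 0.25, -1.0, 0.125], [-0.125, 0.0, 1.0]]
print(target_sign)   # [[1. 0. 1.], [0. 0. 1.]]
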
Ejemplo n.º 49
0
    def __init__(self, num_actions):
        
        # remember parameters
        self.num_actions = num_actions
        self.batch_size = BATCH_SIZE
        self.discount_rate = DISCOUNT_RATE
        self.history_length = HISTORY_LENGTH
        self.screen_dim = DIMS
        self.img_height = SCREEN_HEIGHT
        self.img_width = SCREEN_WIDTH
        self.clip_error = CLIP_ERROR
        self.input_color_scale = COLOR_SCALE

        self.target_steps = TARGET_STEPS
        self.train_iterations = TRAIN_STEPS
        self.train_counter = 0
        self.momentum = MOMENTUM
        self.update_rule = UPDATE_RULE
        self.learning_rate = LEARNING_RATE
        self.rms_decay = RMS_DECAY
        self.rms_epsilon = RMS_EPSILON        
        
        self.rng = np.random.RandomState(RANDOM_SEED)

        # set seed
        lasagne.random.set_rng(self.rng)

        # prepare tensors once and reuse them
        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        # terminals are bool for our case
        terminals = T.bcol('terminals')

        # create shared theano variables
        self.states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        # broadcastable along axis 1, so these (batch, 1) columns combine
        # with (batch, n_actions) tensors without explicit dimshuffles
        self.rewards_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            #np.zeros((self.batch_size, 1), dtype='int32'),
            np.zeros((self.batch_size, 1), dtype='int8'),
            broadcastable=(False, True))

        # can add multiple nets here
        self.l_primary = self.build_network()

        if self.target_steps > 0:
            self.l_secondary = self.build_network()
            self.copy_to_secondary()

        
        """
        # input scale i.e. division can be applied to input directly also to normalize
        """

        # define output symbols
        q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale)
        
        if self.target_steps > 0:
            q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale)
        else:
            # no separate target network: reuse the primary network for the
            # next-state Q-values and block gradients from flowing through it
            q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale)
            q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary)

        # Bellman target: r + (1 - terminal) * gamma * max_a Q'(s', a)
        target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True))
        
        """
        # check what this does
        """
        diff = target - q_vals[T.arange(self.batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        # print shape ? 

        if self.clip_error > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_error)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part
        else:
            loss = 0.5 * diff ** 2

        loss = T.sum(loss)
        
        params = lasagne.layers.helper.get_all_params(self.l_primary)  
        
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        g_time = time.time()
        logger.info("graph compiling")


        if self.update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                       self.rms_epsilon)
        elif self.update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                              self.rms_epsilon)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})

        logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
Ejemplo n.º 50
0
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 freeze_interval,
                 batch_size,
                 network_type,
                 update_rule,
                 batch_accumulator,
                 input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.gamma = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width,
                                        input_height, num_actions, num_frames,
                                        batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        #terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        # self.terminals_shared = theano.shared(
        #     np.zeros((batch_size, 1), dtype='int32'),
        #     broadcastable=(False,True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + self.gamma * T.max(
            next_q_vals, axis=1, keepdims=True)
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        if batch_accumulator == 'sum':
            loss = T.sum(diff**2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff**2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            #terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
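
reset_q_hat() is not shown in this snippet, but with freeze_interval > 0 the usual pattern, and presumably what it does, is to copy every parameter of the online network into the frozen one each time freeze_interval updates have elapsed. A minimal Lasagne sketch of that synchronization, with build_tiny_net standing in for the real build_network:

import numpy as np
import lasagne

def build_tiny_net(n_in=4, n_out=2):
    # stand-in for build_network(); any Lasagne layer stack works here
    l_in = lasagne.layers.InputLayer((None, n_in))
    return lasagne.layers.DenseLayer(l_in, num_units=n_out)

def sync_target_network(l_online, l_target):
    # copy every parameter value from the online net into the frozen net
    values = lasagne.layers.get_all_param_values(l_online)
    lasagne.layers.set_all_param_values(l_target, values)

l_out, next_l_out = build_tiny_net(), build_tiny_net()
sync_target_network(l_out, next_l_out)   # e.g. when update_counter % freeze_interval == 0
assert all(np.allclose(a, b)
           for a, b in zip(lasagne.layers.get_all_param_values(l_out),
                           lasagne.layers.get_all_param_values(next_l_out)))
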
Ejemplo n.º 51
0
    def __init__(self, rng=None, Xd=None, \
            g_net=None, i_net=None, pn_seq=None, \
            data_dim=None, prior_dim=None, \
            params=None):
        # setup a rng for this ADPair
        self.rng = RandStream(rng.randint(100000))

        if (params is None):
            self.params = {}
        else:
            self.params = params
        if 'mean_transform' in self.params:
            # apply a user-defined transform to the GenNet output prior to
            # rescaling by self.lam_mnb...
            self.mean_transform = self.params['mean_transform']
        else:
            # default transform is sigmoid -> shift -> scale so that
            # perturbations (for each dimension) are in range -1 --> 1.
            self.mean_transform = lambda x: 2.0 * (apply_sigmoid(x) - 0.5)

        # record the symbolic variables that will provide inputs to the
        # computation graph created to describe this ADPair
        self.Xd = Xd
        self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq
        self.Xc = 0.0 * self.Xd
        self.Xm = 0.0 * self.Xd
        self.obs_count = T.cast(Xd.shape[0], 'floatX')

        # create a "shared-parameter" clone of the inferencer, set up to
        # receive input from the appropriate symbolic variables.
        self.IN = i_net.shared_param_clone(rng=rng, \
                Xd=self.Xd, Xc=self.Xc, Xm=self.Xm)
        # capture a handle for samples from the variational posterior
        self.Xp = self.IN.output
        # create a "shared-parameter" clone of the generator, set up to
        # receive input from samples from the variational posterior
        self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output)
        assert(self.GN.out_type == 'gaussian') # check for right output
        # set up a var for controlling the max-norm bound on perturbations
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.lam_mnb = theano.shared(value=zero_ary, \
                name='adp_lam_mnb')
        self.set_lam_mnb(lam_mnb=0.1)

        # rescale the perturbations, to make them adjustably norm-bounded
        self.Xg = self.lam_mnb[0] * self.mean_transform(self.GN.output_mean)

        # record and validate the data dimensionality parameters
        self.data_dim = data_dim
        self.prior_dim = prior_dim
        # output of the generator and input to the inferencer should both be
        # equal to self.data_dim
        assert(self.data_dim == self.GN.mlp_layers[-1].out_dim)
        assert(self.data_dim == self.IN.shared_layers[0].in_dim)
        # input of the generator and mu/sigma outputs of the inferencer should
        # both be equal to self.prior_dim
        assert(self.prior_dim == self.GN.mlp_layers[0].in_dim)
        assert(self.prior_dim == self.IN.mu_layers[-1].out_dim)
        assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim)

        # make a clone of the target PeaNetSeq that takes perturbed inputs
        self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \
                seq_Xd=[self.Xd, (self.Xd + self.Xg)])
        # get the symbolic vars for passing inputs to self.PNS
        self.Xd_seq = self.PNS.Xd_seq
        self.Yd_seq = self.PNS.Yd_seq
        self.seq_inputs = self.Xd_seq + self.Yd_seq

        # shared var learning rate for generator and inferencer
        self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn')
        self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in')
        # shared var momentum parameters for generator and inferencer
        self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1')
        self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2')
        self.it_count = theano.shared(value=zero_ary, name='adp_it_count')
        # init parameters for controlling learning dynamics
        self.set_all_sgd_params()
        # init shared var for weighting nll of data given posterior sample
        self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv')
        self.set_lam_adv(lam_adv=1.0)
        # init shared var for weighting Gaussian prior over the policy
        self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld')
        self.set_lam_kld(lam_kld=1.0)
        # init shared var for controlling l2 regularization on params
        self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w')
        self.set_lam_l2w(1e-4)

        # Grab the full set of "optimizable" parameters from the generator
        # and inferencer networks that we'll be working with.
        self.in_params = [p for p in self.IN.mlp_params]
        self.gn_params = [p for p in self.GN.mlp_params]
        self.joint_params = self.in_params + self.gn_params

        ###################################
        # CONSTRUCT THE COSTS TO OPTIMIZE #
        ###################################
        self.adv_cost = self.lam_adv[0] * self._construct_adv_cost()
        self.kld_cost = self.lam_kld[0] * self._construct_kld_cost()
        self.other_reg_cost = self._construct_other_reg_cost()
        self.joint_cost = self.adv_cost + self.kld_cost + \
                self.other_reg_cost

        # Get the gradient of the joint cost for all optimizable parameters
        self.joint_grads = OrderedDict()
        for p in self.joint_params:
            self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.05, 0.05)

        # Construct the updates for the generator and inferencer networks
        self.gn_updates = get_adam_updates(params=self.gn_params, \
                grads=self.joint_grads, alpha=self.lr_gn, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.in_updates = get_adam_updates(params=self.in_params, \
                grads=self.joint_grads, alpha=self.lr_in, \
                beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \
                mom2_init=1e-3, smoothing=1e-8)
        self.joint_updates = OrderedDict()
        for k in self.gn_updates:
            self.joint_updates[k] = self.gn_updates[k]
        for k in self.in_updates:
            self.joint_updates[k] = self.in_updates[k]

        # Construct a function for jointly training the generator/inferencer
        self.train_joint = self._construct_train_joint()

        # Construct a function for computing the outputs of the generator
        # network for a batch of noise. Presumably, the noise will be drawn
        # from the same distribution that was used in training....
        self.sample_from_gn = self.GN.sample_from_model
        self.sample_from_Xd = self._construct_sample_from_Xd()
        return
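
A small NumPy sketch of the default perturbation bound used above: the sigmoid/shift/scale transform maps every generator output into (-1, 1), so multiplying by lam_mnb caps the max-norm of the perturbation at lam_mnb:

import numpy as np

def default_mean_transform(x):
    # sigmoid -> shift -> scale, so each entry lands in (-1, 1)
    return 2.0 * (1.0 / (1.0 + np.exp(-x)) - 0.5)

lam_mnb = 0.1                                        # max-norm bound on Xg
raw_output = np.array([-5.0, -0.3, 0.0, 0.3, 5.0])   # stand-in for GN.output_mean

Xg = lam_mnb * default_mean_transform(raw_output)
print(Xg)
assert np.all(np.abs(Xg) < lam_mnb)                  # every perturbation stays inside the bound
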
Ejemplo n.º 52
0
    def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, network_type=None, update_rule="rmsprop", batch_accumulator="sum", random_state=np.random.RandomState(), double_Q=False, neural_network=NN):
        """ Initialize environment
        
        """
        QNetwork.__init__(self,environment, batch_size)
        
        self._rho = rho
        self._rms_epsilon = rms_epsilon
        self._momentum = momentum
        self._clip_delta = clip_delta
        self._freeze_interval = freeze_interval
        self._double_Q = double_Q
        self._random_state = random_state
        
        self.update_counter = 0
        
        states=[]   # list of symbolic variables, one per element of the belief state
                    # --> [ T.tensor4 if the per-step observation is a matrix, T.tensor3 if it is a vector, T.matrix if it is a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables, one per element of the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._input_dimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        Q_net=neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state)
        self.q_vals, self.params, shape_after_conv = Q_net._buildDQN(states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_q_vals, self.next_params, shape_after_conv = Q_net._buildDQN(next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        
        
        if(self._double_Q==True):
            givens_next={}
            for i, x in enumerate(self.next_states_shared):
                givens_next[ states[i] ] = x

            self.next_q_vals_current_qnet=theano.function([], self.q_vals,
                                          givens=givens_next)

            next_q_curr_qnet = theano.clone(self.next_q_vals)

            argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True)

            max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1))

        else:
            max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True)


        not_terminals=T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))
        # Note: strangely, (target - q_val) led to problems with python 3.5, theano 0.8.0rc and floatX=float32...
        diff = - q_val + target 

        if self._clip_delta > 0:
            # This loss function implementation is taken from
            # https://github.com/spragunr/deep_q_rl
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self._clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss_ind = 0.5 * quadratic_part ** 2 + self._clip_delta * linear_part
        else:
            loss_ind = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss_ind)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss_ind)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
                
        gparams=[]
        for p in self.params:
            gparam =  T.grad(loss, p)
            gparams.append(gparam)

        updates = []
        
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self._rho,
                                       self._rms_epsilon)
        elif update_rule == 'rmsprop':
            for i,(p, g) in enumerate(zip(self.params, gparams)):                
                acc = theano.shared(p.get_value() * 0.)
                acc_new = self._rho * acc + (1 - self._rho) * g ** 2
                gradient_scaling = T.sqrt(acc_new + self._rms_epsilon)
                g = g / gradient_scaling
                updates.append((acc, acc_new))
                updates.append((p, p - thelr * g))

        elif update_rule == 'sgd':
            for i, (param, gparam) in enumerate(zip(self.params, gparams)):
                updates.append((param, param - thelr * gparam))
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))
    
        
        if(self._double_Q==True):
            self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        else:
            self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], self.q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
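
A minimal NumPy sketch of the target built in the double_Q branch above: the action is selected by the argmax of the online network's next-state Q-values, but its value is read from the other network, whereas the standard branch simply takes the max of that network's own estimates:

import numpy as np

rng = np.random.RandomState(1)
batch_size, num_actions, thediscount = 4, 3, 0.95

rewards = rng.rand(batch_size, 1)
terminals = np.array([[0], [1], [0], [0]])
q_next_online = rng.rand(batch_size, num_actions)   # online net evaluated on s_{t+1}
q_next_frozen = rng.rand(batch_size, num_actions)   # frozen net evaluated on s_{t+1}

# standard target: max over the frozen net's own estimates
max_next = q_next_frozen.max(axis=1, keepdims=True)

# double-Q target: argmax from the online net, value from the frozen net
argmax_online = q_next_online.argmax(axis=1)
double_next = q_next_frozen[np.arange(batch_size), argmax_online].reshape(-1, 1)

not_terminals = 1 - terminals
target_standard = rewards + not_terminals * thediscount * max_next
target_double = rewards + not_terminals * thediscount * double_next
print(np.hstack([target_standard, target_double]))
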
Ejemplo n.º 53
0
    def __init__(self,
                 num_actions,
                 phi_length,
                 width,
                 height,
                 discount=.9,
                 learning_rate=.01,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.scale_input_by = 255.0

        print "neural net initialization, lr is: ", self.learning_rate, approximator

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(
            layers.Input2DLayer(self._batch_size, self._num_input_features,
                                self._img_height, self._img_width,
                                self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(
                cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=16,
                                          filter_width=8,
                                          filter_height=8,
                                          stride_x=4,
                                          stride_y=4,
                                          weights_std=.01,
                                          init_bias_value=0.01))

            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=32,
                                          filter_width=4,
                                          filter_height=4,
                                          stride_x=2,
                                          stride_y=2,
                                          weights_std=.01,
                                          init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=256,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))

        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))

        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers) - 1):
            print self.q_layers[i].get_output_shape()

        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_height,
                                                  self._img_width,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        print "building loss funtion"
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked**2)
        self._loss = error * diff_masked.shape[1]  # undo the mean over the action axis, leaving the batch-mean squared TD error

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (1, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'),
                                            broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
            self._loss, self._parameters, learning_rate=self.learning_rate,
            rho=0.9, momentum=0.9, epsilon=1e-6)

        self._train = theano.function([self._idx],
                                      self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
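
This snippet selects the error of the chosen action by multiplying the full diff matrix with a one-hot mask instead of indexing. A NumPy sketch, assuming build_mask writes 1.0 at each (row, action) position, showing that the two are equivalent and why the mean is rescaled by diff_masked.shape[1]:

import numpy as np

rng = np.random.RandomState(2)
batch_size, num_actions = 4, 3

diff = rng.randn(batch_size, num_actions)   # target - q_vals, for all actions
actions = np.array([2, 0, 1, 1])

# one-hot mask: 1.0 at the chosen action of each row, 0.0 elsewhere
# (what build_mask(T.zeros_like(diff), actions, 1.0) is assumed to produce)
mask = np.zeros_like(diff)
mask[np.arange(batch_size), actions] = 1.0

diff_masked = diff * mask

# masking then summing across actions equals direct indexing
per_sample_masked = diff_masked.sum(axis=1)
per_sample_gather = diff[np.arange(batch_size), actions]
assert np.allclose(per_sample_masked, per_sample_gather)

# error * diff_masked.shape[1] == mean over the batch of the squared per-sample diffs
error = (diff_masked ** 2).mean() * diff_masked.shape[1]
assert np.isclose(error, (per_sample_gather ** 2).mean())
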
Ejemplo n.º 54
0
def manifold_walk_regularization():

    for t_num in range(10):
	    out_file = open("MWR_TEST_RESULTS_{0:d}.txt".format(t_num), 'wb')

	    # Initialize a source of randomness
	    rng = np.random.RandomState(t_num)

	    # Load some data to train/validate/test with
	    sup_count = 600
	    dataset = 'data/mnist.pkl.gz'
	    datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False)
	    Xtr_su = datasets[0][0].get_value(borrow=False)
	    Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32)
	    Xtr_un = datasets[1][0].get_value(borrow=False)
	    Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32)

	    # get the joint labeled and unlabeled data
	    Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX)
	    Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]])
	    Ytr_un = 0 * Ytr_un # KEEP CATS FIXED OR FREE? YES/NO?
	    Xtr_mean = np.mean(Xtr_un, axis=0, keepdims=True)
	    # get the labeled data
	    Xtr_su = Xtr_su.astype(theano.config.floatX)
	    Ytr_su = Ytr_su[:,np.newaxis]
	    # get observations and labels for the validation set
	    Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX)
	    Yva = datasets[2][1].get_value(borrow=False).astype(np.int32)
	    Yva = Yva[:,np.newaxis] # numpy is dumb
	    # get observations and labels for the test set
	    Xte = datasets[3][0].get_value(borrow=False).astype(theano.config.floatX)
	    Yte = datasets[3][1].get_value(borrow=False).astype(np.int32)
	    Yte = Yte[:,np.newaxis] # numpy is dumb
	    # get size information for the data and training batches
	    un_samples = Xtr_un.shape[0]
	    su_samples = Xtr_su.shape[0]
	    va_samples = Xva.shape[0]
	    data_dim = Xtr_su.shape[1]
	    label_dim = 10
	    batch_size = 100

	    # Symbolic inputs
	    Xd = T.matrix(name='Xd')
	    Xc = T.matrix(name='Xc')
	    Xm = T.matrix(name='Xm')
	    Xt = T.matrix(name='Xt')
	    Xp = T.matrix(name='Xp')
	    Yd = T.icol('Yd')

	    # Load inferencer and generator from saved parameters
	    gn_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_GN.pkl"
	    in_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_IN.pkl"
	    IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd)
	    GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp)
	    IN.set_sigma_scale(1.3)
	    prior_dim = GN.latent_dim

	    MCS = MCSampler(rng=rng, Xd=Xd, i_net=IN, g_net=GN, chain_len=2, \
	                    data_dim=data_dim, prior_dim=prior_dim)
	    full_chain_len = MCS.chain_len + 1

	    # setup "chain" versions of the labeled/unlabeled/validate sets
	    Xtr_su_chains = [Xtr_su.copy() for i in range(full_chain_len)]
	    Xtr_un_chains = [Xtr_un.copy() for i in range(full_chain_len)]
	    Ytr_su_chains = [Ytr_su for i in range(full_chain_len)]
	    Ytr_un_chains = [Ytr_un for i in range(full_chain_len)]
	    Xva_chains = [Xva for i in range(full_chain_len)]
	    Yva_chains = [Yva for i in range(full_chain_len)]

	    # downsample, to feed less into the PNS
	    Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1)
	    Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1)
	    Ytr_su_short = downsample_chains(Ytr_su_chains, stride=1)
	    Ytr_un_short = downsample_chains(Ytr_un_chains, stride=1)
	    Xva_short = downsample_chains(Xva_chains, stride=1)
	    Yva_short = downsample_chains(Yva_chains, stride=1)
	    short_chain_len = len(Xtr_su_short)
	    print("REGULARIZATION CHAIN STEPS: {0:d}".format(short_chain_len))

	    # choose some parameters for the categorical inferencer
	    pn_params = {}
	    pc0 = [data_dim, 800, 800, label_dim]
	    pn_params['proto_configs'] = [pc0]
	    # Set up some spawn networks
	    sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True}
	    pn_params['spawn_configs'] = [ sc0 ]
	    pn_params['spawn_weights'] = [ 1.0 ]
	    # Set remaining params
	    pn_params['activation'] = relu_actfun
	    pn_params['init_scale'] = 0.5
	    pn_params['lam_l2a'] = 1e-3
	    pn_params['vis_drop'] = 0.2
	    pn_params['hid_drop'] = 0.5

	    # Initialize the base network for this PNSeq
	    PN = PeaNet(rng=rng, Xd=Xd, params=pn_params)
	    PN.init_biases(0.1)

	    print("Initializing PNS...")
	    # Initialize the PeaNetSeq
	    PNS = PeaNetSeq(rng=rng, pea_net=PN, seq_len=short_chain_len, \
	    		seq_Xd=None, params=None)

	    # set weighting parameters for the various costs...
	    PNS.set_lam_class(1.0)
	    PNS.set_lam_pea_su(0.0)
	    PNS.set_lam_pea_un(2.0)
	    PNS.set_lam_ent(0.0)
	    PNS.set_lam_l2w(1e-5)

	    learn_rate = 0.05
	    PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999)
	    for i in range(300000):
	        if i < 5000:
	            scale = float(i + 1) / 5000.0
	        if (((i+1) % 100000) == 0):
	            learn_rate = learn_rate * 0.5
	        if ((i % 250) == 0):
	        	Xtr_su_chains = resample_chain_steps(MCS, Xtr_su_chains)
	        	Xtr_un_chains = resample_chain_steps(MCS, Xtr_un_chains)
	        	Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1)
	        	Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1)
	        # get some data to train with
	        su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,))
	        xsuc = [(x.take(su_idx, axis=0) - Xtr_mean) for x in Xtr_su_short]
	        ysuc = [y.take(su_idx, axis=0) for y in Ytr_su_short]
	        un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,))
	        xunc = [(x.take(un_idx, axis=0) - Xtr_mean) for x in Xtr_un_short]
	        yunc = [y.take(un_idx, axis=0) for y in Ytr_un_short]
	        Xb_chains = [np.vstack((xsu, xun)) for (xsu, xun) in zip(xsuc, xunc)]
	        Yb_chains = [np.vstack((ysu, yun)) for (ysu, yun) in zip(ysuc, yunc)]
	        # set learning parameters for this update
	        PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999)
	        # do a minibatch update of all PeaNet parameters
	        outputs = PNS.train_joint(*(Xb_chains + Yb_chains))
	        joint_cost = 1.0 * outputs[0]
	        class_cost = 1.0 * outputs[1]
	        pea_cost = 1.0 * outputs[2]
	        ent_cost = 1.0 * outputs[3]
	        other_reg_cost = 1.0 * outputs[4]
	        assert(not (np.isnan(joint_cost)))
	        if ((i % 500) == 0):
	            o_str = "batch: {0:d}, joint: {1:.4f}, class: {2:.4f}, pea: {3:.4f}, ent: {4:.4f}, other_reg: {5:.4f}".format( \
	                    i, joint_cost, class_cost, pea_cost, ent_cost, other_reg_cost)
	            print(o_str)
	            out_file.write(o_str+"\n")
	            out_file.flush()
	            # check classification error on training and validation set
	            train_err = PNS.classification_error(Xtr_su-Xtr_mean, Ytr_su)
	            va_err = PNS.classification_error(Xva-Xtr_mean, Yva)
	            o_str = "    tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err)
	            print(o_str)
	            out_file.write(o_str+"\n")
	            out_file.flush()
	        if ((i % 1000) == 0):
	            # draw the main PeaNet's first-layer filters/weights
	            file_name = "MWR_PN_WEIGHTS.png".format(i)
	            utils.visualize_net_layer(PNS.PN.proto_nets[0][0], file_name)
	    print("TESTING COMPLETE!")
Ejemplo n.º 55
0
    def __init__(self, input, n_in, n_out):

        hidden_size = 36
        batch_size = 32
        self._w_h = init_weights((n_in, hidden_size))
        self._b_h = init_b_weights((1, hidden_size))
        # self._b_h = init_b_weights((hidden_size,))
        self._w_h2 = init_weights((hidden_size, hidden_size))
        self._b_h2 = init_b_weights((1, hidden_size))
        # self._b_h2 = init_b_weights((hidden_size,))
        # self._w_o = init_tanh(hidden_size, n_out)
        self._w_o = init_weights((hidden_size, n_out))
        self._b_o = init_b_weights((1, n_out))
        # self._b_o = init_b_weights((n_out,))

        self.updateTargetModel()
        self._w_h_old = init_weights((n_in, hidden_size))
        self._w_h2_old = init_weights((hidden_size, hidden_size))
        self._w_o_old = init_tanh(hidden_size, n_out)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.00025
        self._discount_factor = 0.99

        self._weight_update_steps = 5000
        self._updates = 0

        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, 2)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, 2)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))
        # Q_val = T.fmatrix()

        # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1)))
        # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True)
        py_x = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2,
                          self._w_o, self._b_o, 0.0, 0.0)
        y_pred = T.argmax(py_x, axis=1)
        q_func = T.mean((self.model(State, self._w_h, self._b_h, self._w_h2,
                                    self._b_h2, self._w_o, self._b_o, 0.0,
                                    0.0))[T.arange(batch_size),
                                          Action.reshape((-1, ))].reshape(
                                              (-1, 1)))
        # q_val = py_x
        # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self._L1 = (abs(self._w_h).sum() + abs(self._w_h2).sum() +
                    abs(self._w_o).sum())
        self._L1_reg = 0.0
        self._L2_reg = 0.001
        # L2 norm ; one regularization option is to enforce
        # L2 norm to be small
        self._L2 = ((self._w_h**2).sum() + (self._w_h2**2).sum() +
                    (self._w_o**2).sum())

        # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
        # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State))
        delta = ((Reward + (self._discount_factor * T.max(self.model(
            ResultState, self._w_h_old, self._b_h_old, self._w_h2_old,
            self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5),
                                                          axis=1,
                                                          keepdims=True))) -
                 (self.model(State, self._w_h, self._b_h, self._w_h2,
                             self._b_h2, self._w_o, self._b_o, 0.2,
                             0.5))[T.arange(Action.shape[0]),
                                   Action.reshape((-1, ))].reshape((-1, 1)))
        # bellman_cost = T.mean( 0.5 * ((delta) ** 2 ))
        bellman_cost = T.mean(0.5 * ((delta)**2)) + (
            self._L2_reg * self._L2) + (self._L1_reg * self._L1)

        params = [
            self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o
        ]
        # updates = sgd(bellman_cost, params, lr=self._learning_rate)
        # updates = rlTDSGD(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = RMSprop(bellman_cost, params, lr=self._learning_rate)
        # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01)
        updates = lasagne.updates.rmsprop(q_func, params,
                                          self._learning_rate * -T.mean(delta),
                                          0.95, 0.01)

        self._train = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_cost,
            updates=updates,
            allow_input_downcast=True)
        self._predict = theano.function(inputs=[State],
                                        outputs=y_pred,
                                        allow_input_downcast=True)
        self._q_values = theano.function(inputs=[State],
                                         outputs=py_x,
                                         allow_input_downcast=True)
        self._bellman_error = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=delta,
            allow_input_downcast=True)
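
The update above passes -T.mean(delta) times the base learning rate as RMSProp's step size while "minimizing" the batch-mean Q-value, so each step is roughly proportional to mean(delta) times the (RMS-scaled) gradient of that mean, i.e. a batch-averaged semi-gradient TD update. For comparison, a minimal NumPy sketch of the plain semi-gradient TD(0) update for a linear Q-function, without RMSProp scaling:

import numpy as np

rng = np.random.RandomState(3)
n_features, n_actions = 5, 3
w = 0.01 * rng.randn(n_features, n_actions)   # linear Q: Q(s, a) = phi(s) . w[:, a]
gamma, lr = 0.99, 0.01

phi, phi_next = rng.randn(n_features), rng.randn(n_features)
action, reward = 1, 0.5

# TD error for the action actually taken
delta = reward + gamma * (phi_next @ w).max() - (phi @ w)[action]

# semi-gradient TD(0): move Q(s, a) toward the bootstrapped target;
# for a linear Q, dQ(s, a)/dw[:, a] is simply phi(s)
w[:, action] += lr * delta * phi
print(delta, (phi @ w)[action])
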
Ejemplo n.º 56
0
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, 
                 update_rule, batch_accumulator, randomState, frame_scale=255.0):
        """ Initialize environment

        Arguments:
            environment - the environment (class Env) 
            num_elements_in_batch - list of k integers for the number of each element kept as belief state
            num_actions - int
            discount - float
            learning_rate - float
            rho, rms_epsilon, momentum - float, float, float
            ...
            network_type - string 
            ...           
        """

        self._environment = environment
        
        self._batchSize = batchSize
        self._inputDimensions = self._environment.inputDimensions()
        self._nActions = self._environment.nActions()
        self._df = 0
        self.rho = rho
        self._lr = 0
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState
        
        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0
        
        states=[]   # list of symbolic variables, one per element of the belief state
                    # --> [ T.tensor4 if the per-step observation is a matrix, T.tensor3 if it is a vector, T.matrix if it is a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables, one per element of the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batchSize, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))


        q_vals = lasagne.layers.get_output(self.l_out)        
        
        next_q_vals = lasagne.layers.get_output(self.next_l_out)
        
        max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True)
        
        # mask out the bootstrap term for terminal transitions
        not_terminals = T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)
        
            
        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
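        # _train takes only the scalar discount factor and learning rate as
        # explicit inputs; the minibatch itself is read from the shared
        # variables through the givens mapping, so those shared variables
        # (states, actions, rewards, terminals) are expected to be filled,
        # e.g. via set_value, before each call.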
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
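
The clip_delta branch above is a Huber-style loss: quadratic for small temporal-difference errors and linear beyond clip_delta, which keeps the gradient magnitude bounded exactly as the inline comment describes. A minimal, standalone NumPy sketch of that piecewise loss (function and variable names are illustrative, not taken from the example above):

import numpy as np

def clipped_delta_loss(diff, clip_delta=1.0):
    # Quadratic inside [-clip_delta, clip_delta], linear outside, so the
    # derivative never exceeds clip_delta in absolute value.
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

# Errors beyond the clip point grow linearly rather than quadratically:
print(clipped_delta_loss(np.array([0.5, 1.0, 3.0])))  # [0.125, 0.5, 2.5]
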
Ejemplo n.º 57
0
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(CACLA2, self).__init__(model, n_in, n_out, state_bounds,
                                     action_bounds, reward_bound, settings_)

        # create a small convolutional neural network

        self._Fallen = T.icol("Action")
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int32'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        self._target_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='float64'),
                                            broadcastable=(False, True))

        self._critic_regularization_weight = self.getSettings()["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]
        # primary network
        self._model = model
        # Target network
        # self._modelTarget = copy.deepcopy(model)

        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsA_drop = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)
        self._q_valsTargetNextState = lasagne.layers.get_output(
            self._model.getCriticNetwork(),
            self._model.getResultStateSymbolicVariable())
        # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable())
        # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        # self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable())
        self._q_valsActA_drop = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=False)

        self._q_func = self._q_valsA
        # self._q_funcTarget = self._q_valsTarget
        self._q_func_drop = self._q_valsA_drop
        # self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        self._q_funcAct_drop = self._q_valsActA_drop
        # self._q_funcAct = theano.function(inputs=[self._model.getStateSymbolicVariable()], outputs=self._q_valsActA, allow_input_downcast=True)

        # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.maximum(1.0, theano.tensor.ceil(self._model.getRewardSymbolicVariable())) # Did not understand how the maximum was working
        # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.ceil(self._model.getRewardSymbolicVariable())
        self._target = (self._model.getRewardSymbolicVariable() +
                        (self._discount_factor *
                         self._q_valsTargetNextState)) * self._Fallen
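        # Multiplying by self._Fallen (the int32 column declared above with
        # T.icol) presumably zeroes the target for samples where the agent has
        # fallen, acting like a terminal mask despite the variable being
        # named "Action".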
        # self._target = self._model.getTargetSymbolicVariable()
        self._diff = self._target_shared - self._q_func
        self._diff_drop = self._target_shared - self._q_func_drop
        loss = 0.5 * self._diff**2
        self._loss = T.mean(loss)
        self._loss_drop = T.mean(0.5 * self._diff_drop**2)

        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._Fallen: self._fallen_shared
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._next_states_shared,
            # self._model.getRewardSymbolicVariable(): self._rewards_shared,
            self._model.getActionSymbolicVariable():
            self._model.getActions()
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))
        self._actor_regularization = (
            self._regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getActorNetwork(), lasagne.regularization.l2))
        # RMSProp update on the regularized critic loss
        self._updates_ = lasagne.updates.rmsprop(
            self._loss +
            (self._regularization_weight *
             lasagne.regularization.regularize_network_params(
                 self._model.getCriticNetwork(), lasagne.regularization.l2)),
            self._params, self._learning_rate, self._rho, self._rms_epsilon)
        # TD update
        # self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params,
        #             self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon)

        # actDiff1 = (self._model.getActionSymbolicVariable() - self._q_valsActTarget) #TODO is this correct?
        # actDiff = (actDiff1 - (self._model.getActionSymbolicVariable() - self._q_valsActA))
        self._actDiff = (
            (self._model.getActionSymbolicVariable() - self._q_valsActA)
        )  # Target network does not work well here?
        self._actDiff_drop = (
            (self._model.getActionSymbolicVariable() - self._q_valsActA_drop)
        )  # Target network does not work well here?
        self._actLoss = 0.5 * self._actDiff**2
        self._actLoss = T.sum(self._actLoss) / float(self._batch_size)
        self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop**2) /
                              float(self._batch_size)
                              )  # because the number of rows can shrink

        self._actionUpdates = lasagne.updates.rmsprop(
            self._actLoss + self._actor_regularization, self._actionParams,
            self._learning_rate, self._rho, self._rms_epsilon)

        # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) +
        #   (self._regularization_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams,
        #           self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon)
        self._givens_grad = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._actions_shared,
        }
        CACLA2.compile(self)
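
A pattern shared by the examples above is to pair a T.icol symbolic variable with an int32 shared variable of shape (batch_size, 1) created with broadcastable=(False, True), so the shared minibatch can be substituted for the column through givens. A minimal self-contained sketch of that wiring (names are illustrative):

import numpy as np
import theano
import theano.tensor as T

batch_size = 32

terminals = T.icol('terminals')  # symbolic int32 column, broadcastable (False, True)
terminals_shared = theano.shared(
    np.zeros((batch_size, 1), dtype='int32'),
    broadcastable=(False, True))  # must match the icol's broadcast pattern

# Count the non-terminal samples currently loaded in the shared variable.
n_live = T.sum(1 - terminals)
count_live = theano.function([], n_live, givens={terminals: terminals_shared})

terminals_shared.set_value(
    np.random.randint(0, 2, size=(batch_size, 1)).astype('int32'))
print(count_live())
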
Ejemplo n.º 58
0
    def __init__(self,
                 input_size,
                 output_size,
                 build_network=simple_network2,
                 discount=0.99,
                 learningRate=0.001,
                 frozen_network_update_time=1000):

        print "Initializing new Q network"

        self.input_size = input_size
        self.output_size = output_size
        self.discount = discount
        self.learningRate = learningRate

        self.frozen_network_update_time = frozen_network_update_time
        self.frozen_timer = 0
        self.epoch = 0

        # logging variables
        self.log = {
            "batchMeanQValue": [],
            "batchMeanTargetQValue": [],
            "cost": [],
            'performance': [],
            'epoch': []
        }

        # symbolic inputs
        sym_state = T.tensor4('state')  #Batchsize, channels, X, Y
        sym_action = T.icol('action')
        sym_reward = T.col('reward')
        sym_isDone = T.bcol('isDone')
        sym_nextState = T.tensor4('nextState')

        # networks
        self.network = build_network(input_size, output_size)
        self.frozen_network = build_network(input_size, output_size)
        self.update_frozen_network()
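        # The frozen network is a periodically synchronised copy of the online
        # network used to compute bootstrap targets; judging by
        # frozen_network_update_time and frozen_timer above, it is presumably
        # refreshed every frozen_network_update_time training steps.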

        # forward pass
        print "Compiling forward passes"
        self.forward_pass = theano.function([sym_state],
                                            lasagne.layers.get_output(
                                                self.network,
                                                sym_state,
                                                deterministic=True))

        self.frozen_forward_pass = theano.function([sym_state],
                                                   lasagne.layers.get_output(
                                                       self.frozen_network,
                                                       sym_state,
                                                       deterministic=True))

        #clipped_reward = T.clip(sym_reward,-1,1)
        #cost function definition
        cost, error, q_action, q_target = self.build_cost_function(
            sym_state, sym_action, sym_reward, sym_isDone, sym_nextState)

        params = lasagne.layers.get_all_params(self.network, trainable=True)
        update_function = lasagne.updates.rmsprop(
            cost, params, learning_rate=self.learningRate)

        # training function
        print "Compiling training function"
        self._train = theano.function(
            [sym_state, sym_action, sym_reward, sym_isDone, sym_nextState],
            [cost, error, q_action, q_target],
            updates=update_function)
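        # self._train consumes one minibatch of (state, action, reward, isDone,
        # nextState) arrays, performs a single RMSProp step on the online
        # network and returns the cost together with the error and Q-value
        # diagnostics produced by build_cost_function.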