def get_loss_sarsa_function(self):
    # args
    self.states = T.matrix('state')
    self.actions = T.icol('action')
    self.next_states = T.matrix('next_state')
    self.next_actions = T.icol('next_action')
    self.rewards = T.col('reward')
    # q(s,a)
    actionmask = T.eq(T.arange(self.nactions).reshape((1, -1)),
                      self.actions.reshape((-1, 1))).astype(theano.config.floatX)
    q_action = (get_output(self.network, self.states) *
                actionmask).sum(axis=1).reshape((-1, 1))
    # q(s_next,a_next)
    next_actionmask = T.eq(T.arange(self.nactions).reshape((1, -1)),
                           self.next_actions.reshape((-1, 1))).astype(theano.config.floatX)
    next_q_action = (get_output(self.network, self.next_states) *
                     next_actionmask).sum(axis=1).reshape((-1, 1))
    # loss = target - qvalue
    loss = (self.rewards + self.discount * next_q_action - q_action)
    # mse
    mse = 0.5 * loss ** 2
    # sum loss
    return T.sum(mse)
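# Hedged aside (not part of the original code): a tiny, self-contained demo of the
# action-mask trick used above to pick out Q(s, a) for each row's chosen action.
# All names below are illustrative.
import numpy as np
import theano
import theano.tensor as T

q = T.matrix('q')    # (batch, n_actions) Q-values
a = T.icol('a')      # (batch, 1) chosen actions
mask = T.eq(T.arange(q.shape[1]).reshape((1, -1)),
            a.reshape((-1, 1))).astype(theano.config.floatX)
q_selected = (q * mask).sum(axis=1).reshape((-1, 1))
pick = theano.function([q, a], q_selected)
print(pick(np.arange(6, dtype=theano.config.floatX).reshape(2, 3),
           np.array([[2], [0]], dtype='int32')))  # -> [[2.], [3.]]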
def __init__(self, args): reward = T.col('r') action = T.icol('a') terminal = T.icol('t') discount = T.scalar('gamma') learningRate = T.scalar('lr') rho = T.scalar('rho') epsilon = T.scalar('eps') rng = np.random.RandomState(42) self.batchNb = args.batchSize #convLayers = [[(8,8),(4,4),64], # [(4,4),(2,2),128], # [(3,3),(1,1),256], # [(3,3),(1,1),512]] #fcl = [1024, 6] convLayers = [[(8,8),(4,4),64], [(4,4),(2,2),128], [(3,3),(1,1),256], [(3,3),(1,1),256]] fcl = [1024, args.actionNb] self.q1 = NetStruct(convLayers, fcl, (4,100,100), rng, args) self.q2 = NetStruct(convLayers, fcl, (4,100,100), rng, args) self.q2.setParams(self.q1) self.states = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32')) self.states2 = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32')) self.actions = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True)) self.rewards = theano.shared(np.zeros((args.batchSize,1), dtype='float32'), broadcastable=(False,True)) self.terminals = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True)) self.learningRate = theano.shared(np.array(args.learningRate, dtype='float32')) self.rho = theano.shared(np.array(args.rmsPropRho, dtype='float32')) self.epsilon = theano.shared(np.array(args.rmsPropEpsilon, dtype='float32')) self.discount = theano.shared(np.array(args.discountFactor, dtype='float32')) loss = self.QLoss(self.q1.output, self.q2.output, action, reward, terminal, discount) params = self.q1.getParams() updates = self.rmsProp(loss, params, rho, epsilon, learningRate) self.train_model = theano.function( [], loss, updates=updates, givens = { self.q1.input: self.states, self.q2.input: self.states2, action: self.actions, reward: self.rewards, terminal: self.terminals, discount: self.discount, learningRate: self.learningRate, rho: self.rho, epsilon: self.epsilon } )
def __init__(self, lenW, dimW, dimS):
    self.W = th.shared(np.random.randn(lenW, dimW))
    self.Uw = th.shared(np.random.randn(dimW, dimS))
    self.Us = th.shared(np.random.randn(dimS, dimS))
    self.V = th.shared(np.random.randn(dimS, lenW))
    self.S0 = th.shared(np.random.randn(dimS,))
    self.idx = T.icol()
    self.w = self.W[self.idx].reshape((self.idx.shape[0], self.W.shape[1]))

    def recurrence(w, s):
        # import ipdb; ipdb.set_trace()
        s1 = T.nnet.sigmoid(T.dot(w, self.Uw))
        s2 = T.nnet.sigmoid(T.dot(s, self.Us))
        ss = s1 + s2
        pp = T.dot(s, self.V)
        return [ss, pp]

    [self.S, self.PP], _ = th.scan(fn=recurrence,
                                   sequences=self.w,
                                   outputs_info=[self.S0, None],
                                   n_steps=self.w.shape[0])
    self.P = T.nnet.softmax(self.PP)
    self.RP = self.P[T.arange(self.w.shape[0]), self.idx[:, 0]]
    self.cost = -T.sum(T.log(self.RP))
    self.params = [self.W, self.Uw, self.Us, self.V, self.S0]
    self.grads = T.grad(self.cost, self.params)
    self.lr = T.scalar()
    self.updates = map(lambda (param, grad): (param, param - self.lr * grad),
                       zip(self.params, self.grads))
    self.train_fn = th.function([self.idx, self.lr], [self.cost],
                                updates=self.updates,
                                allow_input_downcast=True)
    self.fprop = th.function([self.idx], [self.S, self.P, self.cost],
                             allow_input_downcast=True)
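# Hedged usage sketch (not from the original): the class name `RNNLM`, the vocabulary
# size, and the learning rate below are assumptions chosen only for illustration.
model = RNNLM(lenW=1000, dimW=64, dimS=128)                  # hypothetical class name
idx = np.random.randint(0, 1000, size=(20, 1)).astype('int32')
cost, = model.train_fn(idx, 0.01)                            # one SGD step on 20 tokens
states, probs, cost = model.fprop(idx)                       # per-step softmax over the vocab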
def __init__(self, state_shape, num_actions, epsilon=1.0, epsilon_min=0.1,
             epsilon_iter=100000, discount=0.99, lrate=1e-3, batch_size=100,
             q_update_iter=1000, capacity=50000):

    # Validate each argument independently; the original if/elif chain skipped
    # later checks (e.g. the capacity check) as soon as an earlier branch matched.
    if not isinstance(state_shape, tuple):
        raise AssertionError('state_shape must be of type <tuple>.')
    if len(state_shape) == 0:
        raise AssertionError('No state space dimensions provided.')
    if num_actions == 0:
        raise ValueError('Number of actions must be > 0.')
    if epsilon_min is not None:
        assert epsilon_min < epsilon, 'Epsilon(min) must be < epsilon(max).'
    if capacity < batch_size:
        raise ValueError('Replay capacity must be > batch_size.')

    self.state_shape = state_shape
    self.num_actions = num_actions
    self.q_network = build_network(state_shape, num_actions)
    self.q_targets = build_network(state_shape, num_actions)
    self.epsilon = epsilon
    self.epsilon_max = epsilon  # How greedy the policy is
    self.epsilon_min = epsilon_min
    self.epsilon_iter = float(epsilon_iter)
    self.discount = discount
    self.lr = lrate
    self.batch_size = batch_size  # How many samples to draw from buffer
    self.q_update_iter = q_update_iter  # Update the q_target every C iter
    self.step = 0
    self.replay_buffer = ReplayBuffer(capacity, state_shape)

    # Build training and sampling functions
    s0_sym = nn.get_all_layers(self.q_network)[0].input_var
    s1_sym = nn.get_all_layers(self.q_targets)[0].input_var
    a_sym = T.icol('actions')  # (n, 1)
    r_sym = T.col('rewards')
    t_sym = T.col('terminal_state')
    sym_vars = [s0_sym, a_sym, r_sym, s1_sym, t_sym]

    # Training phase uses non-deterministic mapping
    loss = T.sum(self._build_loss(*sym_vars, deterministic=False))
    params = nn.get_all_params(self.q_network, trainable=True)
    updates = lasagne.updates.adam(loss, params, self.lr, beta1=0.9)
    self.train_fn = theano.function(sym_vars, loss, updates=updates)

    # Build function for sampling from DQN
    pred = nn.get_output(self.q_network, deterministic=True)
    self.pred_fn = theano.function([s0_sym], pred)
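# Hedged sketch (assumed, not taken from the source): a linear epsilon anneal that is
# consistent with the epsilon / epsilon_min / epsilon_iter fields stored above.
def annealed_epsilon(step, eps_max, eps_min, eps_iter):
    frac = min(float(step) / eps_iter, 1.0)
    return eps_max - frac * (eps_max - eps_min)

# e.g. annealed_epsilon(50000, 1.0, 0.1, 100000) -> 0.55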
def __init__(self, **kwargs): # assign default values that must be present, or else the network will not work self.options = { "networktype": "CNN-BLSTM", "NUMBER_OF_CLASSES": 1, "N_L1": 200, "N_L2": 200, "DROPOUT_IN": 0., "DROPOUT_LSTM": 0.1, "DROPOUT_OUT": 0.5, "DENSELAYER_NODES": 100, "L2": 0.00, "early_stopping": 10, } # load user supplied options for k in kwargs.keys(): self.options[k] = kwargs[k] # define some variables self.options["BS_PR_SEQ"] = self.options[ "SEQ_SIZE"] # bases per sequence - actual sequence length self.options["FS"] = [ self.options["BS_PR_SEQ"] - (self.options["FILTER_SIZES"][i] / len(self.options["VOCAB"])) + 1 for i in range(len(self.options["FILTER_SIZES"])) ] self.options["ALL_F"] = sum(self.options["FS"]) self.options["NUMBER_OF_CONV_LAYERS"] = len( self.options["FILTER_SIZES"]) # temporary compatibility fix self.type = self.options["networktype"] self.VOCAB = self.options["VOCAB"] self.FS = self.options["FS"] self.ALL_F = self.options["ALL_F"] self.BS_PR_SEQ = self.options["BS_PR_SEQ"] #self.DROPOUT_LSTM = self.options["DROPOUT_LSTM"] self.GRAD_CLIP = self.options["GRAD_CLIP"] self.FILTER_SIZES = self.options["FILTER_SIZES"] ####################################################### # symbolic variables # ####################################################### # Theano defines its computations using symbolic variables. A symbolic variable # is a matrix, vector, 3D matrix and specifies the data type. # A symbolic value does not hold any data, like a matlab matrix or np.array # Note that mask is constructed with a broadcastable argument which specifies # that the mask can be broadcasted in the 3. dimension. self.sym_input = T.tensor3('inputs') self.sym_target = T.icol('targets') # finally, build the model layers self.build_model()
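# Hedged worked example (values assumed, not from the source): with SEQ_SIZE = 400,
# a 4-letter VOCAB, and FILTER_SIZES = [12, 20] (apparently expressed in one-hot
# columns, i.e. filter widths of 3 and 5 bases), the valid-convolution output lengths
# computed above are
#   FS = [400 - 12/4 + 1, 400 - 20/4 + 1] = [398, 396]
# so ALL_F = 794 features feed the downstream BLSTM/dense layers.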
def __init__(self, input_width, input_height, output_dim, num_frames, batch_size): self.input_width = input_width self.input_height = input_height self.output_dim = output_dim self.num_frames = num_frames self.batch_size = batch_size self.gamma = 0.99 # discount factor self.rho = 0.99 self.lr = 0.00025 # learning rate self.momentum = 0.95 self.freeze_targets = True self.l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size) if self.freeze_targets: self.next_l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals = T.icol('terminals') self.states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False,True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True)) # self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True)) q_vals = self.l_out.get_output(states / 255.0) if self.freeze_targets: next_q_vals = self.next_l_out.get_output(next_states / 255.0) else: next_q_vals = self.l_out.get_output(next_states / 255.0) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + self.gamma * T.max(next_q_vals, axis=1, keepdims=True) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1,1)) loss = T.mean(diff ** 2) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, # terminals: self.terminals_shared } if self.momentum > 0: updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2) else: updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={ states: self.states_shared })
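# Hedged aside (illustrative only, not from the original; numpy/theano imported as
# np/theano/T as throughout this file): disconnected_grad, as used above on
# next_q_vals, treats the bootstrapped target as a constant, so gradients flow only
# through Q(s, a) and not through the target-network output.
x = T.vector('x')
y = theano.gradient.disconnected_grad(x ** 2) + x
g = theano.function([x], T.grad(y.sum(), x))
print(g(np.array([1.0, 2.0], dtype=theano.config.floatX)))  # -> [1. 1.]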
def _create_network(self): logger.info("Building network ...") net, input_var = self._build_network() target_values = T.matrix('target_output') actions = T.icol('actions') # Create masks # mask = theano.shared(np.zeros((self.batch_size, self.num_actions)).astype(np.int32)) mask = T.zeros_like(target_values) mask = T.set_subtensor( mask[T.arange(self.batch_size), actions.reshape((-1, ))], 1) # feed-forward path network_output = lasagne.layers.get_output(net, input_var / 255.0) # Add regularization penalty loss = squared_error(network_output * mask, target_values).mean() if self.weight_decay > 0.0: loss += regularize_network_params(net, l2) * self.weight_decay # Retrieve all parameters from the network all_params = lasagne.layers.get_all_params(net, trainable=True) # Compute updates for training if self.clip_error: grads = theano.gradient.grad(loss, all_params) grads = [ lasagne.updates.norm_constraint(grad, self.clip_error, range(grad.ndim)) for grad in grads ] updates = self.optimizer(grads, all_params, learning_rate=self.learning_rate, rho=self.decay_rate) else: updates = self.optimizer(loss, all_params, learning_rate=self.learning_rate, rho=self.decay_rate) # Theano functions for training and computing cost logger.info("Compiling functions ...") train = theano.function([input_var, target_values, actions], [loss, network_output, target_values, mask], updates=updates) predict = theano.function([input_var], network_output) return net, train, predict
def test_git_on_gip(hyper_params=None, rng_seed=1234): assert(not (hyper_params is None)) # Initialize a source of randomness rng = np.random.RandomState(rng_seed) sup_count = 100 # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32) # get the joint labeled and unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]) # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis] # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # set up some symbolic variables for input/output Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') # set some "shape" parameters for the networks data_dim = Xtr_un.shape[1] label_dim = 10 prior_1_dim = 50 prior_2_dim = 50 prior_sigma = 1.0 batch_size = 100 ################## # SETUP A GIPAIR # ################## gn1_params = {} gn1_config = [prior_1_dim, 600, 600, data_dim] gn1_params['mlp_config'] = gn1_config gn1_params['activation'] = softplus_actfun gn1_params['out_type'] = 'bernoulli' gn1_params['lam_l2a'] = 1e-3 gn1_params['vis_drop'] = 0.0 gn1_params['hid_drop'] = 0.0 gn1_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in1_params = {} shared_config = [data_dim, 600, 600] top_config = [shared_config[-1], prior_1_dim] in1_params['shared_config'] = shared_config in1_params['mu_config'] = top_config in1_params['sigma_config'] = top_config in1_params['activation'] = softplus_actfun in1_params['lam_l2a'] = 1e-3 in1_params['vis_drop'] = 0.0 in1_params['hid_drop'] = 0.0 in1_params['bias_noise'] = 0.1 in1_params['input_noise'] = 0.0 # Initialize the base networks for this GIPair IN1 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in1_params, shared_param_dicts=None) GN1 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn1_params, shared_param_dicts=None) # Initialize biases in IN and GN IN1.init_biases(0.0) GN1.init_biases(0.0) # Initialize the GIPair GIP = GIPair(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, g_net=GN1, i_net=IN1, \ data_dim=data_dim, prior_dim=prior_1_dim, \ params=None, shared_param_dicts=None) # Set cost weighting parameters GIP.set_lam_nll(1.0) GIP.set_lam_kld(1.0) GIP.set_lam_l2w(1e-4) ################## # SETUP A GITRIP # ################## # set parameters for the generator network gn2_params = {} gn2_config = [(prior_2_dim + label_dim), 300, prior_1_dim] gn2_params['mlp_config'] = gn2_config gn2_params['activation'] = softplus_actfun gn2_params['out_type'] = 'gaussian' gn2_params['lam_l2a'] = 1e-3 gn2_params['vis_drop'] = 0.0 gn2_params['hid_drop'] = 0.0 gn2_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in2_params = {} shared_config = [prior_1_dim, 300] top_config = [shared_config[-1], prior_2_dim] in2_params['shared_config'] = shared_config 
in2_params['mu_config'] = top_config in2_params['sigma_config'] = top_config in2_params['activation'] = softplus_actfun in2_params['lam_l2a'] = 1e-3 in2_params['vis_drop'] = 0.0 in2_params['hid_drop'] = 0.0 in2_params['bias_noise'] = 0.1 in2_params['input_noise'] = 0.0 # choose some parameters for the categorical inferencer pn2_params = {} pc0 = [prior_1_dim, 300, label_dim] pn2_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.0, 'bias_noise': 0.1, 'do_dropout': False} #sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn2_params['spawn_configs'] = [sc0] #[sc0, sc1] pn2_params['spawn_weights'] = [1.0] #[0.5, 0.5] # Set remaining params pn2_params['activation'] = softplus_actfun pn2_params['ear_type'] = 6 pn2_params['lam_l2a'] = 1e-3 pn2_params['vis_drop'] = 0.0 pn2_params['hid_drop'] = 0.0 # Initialize the base networks for this GITrip GN2 = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn2_params, shared_param_dicts=None) IN2 = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in2_params, shared_param_dicts=None) PN2 = PeaNet(rng=rng, Xd=Xd, params=pn2_params) # Initialize biases in GN, IN, and PN GN2.init_biases(0.0) IN2.init_biases(0.0) PN2.init_biases(0.0) # Initialize the GITrip GIT = GITrip(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ g_net=GN2, i_net=IN2, p_net=PN2, \ data_dim=prior_1_dim, prior_dim=prior_2_dim, \ label_dim=label_dim, batch_size=batch_size, \ params=None, shared_param_dicts=None) # Set cost weighting parameters GIT.set_lam_nll(1.0) GIT.set_lam_kld(1.0) GIT.set_lam_cat(0.0) GIT.set_lam_pea(0.0) GIT.set_lam_ent(0.0) GIT.set_lam_l2w(1e-4) ##################################################### # CONSTRUCT A GITonGIP STACKED, SEMI-SUPERVISED VAE # ##################################################### GOG = GITonGIP(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ gip_vae=GIP, git_vae=GIT, \ data_dim=data_dim, prior_1_dim=prior_1_dim, \ prior_2_dim=prior_2_dim, label_dim=label_dim, \ batch_size=batch_size, \ params=None, shared_param_dicts=None) ################################# # WRITE SOME INFO TO "LOG" FILE # ################################# learn_rate_git = hyper_params['learn_rate_git'] lam_pea_git = hyper_params['lam_pea_git'] lam_cat_git = hyper_params['lam_cat_git'] lam_ent_git = hyper_params['lam_ent_git'] lam_l2w_git = hyper_params['lam_l2w_git'] out_name = hyper_params['out_name'] out_file = open(out_name, 'wb') out_file.write("**TODO: More informative output, and maybe a real log**\n") out_file.write("learn_rate_git: {0:.4f}\n".format(learn_rate_git)) out_file.write("lam_pea_git: {0:.4f}\n".format(lam_pea_git)) out_file.write("lam_cat_git: {0:.4f}\n".format(lam_cat_git)) out_file.write("lam_ent_git: {0:.4f}\n".format(lam_ent_git)) out_file.write("lam_l2w_git: {0:.4f}\n".format(lam_l2w_git)) out_file.flush() ################################################## # TRAIN THE GIPair FOR SOME NUMBER OF ITERATIONS # ################################################## learn_rate = 0.002 for i in range(250000): if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.8 scale = min(1.0, (float(i+1) / 50000.0)) GIP.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIP.set_lam_nll(lam_nll=1.0) GIP.set_lam_kld(lam_kld=scale) # sample some unlabeled data to train with tr_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0)) Xc_batch = 0.0 * Xd_batch Xm_batch = 0.0 * Xd_batch # do a minibatch update of 
the model, and compute some costs outputs = GOG.train_gip(Xd_batch, Xc_batch, Xm_batch) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] other_reg_cost = 1.0 * outputs[3] if ((i % 1000) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, data_nll_cost: {2:.4f}, post_kld_cost: {3:.4f}, other_reg_cost: {4:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GOG_GIP_SAMPLES_b{0:d}.png".format(i) Xd_samps = np.repeat(Xd_batch[0:10,:], 3, axis=0) sample_lists = GIP.sample_gil_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) utils.visualize_samples(Xs, file_name) ######################################################## # REMOVE (SORT OF) UNUSED DIMENSIONS FROM LATENT SPACE # ######################################################## #tr_idx = npr.randint(low=0,high=un_samples,size=(10000,)) #Xd_batch = binarize_data(Xtr_un.take(tr_idx, axis=0)) #Xp_batch = GIP.IN.mean_posterior(Xd_batch, 0.0*Xd_batch, 0.0*Xd_batch) #Xp_std = np.std(Xp_batch, axis=0, keepdims=True) #dim_mask = 1.0 * (Xp_std > 0.1) #GIT.set_input_mask(dim_mask) #print("MASK NNZ: {0:.4f}".format(np.sum(dim_mask))) ################################################## # TRAIN THE GITrip FOR SOME NUMBER OF ITERATIONS # ################################################## GIT.set_lam_l2w(lam_l2w=lam_l2w_git) learn_rate = learn_rate_git GIT.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98) for i in range(250000): scale = 1.0 if (i < 25000): scale = float(i+1) / 25000.0 if ((i+1 % 50000) == 0): learn_rate = learn_rate * 0.8 # do a minibatch update using unlabeled data if True: # get some data to train with un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0)) Yd_un = Ytr_un.take(un_idx, axis=0) Xc_un = 0.0 * Xd_un Xm_un = 0.0 * Xd_un # do a minibatch update of the model, and compute some costs GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(1.0) GIT.set_lam_kld(scale * 1.0) GIT.set_lam_cat(0.0) GIT.set_lam_pea(scale * lam_pea_git) GIT.set_lam_ent(scale * lam_ent_git) outputs = GOG.train_git(Xd_un, Xc_un, Xm_un, Yd_un) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_cost = 1.0 * outputs[4] post_ent_cost = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] if True: # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0)) Yd_su = Ytr_su.take(su_idx, axis=0) Xc_su = 0.0 * Xd_su Xm_su = 0.0 * Xd_su # update only based on the label-based classification cost GIT.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIT.set_lam_nll(0.0) GIT.set_lam_kld(0.0) GIT.set_lam_cat(scale * lam_cat_git) GIT.set_lam_pea(scale * lam_pea_git) GIT.set_lam_ent(0.0) outputs = GOG.train_git(Xd_su, Xc_su, Xm_su, Yd_su) joint_2 = 1.0 * outputs[0] data_nll_2 = 1.0 * outputs[1] post_kld_2 = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_2 = 1.0 * outputs[4] post_ent_2 = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] if ((i % 500) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, 
other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 2500) == 0): # check classification error on training and validation set train_err = GOG.classification_error(Xtr_su, Ytr_su) va_err = GOG.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GoG_GIT_SAMPLES_b{0:d}.png".format(i) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GOG.sample_git_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) Ys = GOG.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name)
def __init__(self, input, n_in, n_out): """ Initialize the parameters of the logistic regression :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # self._w = init_weights((n_in, n_out)) # self._w_old = init_weights((n_in, n_out)) self._w = init_tanh(n_in, n_out, 1234) self._w_old = init_tanh(n_in, n_out, 2235) print "Initial W " + str(self._w.get_value()) # (n_out,) ,) used so that it can be added as row or column self._b = init_b_weights((n_out, )) self._b_old = init_b_weights((n_out, )) # learning rate for gradient descent updates. self._learning_rate = 0.005 # future discount self._discount_factor = 0.8 self._weight_update_steps = 5000 self._updates = 0 # data types for model State = T.fmatrix("State") ResultState = T.fmatrix("ResultState") Reward = T.col("Reward") Action = T.icol("Action") # Q_val = T.fmatrix() model = T.tanh(T.dot(State, self._w) + self._b) self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True) q_val = self.model(State, self._w, self._b) action_pred = T.argmax(q_val, axis=1) # bellman error, delta error delta = ((Reward + (self._discount_factor * T.max(self.model(ResultState, self._w_old, self._b_old), axis=1, keepdims=True))) - (self.model(State, self._w, self._b))[Action]) # delta = ((Reward + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - T.max(self.model(State), axis=1, keepdims=True)) self._L2_reg = 0.01 # L2 norm ; one regularization option is to enforce # L2 norm to be small self._L2 = ((self._w**2).sum()) # total bellman cost # Squaring is important so errors do not cancel each other out. # mean is used instead of sum as it is more independent of parameter scale bellman_cost = T.mean(0.5 * ((delta)**2)) + (self._L2 * self._L2_reg) # Compute gradients w.r.t. model parameters gradient = T.grad(cost=bellman_cost, wrt=self._w) gradient_b = T.grad(cost=bellman_cost, wrt=self._b) """ Updates to apply to parameters Performing gradient descent, want to add steps in the negative direction of gradient. """ update = [[self._w, self._w + (-gradient * self._learning_rate)], [self._b, self._b + (-gradient_b * self._learning_rate)]] # This function performs one training step and update self._train = theano.function( inputs=[State, Action, Reward, ResultState], outputs=bellman_cost, updates=update, allow_input_downcast=True) # Used to get to predicted actions to select self._predict = theano.function(inputs=[State], outputs=action_pred, allow_input_downcast=True) self._q_values = theano.function(inputs=[State], outputs=q_val, allow_input_downcast=True) self._bellman_error = theano.function( inputs=[State, Action, Reward, ResultState], outputs=delta, allow_input_downcast=True)
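# Hedged restatement (added for clarity, not in the original) of the update the code
# above intends, in this class's notation:
#   delta = Reward + _discount_factor * max_a' Q_old(ResultState, a') - Q(State, Action)
#   w <- w - _learning_rate * d/dw [ mean(0.5 * delta^2) + _L2_reg * ||w||^2 ]
# where Q_old uses the frozen copies _w_old / _b_old, presumably refreshed every
# _weight_update_steps updates (that refresh is not shown in this snippet).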
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, randomState, DoubleQ=False, TheQNet=NN): """ Initialize environment """ QNetwork.__init__(self,environment, batch_size) self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self._DoubleQ = DoubleQ self._randomState = randomState QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState) self.update_counter = 0 states=[] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states=[] # idem than states at t+1 self.states_shared=[] # list of shared variable for each of the k element in the belief state self.next_states_shared=[] # idem that self.states_shared at t+1 for i, dim in enumerate(self._input_dimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState) self.q_vals, self.params, shape_after_conv = QNet._buildDQN(states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_q_vals, self.next_params, shape_after_conv = QNet._buildDQN(next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) if(self._DoubleQ==True): givens_next={} for i, x in enumerate(self.next_states_shared): givens_next[ states[i] ] = x self.next_q_vals_current_qnet=theano.function([], self.q_vals, givens=givens_next) next_q_curr_qnet = theano.clone(self.next_q_vals) argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True) max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1)) else: max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True) T_ones_like=T.ones_like(T.ones_like(terminals) - terminals) target = rewards + T_ones_like * thediscount * max_next_q_vals q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # Note : Strangely (target - q_val) lead to problems with python 3.5, theano 0.8.0rc and floatX=float32... 
diff = - q_val + target if self.clip_delta > 0: # This loss function implementation is taken from # https://github.com/spragunr/deep_q_rl # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss_ind = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss_ind = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss_ind) elif batch_accumulator == 'mean': loss = T.mean(loss_ind) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x gparams=[] for p in self.params: gparam = T.grad(loss, p) gparams.append(gparam) updates = [] if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': for i,(p, g) in enumerate(zip(self.params, gparams)): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - self.rho) * g ** 2 gradient_scaling = T.sqrt(acc_new + self.rms_epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - thelr * g)) elif update_rule == 'sgd': for i, (param, gparam) in enumerate(zip(self.params, gparams)): updates.append((param, param - thelr * gparam)) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if(self._DoubleQ==True): self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') else: self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], self.q_vals, givens=givens2, on_unused_input='warn')
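# Hedged, standalone restatement (not part of the original) of the clipped
# "quadratic-then-linear" loss described in the comment above; assumes theano.tensor
# is imported as T, as elsewhere in this file.
def clipped_td_loss(diff, clip_delta):
    # Quadratic inside [-clip_delta, clip_delta], linear outside, so the gradient
    # magnitude is bounded by clip_delta instead of being zeroed past the clip point.
    quadratic_part = T.minimum(abs(diff), clip_delta)
    linear_part = abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

# e.g. with clip_delta = 1: diff = 0.5 -> 0.125, diff = 3 -> 0.5 + 2 = 2.5
# (instead of 4.5 for the unclipped squared loss).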
def setup(self): lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_q_network() states = T.tensor3('states') next_states = T.tensor3('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of an observation, # along with the chosen action and resulting # reward and terminal status. self.states_shared = theano.shared( np.zeros((self.batch_size, self.input_height, self.input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals. self.state_shared = theano.shared( np.zeros((self.input_height, self.input_width), dtype=theano.config.floatX)) # Formulas q_vals = lasagne.layers.get_output(self.l_out, states / self.input_scale) next_q_vals = lasagne.layers.get_output(self.l_out, next_states / self.input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) action_mask = T.eq(T.arange(self.num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * action_mask).sum(axis=1).reshape((-1, 1)) diff = target - output loss = 0.5 * diff ** 2 loss = T.sum(loss) #loss = T.mean(loss) # Params and givens params = lasagne.layers.helper.get_all_params(self.l_out) updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) train_givens = { states: self.states_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, input_scale=255.0, reward_bias=0.): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + reward_bias + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) if batch_accumulator == 'sum': loss = T.sum(diff ** 2) elif batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, batch_size, input_dim, num_frames, action_dim, discount,
             lr_policy, lr_Q_val_f, memory_capability, defrozen_number,
             cliff_delta=0):
    self.input_dim = input_dim
    self.num_frames = num_frames
    self.action_dim = action_dim
    self.batch_size = batch_size
    self.discount = discount
    # keep both learning rates (the original assigned an undefined `learning_rate`)
    self.lr_policy = lr_policy
    self.lr_Q_val_f = lr_Q_val_f

    self.policy_out, _ = self.build_policy()
    self.Q_val_f_out, l_in = self.build_Q_function()

    # replay memory
    self.state_mem = np.zeros((memory_capability, num_frames, input_dim),
                              dtype=theano.config.floatX)
    self.action_mem = np.zeros((memory_capability, action_dim),
                               dtype=theano.config.floatX)
    self.reward_mem = np.zeros((memory_capability, 1), dtype='int32')
    self.next_states_mem = np.zeros((memory_capability, num_frames, input_dim),
                                    dtype=theano.config.floatX)
    self.curr_idx = 0
    self.train_flag = False
    self.mem_full = False
    self.defrozen_number = defrozen_number
    self.cliff_delta = cliff_delta

    # target networks (the original swapped these and dropped `self.`)
    self.target_policy, _ = self.build_policy()
    self.target_q_val_f, target_l_in = self.build_Q_function()

    states = T.tensor3('states')
    next_states = T.tensor3('next_states')
    rewards = T.col('rewards')
    action = T.fmatrix('action')
    next_action = T.fmatrix('next_action')
    terminals = T.icol('terminals')

    lasagne.random.set_rng(self.rng)  # assumes self.rng is provided elsewhere

    self.input_shared = theano.shared(
        np.zeros((batch_size, num_frames, input_dim), dtype=theano.config.floatX))
    # rewards are bound to a float T.col below, so the shared buffer must be floatX
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    # actions are full fmatrix columns, so no broadcastable column pattern here
    self.action_shared = theano.shared(
        np.zeros((batch_size, action_dim), dtype=theano.config.floatX))
    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))
    self.states_shared = theano.shared(
        np.zeros((batch_size, num_frames, input_dim), dtype=theano.config.floatX))
    self.next_state_shared = theano.shared(
        np.zeros((batch_size, num_frames, input_dim), dtype=theano.config.floatX))
    self.next_action_shared = theano.shared(
        np.zeros((batch_size, action_dim), dtype=theano.config.floatX))

    policy_action = lasagne.layers.get_output(self.policy_out, states)
    target_policy_action = lasagne.layers.get_output(self.target_policy, states)
    q_vals = lasagne.layers.get_output(
        self.Q_val_f_out, {l_in[0]: states, l_in[2]: action})
    target_q_val = lasagne.layers.get_output(
        self.target_q_val_f,
        {target_l_in[0]: next_states, target_l_in[2]: next_action})

    terminalsX = terminals.astype(theano.config.floatX)
    yi = (rewards +
          (T.ones_like(terminalsX) - terminalsX) * self.discount * target_q_val)

    diff = q_vals - yi

    if self.cliff_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.cliff_delta)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + self.cliff_delta * linear_part
    else:
        loss = 0.5 * diff ** 2
    loss = T.mean(loss)

    train_Q_params = lasagne.layers.get_all_params(self.Q_val_f_out)
    train_Q_givens = {
        states: self.states_shared,
        rewards: self.rewards_shared,
        action: self.action_shared,
        terminals: self.terminals_shared,
        next_states: self.next_state_shared,    # needed by the bootstrapped target
        next_action: self.next_action_shared,
    }
    Q_updates = lasagne.updates.adam(loss, train_Q_params, self.lr_Q_val_f)
    self._train_Q = theano.function([], [loss], updates=Q_updates,
                                    givens=train_Q_givens)

    train_policy_params = lasagne.layers.get_all_params(self.policy_out)
    d_train_policy_params = theano.gradient.grad()  # left incomplete in the original
    policy_updates = lasagne.updates.adam()         # left incomplete in the original

    self._q_vals = theano.function([], q_vals,
                                   givens={states: self.states_shared,
                                           action: self.action_shared})
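# Hedged note (assumption, not from the original): the unfinished policy update above
# would typically ascend Q(s, mu(s)) in a DDPG-style setup, roughly:
#   q_of_policy = lasagne.layers.get_output(
#       self.Q_val_f_out, {l_in[0]: states, l_in[2]: policy_action})
#   policy_loss = -T.mean(q_of_policy)
#   policy_updates = lasagne.updates.adam(policy_loss, train_policy_params,
#                                         learning_rate=self.lr_policy)
# followed by compiling a theano.function with givens analogous to train_Q_givens.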
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network( network_type, input_width, input_height, num_actions, num_frames, batch_size ) if self.freeze_interval > 0: self.next_l_out = self.build_network( network_type, input_width, input_height, num_actions, num_frames, batch_size ) self.reset_q_hat( ) states, next_states = T.tensor4( 'states' ), T.tensor4( 'next_states' ) actions, rewards = T.icol( 'actions' ), T.col( 'rewards' ) terminals = T.icol( 'terminals' ) self.states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ), dtype = theano.config.floatX ) ) self.next_states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ), dtype = theano.config.floatX ) ) self.rewards_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = theano.config.floatX ), broadcastable = ( False, True ) ) self.actions_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ), broadcastable = ( False, True ) ) self.terminals_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ), broadcastable = ( False, True ) ) ## Get learned Q-values q_vals_test = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = True ) # q_vals_test = theano.gradient.disconnected_grad( q_vals_test ) q_vals_train = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = False ) if self.freeze_interval > 0: target_q_vals = lasagne.layers.get_output( self.next_l_out, next_states / input_scale, deterministic = True) else: target_q_vals = lasagne.layers.get_output( self.l_out, next_states / input_scale, deterministic = True) target_q_vals = theano.gradient.disconnected_grad( target_q_vals ) ## The traget depends on the received rewards and the discounted future ## reward stream for the given action in the current state. target = ( rewards + ( T.ones_like( terminals ) - terminals ) * self.discount * T.max( target_q_vals, axis = 1, keepdims = True ) ) ## target - b x 1, where b is batch size. ## q_vals - b x A, where A is the number of outputs of the Q-net ## Theano differentiates indexed (and reduced) arrays in a clever manner: ## it sets all left out gradients to zero. THIS IS CORRECT! ## \nabla_\theta diff = - 1_{a = a_j} \nabla Q( s, a_j, \theta) \,. diff = target - q_vals_train[ T.arange( batch_size ), actions.reshape( ( -1, ) ) ].reshape( ( -1, 1 ) ) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], loss, updates=updates, givens=givens) self._q_vals = theano.function([], q_vals_test, givens={states: self.states_shared})
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.callback = None lasagne.random.set_rng(self.rng) #set the seed self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) # 4-dimensional ndarray (similar to prestates in memory_store) states = T.tensor4('states') # 4-dimensional ndarray (similar to poststates in memory_store) next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # creating a shared object is like declaring global - it has be shared between functions that it appears in. # similar to prestates matrix construction in memory_store self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # compute an expression for the output of a single layer given its input # scaling turns grayscale (or) black and white to 1s and 0s (black OR white) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) #perform this step: Q(st,a) = rimm + gamma*[ max(a{t+1}) Q(s{t+1}, a{t+1})] # col. of ones with same dim. as terminals target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) # col. matrix into row matrix|row. 
matrix into col matrix diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # basically, we need to choose that 'a' (action) which maximizes Q(s,a) if self.clip_delta > 0: quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 loss = T.mean(loss) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': # param := param - learning_rate * gradient updates = lasagne.updates.sgd(loss, params, self.lr) else: print "Unrecognized update rule" sys.exit(1) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) # inputs,outputs self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate,momentum, batch_size, ): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.lr = learning_rate self.momentum = momentum self.update_counter = 0 states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) from nn import network n, layers = network(n_channels=num_frames, img_size=input_width, n_actions=num_actions) self.n = n q_vals = n.output(data_layer=states) next_q_vals = n.output(data_layer=next_states) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) next_q_vals = T.minimum(0, next_q_vals) layers_samples = [l.output(data_layer=states) for l in layers] layers_batchstd = [T.mean(T.std(s, axis=0)) for s in layers_samples] w, b = n.weight(), n.bias() params = w + b target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) loss = T.mean(diff ** 2) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } updates = lasagne.updates.rmsprop(loss, params, self.lr) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._batchstd = theano.function([], layers_batchstd, givens={states: self.states_shared}) self._sample = theano.function([], layers_samples, givens={states: self.states_shared}) self._q_vals = theano.function([states], q_vals,)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, eta, params_share=True, double_learning=False, annealing=False, temp=1.0, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.eta = eta self.params_share = params_share self.double_learning = double_learning self.annealing = annealing self.temp0 = temp lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out, self.l_feature, self.l_init = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out, self.next_l_feature, self.next_l_init = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat_share() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') exp_temp = T.scalar('exploration tuning') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) images, along with the chosen action and resulting # reward and terminal status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.exp_temp_shared = theano.shared(np.float32(self.temp0)) # default without annealing # Shared variable for a single state, to calculate q_vals. 
self.state_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) feature_vals = lasagne.layers.get_output(self.l_feature, states / input_scale) q_params = lasagne.layers.get_all_params(self.l_out) q_params_vals = lasagne.layers.get_all_param_values(self.l_out) if self.params_share: w_pi = q_params[-2] b_pi = q_params[-1] else: params_init = lasagne.layers.get_all_param_values(self.l_init) w_pi = theano.shared(params_init[-2]) b_pi = theano.shared(params_init[-1]) pi_vals = T.nnet.softmax(exp_temp * (T.dot(feature_vals, w_pi) + b_pi)) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) if self.double_learning: next_feature_vals = lasagne.layers.get_output(self.l_feature, next_states / input_scale) next_q_params = lasagne.layers.get_all_params(self.l_out) next_q_params_vals = lasagne.layers.get_all_param_values(self.l_out) if self.params_share: next_w_pi = next_q_params[-2] next_b_pi = next_q_params[-1] else: next_params_init = lasagne.layers.get_all_param_values(self.l_init) next_w_pi = theano.shared(next_params_init[-2]) next_b_pi = theano.shared(next_params_init[-1]) next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi)) next_pi_vals = theano.gradient.disconnected_grad(next_pi_vals) else: next_feature_vals = lasagne.layers.get_output(self.next_l_feature, next_states / input_scale) next_q_params = lasagne.layers.get_all_params(self.next_l_out) next_q_params_vals = lasagne.layers.get_all_param_values(self.next_l_out) if self.params_share: next_w_pi = next_q_params[-2] next_b_pi = next_q_params[-1] else: next_params_init = lasagne.layers.get_all_param_values(self.next_l_init) next_w_pi = theano.shared(next_params_init[-2]) next_b_pi = theano.shared(next_params_init[-1]) next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi)) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.sum(next_q_vals * next_pi_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) if self.params_share: params = lasagne.layers.helper.get_all_params(self.l_out) else: params = lasagne.layers.helper.get_all_params(self.l_out) params.append(next_w_pi) params.append(next_b_pi) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared, exp_temp: self.exp_temp_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)) } pi_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)), exp_temp: self.exp_temp_shared } self._q_vals = theano.function([], q_vals[0], givens=q_givens) self._pi_vals = theano.function([], pi_vals[0], givens=pi_givens) grad_fc_w = T.grad(loss, self.l_out.W) self._grad = theano.function([], outputs=grad_fc_w, givens=train_givens)
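The comment above explains why the loss is extended linearly past the clip point rather than squaring a clipped error. A standalone NumPy sketch (illustrative, not part of the original code) of that piecewise loss and its gradient, which saturates at +/- clip_delta instead of dropping to zero:

import numpy as np

def clipped_loss(diff, clip_delta=1.0):
    # Quadratic inside the clip region, linear outside, matching the
    # quadratic_part / linear_part construction used above.
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

def clipped_loss_grad(diff, clip_delta=1.0):
    # d loss / d diff: equal to diff inside the clip region,
    # saturates at +/- clip_delta outside it.
    return np.clip(diff, -clip_delta, clip_delta)

diffs = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(clipped_loss(diffs))       # [2.5, 0.125, 0.0, 0.125, 2.5]
print(clipped_loss_grad(diffs))  # [-1.0, -0.5, 0.0, 0.5, 1.0]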
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, use_double, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.use_double = use_double self.rng = rng # Using Double DQN is pointless without periodic freezing if self.use_double: assert self.freeze_interval > 0 # pass lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: # Nature. If using periodic freezing next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: # NIPS next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) if self.use_double: maxaction = T.argmax(q_vals, axis=1, keepdims=False) temptargets = next_q_vals[T.arange(batch_size),maxaction].reshape((-1, 1)) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * temptargets) else: target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) def inspect_inputs(i, node, fn): if ('maxand' not in str(node).lower() and '12345' not in str(node)): return print i, node, "input(s) value(s):", [input[0] for input in fn.inputs], raw_input('press enter') def inspect_outputs(i, node, fn): if ('maxand' not in str(node).lower() and '12345' not in str(node)): return if '12345' in str(node): print "output(s) value(s):", [np.asarray(output[0]) for output in fn.outputs] else: print "output(s) value(s):", [output[0] for output in fn.outputs] raw_input('press enter') if False: self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens, mode=theano.compile.MonitorMode( pre_func=inspect_inputs, post_func=inspect_outputs)) theano.printing.debugprint(target) else: self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) if False: self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}, mode=theano.compile.MonitorMode( pre_func=inspect_inputs, post_func=inspect_outputs)) else: self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
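The use_double branch above bootstraps from the action that maximizes the online network's Q-values, evaluated by the frozen network. For reference, a NumPy sketch of the Double DQN target as usually formulated, where the greedy action is chosen from the online network's Q-values on the next states (names here are illustrative):

import numpy as np

def double_dqn_target(rewards, terminals, discount, q_online_next, q_frozen_next):
    # Select the action with the online network, evaluate it with the
    # frozen target network.
    greedy = np.argmax(q_online_next, axis=1)
    rows = np.arange(q_frozen_next.shape[0])
    bootstrap = q_frozen_next[rows, greedy].reshape(-1, 1)
    return rewards + (1.0 - terminals) * discount * bootstrap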
rewards_shared = theano.shared(np.zeros((mini_batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) actions_shared = theano.shared(np.zeros((mini_batch_size, 1), dtype='int32'), broadcastable=(False, True)) terminals_shared = theano.shared(np.zeros((mini_batch_size, 1), dtype='int32'), broadcastable=(False, True)) # 4-dimensional ndarray (similar to prestates in memory_store) states = T.tensor4('states') # 4-dimensional ndarray (similar to poststates in memory_store) post_states = T.tensor4('post_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') givens = { states: states_shared, post_states: post_states_shared, rewards: rewards_shared, actions: actions_shared, terminals: terminals_shared } def build_net(): """Build a deep Q-network, exactly as described in the DeepMind paper """ input_layer = lasagne.layers.InputLayer(shape=(mini_batch_size, history_length,
def __init__(self, input_width, input_height, n_actions, discount, learn_rate, batch_size, rng): self.input_width = input_width self.input_height = input_height self.n_actions = n_actions self.discount = discount self.lr = learn_rate self.batch_size = batch_size self.rng = rng lasagne.random.set_rng(self.rng) self.l_out = self.build_network(batch_size, input_width, input_height, n_actions) states = t.tensor4("states") next_states = t.tensor4("next_states") rewards = t.col("rewards") actions = t.icol("actions") terminals = t.icol("terminals") self.states_shared = theano.shared( np.zeros((batch_size, 1, input_height, input_width), dtype=theano.config.floatX) ) self.next_states_shared = theano.shared( np.zeros((batch_size, 1, input_height, input_width), dtype=theano.config.floatX) ) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True) ) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states) next_q_vals = lasagne.layers.get_output(self.l_out, next_states) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + (t.ones_like(terminals) - terminals) * self.discount * t.max( next_q_vals, axis=1, keepdims=True ) diff = target - q_vals[t.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) loss = t.sum(0.5 * diff ** 2) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared, } updates = lasagne.updates.sgd(loss, params, self.lr) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
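All of these compiled training functions take no explicit inputs; the minibatch is delivered through the shared variables listed in the givens dictionary. A hypothetical helper (the attribute names follow the constructor above) sketching that calling convention:

def train_step(agent, states, actions, rewards, next_states, terminals):
    # Overwrite the shared buffers that `givens` maps the symbolic
    # variables onto, then run the argument-free compiled function.
    agent.states_shared.set_value(states)
    agent.next_states_shared.set_value(next_states)
    agent.rewards_shared.set_value(rewards)
    agent.actions_shared.set_value(actions)
    agent.terminals_shared.set_value(terminals)
    loss, q_vals = agent._train()
    return loss, q_vals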
def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength, networkType="conv", updateRule="deepmind_rmsprop", batchAccumulator="sum", clipDelta=1.0, inputScale=255.0): self.batchSize = batchSize self.numFrames = numFrames self.inputWidth = inputWidth self.inputHeight = inputHeight self.inputScale = inputScale self.numActions = numActions self.discountRate = discountRate self.learningRate = learningRate self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.networkUpdateDelay = networkUpdateDelay self.useSARSAUpdate = useSARSAUpdate self.kReturnLength = kReturnLength self.networkType = networkType self.updateRule = updateRule self.batchAccumulator = batchAccumulator self.clipDelta = clipDelta self.updateCounter = 0 states = T.tensor4("states") nextStates = T.tensor4("nextStates") rewards = T.col("rewards") actions = T.icol("actions") nextActions = T.icol("nextActions") terminals = T.icol("terminals") self.statesShared = theano.shared( np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.nextStatesShared = theano.shared( np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.rewardsShared = theano.shared(np.zeros( (self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.qValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale) if self.networkUpdateDelay > 0: self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) self.resetNextQValueNetwork() nextQValues = lasagne.layers.get_output( self.nextQValueNetwork, nextStates / self.inputScale) else: nextQValues = lasagne.layers.get_output( self.qValueNetwork, nextStates / self.inputScale) nextQValues = theano.gradient.disconnected_grad(nextQValues) if self.useSARSAUpdate: target = rewards + terminals * ( self.discountRate** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape( (-1, ))].reshape((-1, 1)) else: target = rewards + terminals * ( self.discountRate**self.kReturnLength) * T.max( nextQValues, axis=1, keepdims=True) targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1, ))].reshape( (-1, 1)) quadraticPart = T.minimum(abs(targetDifference), self.clipDelta) linearPart = abs(targetDifference) - quadraticPart # if self.clipDelta > 0: # targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta) if self.batchAccumulator == "sum": # loss = T.sum(targetDifference ** 2) loss = T.sum(0.5 * quadraticPart**2 + self.clipDelta * linearPart) elif self.batchAccumulator == "mean": # loss = T.mean(targetDifference ** 2) loss = T.mean(0.5 * quadraticPart**2 + self.clipDelta * linearPart) else: raise ValueError("Bad Network Accumulator. 
{sum, mean} expected") networkParameters = lasagne.layers.helper.get_all_params( self.qValueNetwork) if self.updateRule == "deepmind_rmsprop": updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "rmsprop": updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "sgd": updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate) else: raise ValueError( "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected") if self.momentum > 0: updates.lasagne.updates.apply_momentum(updates, None, self.momentum) lossGivens = { states: self.statesShared, nextStates: self.nextStatesShared, rewards: self.rewardsShared, actions: self.actionsShared, nextActions: self.nextActionsShared, terminals: self.terminalsShared } self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn') self.__computeQValues = theano.function( [], qValues, givens={states: self.statesShared})
def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound): super(DeepRLNet3, self).__init__(n_in, n_out, state_bounds, action_bounds, reward_bound) batch_size = 32 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, self._state_length) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, self._state_length) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int64')) # create a small convolutional neural network inputLayerA = lasagne.layers.InputLayer((None, self._state_length), State) l_hid1A = lasagne.layers.DenseLayer( inputLayerA, num_units=256, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid2A = lasagne.layers.DenseLayer( l_hid1A, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3A = lasagne.layers.DenseLayer( l_hid2A, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outA = lasagne.layers.DenseLayer( l_hid3A, num_units=n_out, nonlinearity=lasagne.nonlinearities.linear) # self._b_o = init_b_weights((n_out,)) # self.updateTargetModel() inputLayerB = lasagne.layers.InputLayer((None, self._state_length), State) l_hid1B = lasagne.layers.DenseLayer( inputLayerB, num_units=256, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid2B = lasagne.layers.DenseLayer( l_hid1B, num_units=128, nonlinearity=lasagne.nonlinearities.leaky_rectify) l_hid3B = lasagne.layers.DenseLayer( l_hid2B, num_units=64, nonlinearity=lasagne.nonlinearities.leaky_rectify) self._l_outB = lasagne.layers.DenseLayer( l_hid3B, num_units=n_out, nonlinearity=lasagne.nonlinearities.linear) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.0002 self._discount_factor = 0.8 self._rho = 0.95 self._rms_epsilon = 0.001 self._weight_update_steps = 8000 self._updates = 0 self._states_shared = theano.shared( np.zeros((batch_size, self._state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((batch_size, self._state_length), dtype=theano.config.floatX)) self._rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int64'), broadcastable=(False, True)) self._q_valsA = lasagne.layers.get_output(self._l_outA, State) self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState) self._q_func = self._q_valsA[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1)) target = ( Reward + #(T.ones_like(terminals) - terminals) * self._discount_factor * T.max(self._q_valsB, axis=1, keepdims=True)) diff = target - self._q_valsA[ T.arange(batch_size), Action.reshape((-1, ))].reshape( (-1, 1)) # Does some fancy indexing to get the column of interest loss = 0.5 * diff**2 + ( 1e-6 * lasagne.regularization.regularize_network_params( self._l_outA, lasagne.regularization.l2)) loss = T.mean(loss) params = lasagne.layers.helper.get_all_params(self._l_outA) givens = { State: self._states_shared, ResultState: self._next_states_shared, Reward: self._rewards_shared, Action: self._actions_shared, } # SGD update updates = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho, self._rms_epsilon) # TD update # updates = lasagne.updates.rmsprop(T.mean(self._q_func) + (1e-5 * lasagne.regularization.regularize_network_params( # self._l_outA, lasagne.regularization.l2)), 
params, # self._learning_rate * -T.mean(diff), self._rho, self._rms_epsilon) self._train = theano.function([], [loss, self._q_valsA], updates=updates, givens=givens) self._q_vals = theano.function([], self._q_valsA, givens={State: self._states_shared}) self._bellman_error = theano.function( inputs=[State, Action, Reward, ResultState], outputs=diff, allow_input_downcast=True)
def __init__(self, input, n_in, n_out): hidden_size = 36 batch_size = 32 self._w_h = init_weights((n_in, hidden_size)) self._b_h = init_b_weights((1, hidden_size)) # self._b_h = init_b_weights((hidden_size,)) self._w_h2 = init_weights((hidden_size, hidden_size)) self._b_h2 = init_b_weights((1, hidden_size)) # self._b_h2 = init_b_weights((hidden_size,)) # self._w_o = init_tanh(hidden_size, n_out) self._w_o = init_weights((hidden_size, n_out)) self._b_o = init_b_weights((1, n_out)) # self._b_o = init_b_weights((n_out,)) self.updateTargetModel() self._w_h_old = init_weights((n_in, hidden_size)) self._w_h2_old = init_weights((hidden_size, hidden_size)) self._w_o_old = init_tanh(hidden_size, n_out) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.00025 self._discount_factor = 0.99 self._weight_update_steps = 5000 self._updates = 0 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, 2) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, 2) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int32')) # Q_val = T.fmatrix() # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1))) # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True) _py_xA = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0) _py_xB = self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.0, 0.0) self._y_predA = T.argmax(_py_xA, axis=1) self._y_predB = T.argmax(_py_xB, axis=1) self._q_funcA = T.mean( (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1))) self._q_funcB = T.mean( (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1))) # q_val = py_x # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5) # L1 norm ; one regularization option is to enforce L1 norm to # be small self._L1_A = (abs(self._w_h).sum() + abs(self._w_h2).sum() + abs(self._w_o).sum()) self._L1_B = (abs(self._w_h_old).sum() + abs(self._w_h2_old).sum() + abs(self._w_o_old).sum()) self._L1_reg = 0.0 self._L2_reg = 0.001 # L2 norm ; one regularization option is to enforce # L2 norm to be small self._L2_A = ((self._w_h**2).sum() + (self._w_h2**2).sum() + (self._w_o**2).sum()) self._L2_B = ((self._w_h_old**2).sum() + (self._w_h2_old**2).sum() + (self._w_o_old**2).sum()) # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y)) # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State)) deltaA = ((Reward + (self._discount_factor * T.max(self.model( ResultState, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5), axis=1, keepdims=True))) - (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) deltaB = ( (Reward + (self._discount_factor * T.max(self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5), 
axis=1, keepdims=True))) - (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) # bellman_cost = T.mean( 0.5 * ((delta) ** 2 )) bellman_costA = T.mean(0.5 * ((deltaA)**2)) + ( self._L2_reg * self._L2_A) + (self._L1_reg * self._L1_A) bellman_costB = T.mean(0.5 * ((deltaB)**2)) + ( self._L2_reg * self._L2_B) + (self._L1_reg * self._L1_B) paramsA = [ self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o ] paramsB = [ self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old ] # updates = sgd(bellman_cost, params, lr=self._learning_rate) updatesA = rlTDSGD(self._q_funcA, T.mean(deltaA), paramsA, lr=self._learning_rate) updatesB = rlTDSGD(self._q_funcB, T.mean(deltaB), paramsB, lr=self._learning_rate) # updates = RMSprop(bellman_cost, params, lr=self._learning_rate) # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01) # updatesA = lasagne.updates.rmsprop(self._q_funcA, paramsA, self._learning_rate * -T.mean(deltaA), 0.95, 0.01) # updatesB = lasagne.updates.rmsprop(self._q_funcB, paramsB, self._learning_rate * -T.mean(deltaB), 0.95, 0.01) self._trainA = theano.function( inputs=[State, Action, Reward, ResultState], outputs=bellman_costA, updates=updatesA, allow_input_downcast=True) self._trainB = theano.function( inputs=[State, Action, Reward, ResultState], outputs=bellman_costB, updates=updatesB, allow_input_downcast=True) self._bellman_errorA = theano.function( inputs=[State, Action, Reward, ResultState], outputs=deltaA, allow_input_downcast=True) self._bellman_errorB = theano.function( inputs=[State, Action, Reward, ResultState], outputs=deltaB, allow_input_downcast=True) self._q_valuesA = theano.function(inputs=[State], outputs=_py_xA, allow_input_downcast=True) self._q_valuesB = theano.function(inputs=[State], outputs=_py_xB, allow_input_downcast=True) self._py_xA = theano.function(inputs=[State], outputs=_py_xA, allow_input_downcast=True) self._py_xB = theano.function(inputs=[State], outputs=_py_xB, allow_input_downcast=True) x, y = T.matrices('x', 'y') z_lazy = ifelse(T.gt(T.max(x, axis=1)[0], T.max(y, axis=1)[0]), T.argmax(x, axis=1), T.argmax(y, axis=1)) self._f_lazyifelse = theano.function([x, y], z_lazy, mode=theano.Mode(linker='vm'))
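The constructor above compiles symmetric _trainA/_trainB functions, each bootstrapping from the other parameter set. A hypothetical driver, an assumption about how a caller might use them in the spirit of double Q-learning, alternating which network is updated:

def train_double(agent, state, action, reward, result_state, rng):
    # Randomly pick which of the two symmetric networks to update; the
    # delta inside each compiled function bootstraps from the other
    # network's parameters.
    if rng.rand() < 0.5:
        return agent._trainA(state, action, reward, result_state)
    return agent._trainB(state, action, reward, result_state)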
def build_network(): # enclosing definition assumed; the original fragment begins mid-function l_in = lasagne.layers.InputLayer( shape=(None, num_frames, input_width, input_height) ) l_conv = conv_layer( l_in, num_filters=16, filter_size=(8,8), stride=(4,4), ) return l_conv l_out = build_network() rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True), name='rewards') actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True), name='actions') givens = { rewards: rewards_shared, actions: actions_shared, }
def __init__(self, input_width, input_height, avail_actions, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, train_all, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.avail_actions = avail_actions self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.train_all = train_all lasagne.random.set_rng(self.rng) self.update_counter = 0 print "num_actions: " + str(num_actions) self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
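Several constructors in this section call self.reset_q_hat() after building a second network, but its body is not shown here. A plausible implementation, an assumption using Lasagne's parameter helpers, that copies the online parameters into the frozen target network:

import lasagne

def reset_q_hat(self):
    # Copy the online network's parameter values into the target network.
    all_params = lasagne.layers.helper.get_all_param_values(self.l_out)
    lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params)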
def __init__(self, input_width, input_height, output_dim, num_frames, batch_size): self.input_width = input_width self.input_height = input_height self.output_dim = output_dim self.num_frames = num_frames self.batch_size = batch_size self.gamma = 0.95 # discount factor self.rho = 0.99 self.lr = 0.00020 # learning rate self.momentum = 0.0 self.freeze_targets = False self.l_out = self.build_small_network(input_width, input_height, output_dim, num_frames, batch_size) if self.freeze_targets: self.next_l_out = self.build_small_network(input_width, input_height, output_dim, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True)) q_vals = self.l_out.get_output(states / 255.0) if self.freeze_targets: next_q_vals = self.next_l_out.get_output(next_states / 255.0) else: next_q_vals = self.l_out.get_output(next_states / 255.0) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + self.gamma * T.max( next_q_vals, axis=1, keepdims=True) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) loss = T.mean(diff**2) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, # terminals: self.terminals_shared } if self.momentum > 0: updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2) else: updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def test_gi_stack(hyper_params=None, sup_count=600, rng_seed=1234): assert(not (hyper_params is None)) # Initialize a source of randomness rng = np.random.RandomState(rng_seed) # Load some data to train/validate/test with dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False) # get the unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]).astype(np.int32) Ytr_un = 0 * Ytr_un # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis].astype(np.int32) # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # Construct a GenNet and an InfNet, then test constructor for GIPair. # Do basic testing, to make sure classes aren't completely broken. Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') data_dim = Xtr_un.shape[1] label_dim = 10 prior_dim = 50 prior_sigma = 1.0 batch_size = 150 # Choose some parameters for the generator network gn_params = {} gn_config = [prior_dim, 600, 600, data_dim] gn_params['mlp_config'] = gn_config gn_params['activation'] = softplus_actfun gn_params['lam_l2a'] = 1e-3 gn_params['vis_drop'] = 0.0 gn_params['hid_drop'] = 0.0 gn_params['bias_noise'] = 0.1 # choose some parameters for the continuous inferencer in_params = {} shared_config = [data_dim, 600, 600] top_config = [shared_config[-1], prior_dim] in_params['shared_config'] = shared_config in_params['mu_config'] = top_config in_params['sigma_config'] = top_config in_params['activation'] = softplus_actfun in_params['init_scale'] = 2.0 in_params['lam_l2a'] = 1e-3 in_params['vis_drop'] = 0.0 in_params['hid_drop'] = 0.0 in_params['bias_noise'] = 0.1 in_params['input_noise'] = 0.1 # choose some parameters for the categorical inferencer pn_params = {} pc0 = [prior_dim, 800, 800, label_dim] pn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} sc1 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn_params['spawn_configs'] = [sc0, sc1] pn_params['spawn_weights'] = [0.5, 0.5] # Set remaining params pn_params['activation'] = relu_actfun pn_params['init_scale'] = 2.0 pn_params['ear_type'] = 6 pn_params['lam_l2a'] = 1e-3 pn_params['vis_drop'] = 0.0 pn_params['hid_drop'] = 0.5 # Initialize the base networks for this GIPair GN = GenNet(rng=rng, Xp=Xp, prior_sigma=prior_sigma, \ params=gn_params, shared_param_dicts=None) IN = InfNet(rng=rng, Xd=Xd, Xc=Xc, Xm=Xm, prior_sigma=prior_sigma, \ params=in_params, shared_param_dicts=None) PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) # Initialize biases in GN, IN, and PN GN.init_biases(0.0) IN.init_biases(0.0) PN.init_biases(0.1) # Initialize the GIStack GIS = GIStack(rng=rng, \ Xd=Xd, Yd=Yd, Xc=Xc, Xm=Xm, \ g_net=GN, i_net=IN, p_net=PN, \ data_dim=data_dim, prior_dim=prior_dim, \ label_dim=label_dim, batch_size=batch_size, \ params=None, 
shared_param_dicts=None) # set weighting parameters for the various costs... GIS.set_lam_nll(1.0) GIS.set_lam_kld(1.0) GIS.set_lam_cat(0.0) GIS.set_lam_pea(0.0) GIS.set_lam_ent(0.0) # Set initial learning rate and basic SGD hyper parameters num_updates = hyper_params['num_updates'] learn_rate = hyper_params['learn_rate'] lam_pea = hyper_params['lam_pea'] lam_cat = hyper_params['lam_cat'] lam_ent = hyper_params['lam_ent'] lam_l2w = hyper_params['lam_l2w'] out_name = hyper_params['out_name'] out_file = open(out_name, 'wb') out_file.write("**TODO: More informative output, and maybe a real log**\n") out_file.write("sup_count: {0:d}\n".format(sup_count)) out_file.write("learn_rate: {0:.4f}\n".format(learn_rate)) out_file.write("lam_pea: {0:.4f}\n".format(lam_pea)) out_file.write("lam_cat: {0:.4f}\n".format(lam_cat)) out_file.write("lam_ent: {0:.4f}\n".format(lam_ent)) out_file.write("lam_l2w: {0:.4f}\n".format(lam_l2w)) out_file.flush() GIS.set_lam_l2w(lam_l2w) GIS.set_all_sgd_params(learn_rate=learn_rate, momentum=0.98) for i in range(num_updates): if (i < 100000): # start with some updates only for the VAE (InfNet and GenNet) scale = float(min(i+1, 50000)) / 50000.0 lam_cat = 0.0 lam_pea = 0.0 lam_ent = 0.0 learn_rate_pn = 0.0 else: # move on to updates that include loss from the PeaNet scale = 1.0 lam_cat = hyper_params['lam_cat'] lam_pea = hyper_params['lam_pea'] if i < 150000: lam_ent = float(i - 99999) * hyper_params['lam_ent'] else: lam_ent = hyper_params['lam_ent'] learn_rate_pn = learn_rate if ((i+1 % 100000) == 0): learn_rate = learn_rate * 0.7 # do a minibatch update using unlabeled data if True: # get some data to train with un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) Xd_un = binarize_data(Xtr_un.take(un_idx, axis=0)) Yd_un = Ytr_un.take(un_idx, axis=0) Xc_un = 0.0 * Xd_un Xm_un = 0.0 * Xd_un # do a minibatch update of the model, and compute some costs GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98) GIS.set_lam_nll(1.0) GIS.set_lam_kld(0.01 + (0.99*scale)) GIS.set_lam_cat(0.0) GIS.set_lam_pea(lam_pea) GIS.set_lam_ent(lam_ent) outputs = GIS.train_joint(Xd_un, Xc_un, Xm_un, Yd_un) joint_cost = 1.0 * outputs[0] data_nll_cost = 1.0 * outputs[1] post_kld_cost = 1.0 * outputs[2] post_cat_cost = 1.0 * outputs[3] post_pea_cost = 1.0 * outputs[4] post_ent_cost = 1.0 * outputs[5] other_reg_cost = 1.0 * outputs[6] # do another minibatch update incorporating label information if (i >= 100000): # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) Xd_su = binarize_data(Xtr_su.take(su_idx, axis=0)) Yd_su = Ytr_su.take(su_idx, axis=0) Xc_su = 0.0 * Xd_su Xm_su = 0.0 * Xd_su # update only based on the label-based classification cost GIS.set_all_sgd_params(learn_rate=(scale*learn_rate), momentum=0.98) GIS.set_pn_sgd_params(learn_rate=(scale*learn_rate_pn), momentum=0.98) GIS.set_lam_nll(0.0) GIS.set_lam_kld(0.0) GIS.set_lam_cat(lam_cat) GIS.set_lam_pea(lam_pea) GIS.set_lam_ent(0.0) outputs = GIS.train_joint(Xd_su, Xc_su, Xm_su, Yd_su) post_cat_cost = 1.0 * outputs[3] assert(not (np.isnan(joint_cost))) if ((i % 500) == 0): o_str = "batch: {0:d}, joint_cost: {1:.4f}, nll: {2:.4f}, kld: {3:.4f}, cat: {4:.4f}, pea: {5:.4f}, ent: {6:.4f}, other_reg: {7:.4f}".format( \ i, joint_cost, data_nll_cost, post_kld_cost, post_cat_cost, post_pea_cost, post_ent_cost, other_reg_cost) print(o_str) out_file.write("{}\n".format(o_str)) if ((i % 1000) == 0): # check 
classification error on training and validation set train_err = GIS.classification_error(Xtr_su, Ytr_su) va_err = GIS.classification_error(Xva, Yva) o_str = " tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write("{}\n".format(o_str)) out_file.flush() if ((i % 5000) == 0): file_name = "GIS_SAMPLES_b{0:d}.png".format(i) va_idx = npr.randint(low=0,high=va_samples,size=(5,)) Xd_samps = np.vstack([Xd_un[0:5,:], binarize_data(Xva[va_idx,:])]) Xd_samps = np.repeat(Xd_samps, 3, axis=0) sample_lists = GIS.sample_gis_from_data(Xd_samps, loop_iters=10) Xs = np.vstack(sample_lists["data samples"]) Ys = GIS.class_probs(Xs) Xs = mnist_prob_embed(Xs, Ys) utils.visualize_samples(Xs, file_name) print("TESTING COMPLETE!") out_file.close() return
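The training loop above ramps the KL-divergence weight linearly before enabling the classification-related costs; a compact sketch of that schedule, with the constants taken from the loop:

def kld_scale(i, ramp_steps=50000):
    # Linear warm-up over the first ramp_steps updates, then held at 1.0.
    # The loop above then sets lam_kld = 0.01 + 0.99 * scale.
    return float(min(i + 1, ramp_steps)) / ramp_steps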
def initialize_network(self): """ :description: this method initializes the network, updates, and theano functions for training and retrieving q values. Here's an outline: 1. build the q network and target q network 2. initialize theano symbolic variables used for compiling functions 3. initialize the theano numeric variables used as input to functions 4. formulate the symbolic loss 5. formulate the symbolic updates 6. compile theano functions for training and for getting q_values """ batch_size, input_shape = self.batch_size, self.input_shape lasagne.random.set_rng(self.rng) # 1. build the q network and target q network self.l_out = self.build_network(input_shape, self.num_actions, batch_size) self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) self.reset_target_network() # 2. initialize theano symbolic variables used for compiling functions states = T.tensor4('states') actions = T.icol('actions') rewards = T.col('rewards') next_states = T.tensor4('next_states') # terminals are used to indicate a terminal state in the episode and hence a mask over the future # q values i.e., Q(s',a') terminals = T.icol('terminals') # 3. initialize the theano numeric variables used as input to functions self.states_shape = (batch_size,) + (1,) + input_shape self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # 4. formulate the symbolic loss q_vals = lasagne.layers.get_output(self.l_out, states) next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # a lot of the deepmind work clips the td error at 1 so we do that here # the problem is that gradient backpropagating through this minimum node # will be zero if diff is larger then 1.0 (because changing params before # the minimum does not impact the output of the minimum). To account for # this we take the part of the td error (magnitude) greater than 1.0 and simply # add it to the loss, which allows gradient to backprop but just linearly # in the td error rather than quadratically quadratic_part = T.minimum(abs(diff), 1.0) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + linear_part loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2) # 5. formulate the symbolic updates params = lasagne.layers.helper.get_all_params(self.l_out) updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) # 6. compile theano functions for training and for getting q_values givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, env, args, rng, name = "DQNLasagne"): """ Initializes a network based on the Lasagne Theano framework. Args: env (AtariEnv): The envirnoment in which the agent actuates. args (argparse.Namespace): All settings either with a default value or set via command line arguments. rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator. name (str): The name of the network object. Note: This function should always call the base class first to initialize the common values for the networks. """ _logger.info("Initialize object of type " + str(type(self).__name__)) super(DQNLasagne, self).__init__(env, args, rng, name) self.input_shape = (self.batch_size, self.sequence_length, args.frame_width, args.frame_height) self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8) lasagne.random.set_rng(self.rng) self.network = self._create_layer() # TODO: Load weights from pretrained network?! if not self.args.load_weights == None: self.load_weights(self.args.load_weights) if self.target_update_frequency > 0: self.target_network = self._create_layer() self._copy_theta() states = T.tensor4('states') followup_states = T.tensor4('followup_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX) ) self.followup_states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX) ) self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True) ) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True) ) self.terminals_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True) ) qvalues = lasagne.layers.get_output( self.network, self._prepare_network_input(states) ) if self.target_update_frequency > 0: qvalues_followup_states = lasagne.layers.get_output( self.target_network, self._prepare_network_input(followup_states) ) else: qvalues_followup_states = lasagne.layers.get_output( self.network, self._prepare_network_input(followup_states) ) qvalues_followup_states = theano.gradient.disconnected_grad(qvalues_followup_states) targets = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(qvalues_followup_states, axis=1, keepdims=True) ) errors = targets - qvalues[ T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_error > 0: quadratic_part = T.minimum(abs(errors), self.clip_error) linear_part = abs(errors) - quadratic_part cost_function = T.sum(0.5 * quadratic_part ** 2 + self.clip_error * linear_part) else: cost_function = T.sum(0.5 * errors ** 2) self.params = lasagne.layers.helper.get_all_params(self.network) self.observations = { states: self.states_shared, followup_states: self.followup_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._set_optimizer(cost_function) if self.momentum > 0: self.optimizer = lasagne.updates.apply_momentum( self.optimizer, None, self.momentum ) _logger.debug("Compiling _theano_train") self._theano_train = theano.function( [], [cost_function, qvalues], updates=self.optimizer, givens=self.observations) _logger.debug("Compiling _theano_get_Q") self._theano_get_Q = theano.function( [], qvalues, givens={states: self.states_shared}) self.callback = None _logger.debug("%s" % self)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, update_rule, batch_accumulator, state_count, input_scale=255.0): self.state_count=state_count self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_nature_network_dnn(input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_nature_network_dnn(input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.matrix('states') next_states = T.matrix('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # buffer for the inputs of the whole batch self.states_shared = theano.shared( np.zeros((batch_size, state_count), dtype=theano.config.floatX)) # buffer for the state each transition ends up in self.next_states_shared = theano.shared( np.zeros((batch_size, state_count), dtype=theano.config.floatX)) # one reward per episode, so what about individual actions? self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) # one chosen action per episode self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # ?? probably 0 or 1, whether it is the last value or not self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # takes q_vals and next q_vals and returns the differences for the batch, all of this only for the first pass q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # unclear if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) if batch_accumulator == 'sum': loss = T.sum(diff ** 2) elif batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) # params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'adam': updates = lasagne.updates.adam(loss, params, self.lr, self.rho, self.rho, self.rms_epsilon) elif update_rule == 'adagrad': updates = lasagne.updates.adagrad(loss, params, self.lr, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) elif update_rule == 'momentum': updates = lasagne.updates.momentum(loss, params, self.lr,
self.momentum) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, update_rule, batch_accumulator, state_count, input_scale=255.0): self.state_count = state_count self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_nature_network_dnn(input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_nature_network_dnn( input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.matrix('states') next_states = T.matrix('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # buffer for the inputs of the whole batch self.states_shared = theano.shared( np.zeros((batch_size, state_count), dtype=theano.config.floatX)) # buffer for the state each transition ends up in self.next_states_shared = theano.shared( np.zeros((batch_size, state_count), dtype=theano.config.floatX)) # one reward per episode, so what about individual actions? self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) # one chosen action per episode self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # ?? probably 0 or 1, whether it is the last value or not self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # takes q_vals and next q_vals and returns the differences for the batch, all of this only for the first pass q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) # unclear if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) if batch_accumulator == 'sum': loss = T.sum(diff**2) elif batch_accumulator == 'mean': loss = T.mean(diff**2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) # params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'adam': updates = lasagne.updates.adam(loss, params, self.lr, self.rho, self.rho, self.rms_epsilon) elif update_rule == 'adagrad': updates = lasagne.updates.adagrad(loss, params, self.lr, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) elif update_rule == 'momentum': updates = lasagne.updates.momentum(loss, params, self.lr,
self.momentum) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
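Note that the two snippets above clip the TD error itself and then square it, which is not the same as the quadratic-plus-linear loss used in the earlier constructors: once |diff| exceeds clip_delta, the clipped value no longer depends on diff, so the gradient vanishes in that region. A small NumPy illustration of the difference:

import numpy as np

clip_delta, diff = 1.0, 3.0

# Clip-then-square (as above): the loss is flat for |diff| > clip_delta,
# so its gradient with respect to diff is zero there.
loss_clip_square = np.clip(diff, -clip_delta, clip_delta) ** 2

# Quadratic-plus-linear (as in the other snippets): the gradient
# saturates at +/- clip_delta instead of vanishing.
quadratic_part = min(abs(diff), clip_delta)
linear_part = abs(diff) - quadratic_part
loss_huber_like = 0.5 * quadratic_part ** 2 + clip_delta * linear_part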
def icol(name): return T.icol(name)
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, update_rule, batch_accumulator, randomState, frame_scale=255.0): """ Initialize environment Arguments: environment - the environment (class Env) num_elements_in_batch - list of k integers for the number of each element kept as belief state num_actions - int discount - float learning_rate - float rho, rms_epsilon, momentum - float, float, float ... network_type - string ... """ self._environment = environment self._batchSize = batchSize self._inputDimensions = self._environment.inputDimensions() self._nActions = self._environment.nActions() self._df = 0 self.rho = rho self._lr = 0 self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self._randomState = randomState lasagne.random.set_rng(self._randomState) self.update_counter = 0 states=[] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states=[] # idem than states at t+1 self.states_shared=[] # list of shared variable for each of the k element in the belief state self.next_states_shared=[] # idem that self.states_shared at t+1 for i, dim in enumerate(self._inputDimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out) next_q_vals = lasagne.layers.get_output(self.next_l_out) max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True) T_ones_like=T.ones_like(T.ones_like(terminals) - terminals) target = rewards + T_ones_like * thediscount * max_next_q_vals q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1)) diff = target - q_val if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. 
To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) for conv_param in self.l_outs_conv: for p in lasagne.layers.helper.get_all_params(conv_param): params.append(p) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x if update_rule == 'deepmind_rmsprop': grads = get_or_compute_grads(loss, params) updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, thelr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], q_vals, givens=givens2, on_unused_input='warn')
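# Note on the target above: T_ones_like = T.ones_like(T.ones_like(terminals) - terminals)
# evaluates to a tensor of ones, so terminal transitions are never masked out
# of the bootstrap term. A minimal sketch of the conventional masked target,
# assuming `terminals` holds 0/1 int32 flags as in the other snippets here:
import theano.tensor as T

def masked_q_target(rewards, terminals, discount, next_q_vals):
    not_terminals = T.ones_like(terminals) - terminals  # 1 if non-terminal, else 0
    return rewards + not_terminals * discount * T.max(next_q_vals, axis=1,
                                                      keepdims=True)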
def __init__(self, env, args, rng, name="DQNLasagne"): """ Initializes a network based on the Lasagne Theano framework. Args: env (AtariEnv): The envirnoment in which the agent actuates. args (argparse.Namespace): All settings either with a default value or set via command line arguments. rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator. name (str): The name of the network object. Note: This function should always call the base class first to initialize the common values for the networks. """ _logger.info("Initialize object of type " + str(type(self).__name__)) super(DQNLasagne, self).__init__(env, args, rng, name) self.input_shape = (self.batch_size, self.sequence_length, args.frame_width, args.frame_height) self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8) lasagne.random.set_rng(self.rng) self.network = self._create_layer() # TODO: Load weights from pretrained network?! if not self.args.load_weights == None: self.load_weights(self.args.load_weights) if self.target_update_frequency > 0: self.target_network = self._create_layer() self._copy_theta() states = T.tensor4('states') followup_states = T.tensor4('followup_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX)) self.followup_states_shared = theano.shared( np.zeros(self.input_shape, dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) qvalues = lasagne.layers.get_output( self.network, self._prepare_network_input(states)) if self.target_update_frequency > 0: qvalues_followup_states = lasagne.layers.get_output( self.target_network, self._prepare_network_input(followup_states)) else: qvalues_followup_states = lasagne.layers.get_output( self.network, self._prepare_network_input(followup_states)) qvalues_followup_states = theano.gradient.disconnected_grad( qvalues_followup_states) targets = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(qvalues_followup_states, axis=1, keepdims=True)) errors = targets - qvalues[T.arange(self.batch_size), actions.reshape((-1, ))].reshape((-1, 1)) if self.clip_error > 0: quadratic_part = T.minimum(abs(errors), self.clip_error) linear_part = abs(errors) - quadratic_part cost_function = T.sum(0.5 * quadratic_part**2 + self.clip_error * linear_part) else: cost_function = T.sum(0.5 * errors**2) self.params = lasagne.layers.helper.get_all_params(self.network) self.observations = { states: self.states_shared, followup_states: self.followup_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._set_optimizer(cost_function) if self.momentum > 0: self.optimizer = lasagne.updates.apply_momentum( self.optimizer, None, self.momentum) _logger.debug("Compiling _theano_train") self._theano_train = theano.function([], [cost_function, qvalues], updates=self.optimizer, givens=self.observations) _logger.debug("Compiling _theano_get_Q") self._theano_get_Q = theano.function( [], qvalues, givens={states: self.states_shared}) self.callback = None _logger.debug("%s" % self)
def __init__(self, num_actions, phi_length, width, height, discount, learning_rate, decay, momentum=0, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.decay = decay self.momentum = momentum self.scale_input_by = 255.0 # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append(layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append(cc_layers.ShuffleBC01ToC01BLayer( self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append(cc_layers.ShuffleC01BToBC01Layer( self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append(layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers)-1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen... 
    mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
    diff_masked = diff * mask
    error = T.mean(diff_masked ** 2)
    self._loss = error * diff_masked.shape[1]

    self._parameters = layers.all_parameters(self.q_layers[-1])

    self._idx = T.lscalar('idx')

    # CREATE VARIABLES FOR INPUT AND OUTPUT
    self.states_shared = theano.shared(
        np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
    self.states_shared_next = theano.shared(
        np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
    self.rewards_shared = theano.shared(
        np.zeros((1, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actions_shared = theano.shared(
        np.zeros((1, 1), dtype='int32'), broadcastable=(False, True))

    self._givens = {
        self.q_layers[0].input_var:
            self.states_shared[self._idx * self._batch_size:
                               (self._idx + 1) * self._batch_size, :, :, :],
        self.next_layers[0].input_var:
            self.states_shared_next[self._idx * self._batch_size:
                                    (self._idx + 1) * self._batch_size, :, :, :],
        self.rewards:
            self.rewards_shared[self._idx * self._batch_size:
                                (self._idx + 1) * self._batch_size, :],
        self.actions:
            self.actions_shared[self._idx * self._batch_size:
                                (self._idx + 1) * self._batch_size, :]
    }

    if self.momentum != 0:
        self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(
            self._loss, self._parameters, learning_rate=self.learning_rate,
            rho=self.decay, momentum=self.momentum, epsilon=1e-6)
    else:
        self._updates = layers.gen_updates_rmsprop(
            self._loss, self._parameters, learning_rate=self.learning_rate,
            rho=self.decay, epsilon=1e-6)

    self._train = theano.function([self._idx], self._loss,
                                  givens=self._givens, updates=self._updates)
    self._compute_loss = theano.function([self._idx], self._loss,
                                         givens=self._givens)
    self._compute_q_vals = theano.function(
        [self.q_layers[0].input_var], self.q_layers[-1].predictions(),
        on_unused_input='ignore')
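# build_mask() is defined elsewhere and not shown here. An equivalent
# one-hot action mask can be written with T.eq, the way several other
# networks in this document do; a small illustrative sketch:
import theano
import theano.tensor as T

def action_mask(actions, num_actions):
    # rows of zeros with a single 1.0 in the column of the chosen action
    return T.eq(T.arange(num_actions).reshape((1, -1)),
                actions.reshape((-1, 1))).astype(theano.config.floatX)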
def train(): # initialize game direction, launchBubble, newBubble, arrow, bubbleArray, nextBubble, score, alive, shots, getout, loss_game = restartGame( ) # hyperparameters epsilon = 0.9 # counters moves = 0 wins = 0 gameover = 0 games = 0 average_loss = 0 average_reward = 0 # with or without display display = False delay = 0 # Tensor types STATE = T.tensor4() NEWSTATE = T.tensor4() REWARD = T.icol() DISCOUNT = T.col() ACTION = T.icol() # building network network = build_network() target_network = build_network() # get parameters from trained network """ with np.load('5_colours_20shots.npz') as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] lasagne.layers.set_all_param_values(network, param_values)""" params = lasagne.layers.get_all_params(network) all_params = lasagne.layers.helper.get_all_param_values(network) lasagne.layers.helper.set_all_param_values(target_network, all_params) # get maximum q_value and particular action qvals = lasagne.layers.get_output(network, STATE) bestAction = qvals.argmax(-1) qval = qvals[0][ACTION] # get max Q_value of next state next_q_vals = lasagne.layers.get_output(target_network, NEWSTATE) maxNextValue = next_q_vals.max() # loss function with Stochastic Gradient Descent target = (REWARD + DISCOUNT * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - qvals[T.arange(BATCHSIZE), ACTION.reshape((-1, ))].reshape((-1, 1)) loss = 0.5 * diff**2 loss = T.mean(loss) grad = T.grad(loss, params) updates = lasagne.updates.rmsprop(grad, params, learning_rate) updates = lasagne.updates.apply_momentum(updates, params, 0.9) # theano function for training and predicting q_values f_train = theano.function([STATE, ACTION, REWARD, NEWSTATE, DISCOUNT], loss, updates=updates, allow_input_downcast=True) f_predict = theano.function([STATE], bestAction, allow_input_downcast=True) f_qvals = theano.function([STATE], qvals, allow_input_downcast=True) f_max = theano.function([NEWSTATE], maxNextValue, allow_input_downcast=True) # get state state = gameState(bubbleArray, newBubble.color) while moves < ITERATIONS: if display == True: DISPLAYSURF.fill(BGCOLOR) # act random or greedy chance = random.uniform(0, 1) launchBubble = True if chance < epsilon: action = random.randint(0, NUMBEROFACTIONS - 1) else: predict_state = np.reshape(state, (1, 8, GRIDSIZE * 2, ARRAYWIDTH * 2)) action = int(f_predict(predict_state)) direction = (action * 8) + 10 newBubble.angle = direction # process game bubbleArray, alive, deleteList, nextBubble = processGame( launchBubble, newBubble, bubbleArray, score, arrow, direction, alive, display, delay) # get reward for the action getout, wins, reward, gameover = getReward(alive, getout, wins, deleteList, gameover) # getting new bubble for shooting newBubble = Bubble(nextBubble.color) newBubble.angle = arrow.angle # get the newstate newState = gameState(bubbleArray, newBubble.color) # storage of replay memory if getout == True: REPLAYMEMORY.append((state, action, reward, newState, 0)) else: REPLAYMEMORY.append((state, action, reward, newState, discount)) # delete one tuple is replay memory becomes too big if len(REPLAYMEMORY) > size_RM: REPLAYMEMORY.pop(0) # training the network states, actions, rewards, newstates, discounts = get_batch() loss = f_train(states, actions, rewards, newstates, discounts) average_loss = average_loss + loss average_reward = average_reward + reward if moves % 1000 == 0 and moves > 0: print("Amount of actions taken: ", moves) print("Average loss: ", average_loss / 1000.0) print("Average Reward: ", average_reward / 
1000.0) print("Amount of wins: ", wins) average_reward = 0 average_loss = 0 if epsilon > 0.1: epsilon = epsilon - 0.01 # updating the target network if moves % 2500 == 0: target_network = build_network() all_param_values = lasagne.layers.get_all_param_values(network) lasagne.layers.set_all_param_values(target_network, all_param_values) # change the state to newState state = newState moves = moves + 1 shots = shots + 1 if getout == True or shots == AMOUNTOFSHOTS: games = games + 1 direction, launchBubble, newBubble, arrow, bubbleArray, nextBubble, score, alive, shots, getout, loss_game = restartGame( ) state = gameState(bubbleArray, newBubble.color) # saving parameters of the network np.savez('model.npz', *lasagne.layers.get_all_param_values(network)) return network
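# get_batch() is used above but not shown. A minimal sketch that is
# consistent with how REPLAYMEMORY tuples are stored,
# (state, action, reward, newState, discount), and with the f_train
# signature; the uniform sampling is an assumption:
import random
import numpy as np

def get_batch():
    batch = random.sample(REPLAYMEMORY, BATCHSIZE)
    states, actions, rewards, newstates, discounts = zip(*batch)
    return (np.stack(states),
            np.array(actions, dtype=np.int32).reshape(-1, 1),
            np.array(rewards, dtype=np.int32).reshape(-1, 1),
            np.stack(newstates),
            np.array(discounts, dtype=np.float32).reshape(-1, 1))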
def __init__(self, stateSize, actionSize, numFrames, batchSize, discount, rho, momentum, learningRate, rmsEpsilon, rng, updateRule, batchAccumulator, freezeInterval): self.stateSize = stateSize self.actionSize = actionSize self.numFrames = numFrames self.batchSize = batchSize self.discount = discount self.rho = rho self.momentum = momentum self.learningRate = learningRate self.rmsEpsilon = rmsEpsilon self.rng = rng self.updateRule = updateRule self.batchAccumulator = batchAccumulator self.freezeInterval = freezeInterval lasagne.random.set_rng(self.rng) self.updateCounter = 0 self.lOut = self.buildNetwork(self.stateSize, self.actionSize, self.numFrames, self.batchSize) if self.freezeInterval > 0: self.nextLOut = self.buildNetwork(self.stateSize, self.actionSize, self.numFrames, self.batchSize) self.resetQHat() states = T.ftensor3('states') nextStates = T.ftensor3('nextStates') rewards = T.fcol('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Shared variables for teaching from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) states, along with the chosen action and resulting # reward and termninal status. self.states_shared = theano.shared( numpy.zeros((self.batchSize, self.numFrames + 1, self.stateSize), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(numpy.zeros( (self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(numpy.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(numpy.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate qVals self.state_shared = theano.shared( numpy.zeros((self.numFrames, self.stateSize), dtype=theano.config.floatX)) qVals = lasagne.layers.get_output(self.lOut, states) if self.freezeInterval > 0: nextQVals = lasagne.layers.get_output(self.nextLOut, nextStates) else: nextQVals = lasagne.layers.get_output(self.lOut, nextStates) nextQVals = theano.gradient.disconnected_grad(nextQVals) # Cast terminals to floatX terminalsX = terminals.astype(theano.config.floatX) # T.eq(a,b) returns a variable representing the nogical # EQuality (a==b) actionmask = T.eq( T.arange(self.actionSize).reshape((1, -1)), actions.reshape( (-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(nextQVals, axis=1, keepdims=True)) output = (qVals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output # no if clip delta, since clip-delta=0 loss = (diff**2) if self.batchAccumulator == 'sum': loss = T.sum(loss) elif self.batchAccumulator == 'mean': loss = T.mean(loss) else: raise ValueError('Bad accumulator: {}'.format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.lOut) train_givens = { states: self.states_shared[:, :-1], nextStates: self.states_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if self.updateRule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.learningRate, self.rho, self.rmsEpsilon) elif self.updateRule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.learningRate, self.rho, self.rmsEpsilon) else: raise ValueError('Unrecognized update: {}'.format(updateRule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, 
givens=train_givens) q_givens = { states: self.state_shared.reshape( (1, self.numFrames, self.stateSize)) } self._q_vals = theano.function([], qVals[0], givens=q_givens)
def __init__(self, input_size, output_size, hidden_units, train_iterations=50000, eps=1.0, batch_size=10, discount_factor=0.0, reg_factor=0.0, lr=0.001, train=True): self.batch_size = 10 self.init_eps = eps self.epsilon = eps self.iterations = train_iterations self.num_experiences = 0 self.train = train state_length = 1 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, state_length) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, state_length) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int32')) # create 2 separate neural network l_inA = lasagne.layers.InputLayer((None, state_length), State) l_inB = lasagne.layers.InputLayer((None, state_length), State) for units in hidden_units: l_hiddenA = lasagne.layers.DenseLayer( l_inA, num_units=units, nonlinearity=lasagne.nonlinearities.rectify) l_hiddenB = lasagne.layers.DenseLayer( l_inB, num_units=units, nonlinearity=lasagne.nonlinearities.rectify) l_inA = l_hiddenA l_inB = l_hiddenB self._l_outA = lasagne.layers.DenseLayer( l_inA, num_units=output_size, nonlinearity=lasagne.nonlinearities.linear) self._l_outB = lasagne.layers.DenseLayer( l_inB, num_units=output_size, nonlinearity=lasagne.nonlinearities.linear) self._learning_rate = lr self._discount_factor = discount_factor self._rho = 0.95 self._rms_epsilon = 0.005 self._weight_update_steps = 100 self._updates = 0 self._states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._next_states_shared = theano.shared( np.zeros((batch_size, state_length), dtype=theano.config.floatX)) self._rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self._actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True), allow_downcast=True) self._q_valsA = lasagne.layers.get_output(self._l_outA, State) self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState) self._q_func = self._q_valsA[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1)) target = ( Reward + # (T.ones_like(terminals) - terminals) * self._discount_factor * T.max(self._q_valsB, axis=1, keepdims=True)) diff = target - self._q_valsA[T.arange(batch_size), Action.reshape((-1, ))].reshape((-1, 1)) loss = 0.5 * diff**2 loss = T.mean(loss) params = lasagne.layers.helper.get_all_params(self._l_outA) givens = { State: self._states_shared, ResultState: self._next_states_shared, Reward: self._rewards_shared, Action: self._actions_shared, } # SGD update updates = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho, self._rms_epsilon) # TD update # updates = lasagne.updates.rmsprop(T.mean(self._q_func), params, self._learning_rate * -T.mean(diff), self._rho, # self._rms_epsilon) self._train = theano.function([], [loss, self._q_valsA], updates=updates, givens=givens) self._q_vals = theano.function([], self._q_valsA, givens={State: self._states_shared}) self._bellman_error = theano.function( inputs=[State, Action, Reward, ResultState], outputs=diff, allow_input_downcast=True)
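# Illustrative call of the compiled _bellman_error function above. `agent`
# is a hypothetical instance of the class, with batch_size=10 and
# state_length=1 as in __init__; allow_input_downcast handles the dtypes.
import numpy as np

states = np.random.rand(10, 1)
next_states = np.random.rand(10, 1)
rewards = np.random.rand(10, 1)
actions = np.zeros((10, 1), dtype='int32')
td_errors = agent._bellman_error(states, actions, rewards, next_states)  # shape (10, 1)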
def initialize_network(self): """ :description: this method initializes the network, updates, and theano functions for training and retrieving q values. Here's an outline: 1. build the q network and target q network 2. initialize theano symbolic variables used for compiling functions 3. initialize the theano numeric variables used as input to functions 4. formulate the symbolic loss 5. formulate the symbolic updates 6. compile theano functions for training and for getting q_values """ batch_size, input_shape = self.batch_size, self.input_shape lasagne.random.set_rng(self.rng) # 1. build the q network and target q network self.l_out = self.build_network(input_shape, self.num_actions, batch_size) self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size) self.reset_target_network() # 2. initialize theano symbolic variables used for compiling functions states = T.tensor4('states') actions = T.icol('actions') rewards = T.col('rewards') next_states = T.tensor4('next_states') # terminals are used to indicate a terminal state in the episode and hence a mask over the future # q values i.e., Q(s',a') terminals = T.icol('terminals') # 3. initialize the theano numeric variables used as input to functions self.states_shape = (batch_size, ) + (1, ) + input_shape self.states_shared = theano.shared( np.zeros(self.states_shape, dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros(self.states_shape, dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # 4. formulate the symbolic loss q_vals = lasagne.layers.get_output(self.l_out, states) next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector' diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) # a lot of the deepmind work clips the td error at 1 so we do that here # the problem is that gradient backpropagating through this minimum node # will be zero if diff is larger then 1.0 (because changing params before # the minimum does not impact the output of the minimum). To account for # this we take the part of the td error (magnitude) greater than 1.0 and simply # add it to the loss, which allows gradient to backprop but just linearly # in the td error rather than quadratically quadratic_part = T.minimum(abs(diff), 1.0) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part**2 + linear_part loss = T.mean(loss) + self.regularization * regularize_network_params( self.l_out, l2) # 5. formulate the symbolic updates params = lasagne.layers.helper.get_all_params(self.l_out) updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate) # 6. 
compile theano functions for training and for getting q_values givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._get_q_values = theano.function( [], q_vals, givens={states: self.states_shared})
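# The clipped error above (quadratic inside [-1, 1], linear outside, so the
# gradient magnitude never exceeds the clip point) can be factored into a
# small helper; a sketch with a configurable clip point that reduces to the
# 0.5 * quadratic**2 + linear form of this snippet when delta == 1.0:
import theano.tensor as T

def clipped_td_loss(diff, delta=1.0):
    quadratic_part = T.minimum(abs(diff), delta)
    linear_part = abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + delta * linear_part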
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng # print "NETWORK---------------------------" # print "input width ", self.input_width # print "input height", self.input_height # print "num actiuons", self.num_actions # print "num frames", self.num_frames # print "batch size", self.batch_size # print "discount", self.discount # print "rho", self.rho # print "lr", self.lr # print "rms_epsilon", self.rms_epsilon # print "momentum", self.momentum # print "clip_delta", self.clip_delta # print "freeze_ intercal", self.freeze_interval # print "rng", self.rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed state transitions, # each consisting of num_frames + 1 (due to overlap) images, along with # the chosen action and resulting reward and termnial status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals self.state_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. 
# # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
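# reset_q_hat() is called above but not shown. A sketch of the usual
# parameter copy into the frozen target network, consistent with how other
# snippets in this document synchronize their target networks (an
# assumption, not the original implementation):
import lasagne

def reset_q_hat(self):
    all_params = lasagne.layers.helper.get_all_param_values(self.l_out)
    lasagne.layers.helper.set_all_param_values(self.next_l_out, all_params)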
def __init__(self, n_time, input_width, input_height, num_hidden, num_LSTM_units, discount, learning_rate, rho, rms_epsilon, momentum, batch_size, update_rule, actions, file='', clip_delta=0, input_scale=1.0): CompleteLearner.__init__(self, actions, file) self.input_width = input_width self.input_height = input_height self.num_actions = len(actions) self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.rng = lasagne.random.get_rng() self.cycles = 0 self.batch_size = batch_size self.n_time = n_time #lasagne.random.set_rng(self.rng) self.update_counter = 0 self.network = self.build_network( (n_time, batch_size, input_width, input_height), num_hidden, num_LSTM_units, self.num_actions) states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) images, along with the chosen action and resulting # reward (no terminal state) self.obss_shared = theano.shared( np.zeros((batch_size, n_time, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # no terminal states # Shared variable for a single state, to calculate q_vals. self.state_shared = theano.shared( np.zeros((input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.network, states / input_scale) next_q_vals = lasagne.layers.get_output(self.network, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) actionmask = T.eq( T.arange(self.num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part else: loss = 0.5 * diff**2 batch_accumulator = 'mean' if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.network) train_givens = { states: self.obss_shared[:, :-1], #get all except the last next_states: self.obss_shared[:, 1:], #get all except the first rewards: self.rewards_shared, actions: self.actions_shared, } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rms_prop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'adam': updates = lasagne.updates.adam(loss, params, self.lr, epsilon=self.rms_epsilon) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
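# Small numeric illustration (not from the original) of the overlapping
# slicing used in train_givens above: one buffer of consecutive observations
# yields `states` (all but the last step) and `next_states` (all but the
# first), shifted by one time step.
import numpy as np

buf = np.arange(5)                       # stand-in for one sample's time axis
states, next_states = buf[:-1], buf[1:]  # [0 1 2 3] and [1 2 3 4]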
Ytr_su = Ytr_su[:,np.newaxis] # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get size information for the data un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] # set up some symbolic variables for input to the PeaNetSeq Xp = T.matrix('Xp_base') Xd = T.matrix('Xd_base') Xc = T.matrix('Xc_base') Xm = T.matrix('Xm_base') Yd = T.icol('Yd_base') # set some "shape" parameters for the networks data_dim = Xtr_un.shape[1] label_dim = 10 prior_dim = 25 prior_sigma = 1.0 batch_size = 100 # we'll take 2x this per batch, for sup and unsup ################################################################# # Construct the generator and inferencer to use for conditional # # generation of adversarial examples. # ################################################################# # Choose some parameters for the generator network gn_params = {} gn_config = [prior_dim, 800, 800, data_dim] gn_params['mlp_config'] = gn_config
def __init__(self, input_width, input_height, num_channels, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, network_params, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_channels = num_channels self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.lstm = None self.next_lstm = None logging.debug('network parameters', network_params) self.network_params = network_params lasagne.random.set_rng(self.rng) self.update_counter = 0 networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(networks, tuple): self.l_out = networks[0] self.lstm = networks[1] else: self.l_out = networks # theano.compile.function_dump('network.dump', self.l_out) if self.freeze_interval > 0: next_networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(next_networks, tuple): self.next_l_out = next_networks[0] self.next_lstm = next_networks[1] else: self.next_l_out = next_networks self.reset_q_hat() # This really really needs to be floats for now. # It makes sense if they use it for computations btensor5 = T.TensorType(theano.config.floatX, (False,) * 5) states = btensor5('states') next_states = btensor5('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Apparently needed for some layers with a variable input size # Weird, because the others just allow a None batch size, # but let's just play safe for now # For now, it should always look exactly like states # (n_batch, n_time_steps) # mask = T.imatrix('mask') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='states') self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='next_states') self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True), name='rewards') self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True), name='actions') self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.mask_shared = theano.shared(np.ones((batch_size, num_frames), # dtype='int32')) # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) # mask_input=mask) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale ) # mask_input=mask) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale ) # mask_input=mask) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(target.shape[0]), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the 
gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) # print params givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) updates = update_for(params) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) # # Super mega shady stuff # # Somehow an update sneaks in for cell and hid. Kill it with fire if self.lstm: delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']] # print delete_keys for key in delete_keys: del updates[key] self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength, networkType = "conv", updateRule = "deepmind_rmsprop", batchAccumulator = "sum", clipDelta = 1.0, inputScale = 255.0): self.batchSize = batchSize self.numFrames = numFrames self.inputWidth = inputWidth self.inputHeight = inputHeight self.inputScale = inputScale self.numActions = numActions self.discountRate = discountRate self.learningRate = learningRate self.rho = rho self.rms_epsilon = rms_epsilon self.momentum = momentum self.networkUpdateDelay = networkUpdateDelay self.useSARSAUpdate = useSARSAUpdate self.kReturnLength = kReturnLength self.networkType = networkType self.updateRule = updateRule self.batchAccumulator = batchAccumulator self.clipDelta = clipDelta self.updateCounter = 0 states = T.tensor4("states") nextStates = T.tensor4("nextStates") rewards = T.col("rewards") actions = T.icol("actions") nextActions= T.icol("nextActions") terminals = T.icol("terminals") self.statesShared = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.nextStatesShared = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX)) self.rewardsShared = theano.shared(np.zeros((self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.qValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale) if self.networkUpdateDelay > 0: self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork( self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType) self.resetNextQValueNetwork() nextQValues = lasagne.layers.get_output(self.nextQValueNetwork, nextStates / self.inputScale) else: nextQValues = lasagne.layers.get_output(self.qValueNetwork, nextStates / self.inputScale) nextQValues = theano.gradient.disconnected_grad(nextQValues) if self.useSARSAUpdate: target = rewards + terminals * (self.discountRate ** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape((-1,))].reshape((-1, 1)) else: target = rewards + terminals * (self.discountRate ** self.kReturnLength) * T.max(nextQValues, axis = 1, keepdims = True) targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1)) quadraticPart = T.minimum(abs(targetDifference), self.clipDelta) linearPart = abs(targetDifference) - quadraticPart # if self.clipDelta > 0: # targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta) if self.batchAccumulator == "sum": # loss = T.sum(targetDifference ** 2) loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart) elif self.batchAccumulator == "mean": # loss = T.mean(targetDifference ** 2) loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart) else: raise ValueError("Bad Network Accumulator. 
{sum, mean} expected") networkParameters = lasagne.layers.helper.get_all_params(self.qValueNetwork) if self.updateRule == "deepmind_rmsprop": updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "rmsprop": updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon) elif self.updateRule == "sgd": updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate) else: raise ValueError("Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected") if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) lossGivens = { states: self.statesShared, nextStates: self.nextStatesShared, rewards: self.rewardsShared, actions: self.actionsShared, nextActions: self.nextActionsShared, terminals: self.terminalsShared } self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn') self.__computeQValues = theano.function([], qValues, givens={states: self.statesShared})
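# The bootstrap above multiplies by discountRate ** kReturnLength, which
# assumes the stored reward entry already holds the discounted k-step
# return; a tiny sketch of that quantity (illustrative only, the replay-side
# accumulation is not shown in this snippet):
def k_step_return(rewards, discount):
    return sum((discount ** i) * r for i, r in enumerate(rewards))

k_step_return([1.0, 0.0, 2.0], 0.99)   # = 1.0 + 0.0 + 0.99**2 * 2.0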
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0, double=False, transition_length=4): if double: print 'USING DOUBLE DQN' self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states_t') actions = T.icol('actions_t') target = T.col('evaluation_t') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.target_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.states_transition_shared = theano.shared( np.zeros((batch_size, transition_length * 2, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.states_one_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) """get Q(s) batch_size = 1 """ q1_givens = { states: self.states_one_shared.reshape( (1, self.num_frames, self.input_height, self.input_width)) } self._q1_vals = theano.function([], q_vals[0], givens=q1_givens) """get Q(s) batch_size = batch size """ q_batch_givens = { states: self.states_shared.reshape((self.batch_size, self.num_frames, self.input_height, self.input_width)) } self._q_batch_vals = theano.function([], q_vals, givens=q_batch_givens) action_mask = T.eq( T.arange(num_actions).reshape((1, -1)), actions.reshape( (-1, 1))).astype(theano.config.floatX) q_s_a = (q_vals * action_mask).sum(axis=1).reshape((-1, 1)) """ get Q(s,a) batch_size = batch size """ q_s_a_givens = { states: self.states_shared.reshape((self.batch_size, self.num_frames, self.input_height, self.input_width)), actions: self.actions_shared } self._q_s_a_vals = theano.function([], q_s_a, givens=q_s_a_givens) if self.freeze_interval > 0: q_target_vals = lasagne.layers.get_output(self.next_l_out, states / input_scale) else: q_target_vals = lasagne.layers.get_output(self.l_out, states / input_scale) q_target_vals = theano.gradient.disconnected_grad(q_target_vals) if not double: q_target = T.max(q_target_vals, axis=1) else: greedy_actions = T.argmax(q_vals, axis=1) q_target_mask = T.eq( T.arange(num_actions).reshape((1, -1)), greedy_actions.reshape((-1, 1)).astype(theano.config.floatX)) q_target = (q_target_vals * q_target_mask).sum(axis=1).reshape( (-1, 1)) """get Q target Q'(s,a') for a batch of transitions batch size = batch_size * transition length""" q_target_transition_givens = { states: self.states_transition_shared.reshape( (batch_size * transition_length * 2, self.num_frames, self.input_height, self.input_width)) } self._q_target = theano.function([], 
q_target.reshape( (batch_size, transition_length * 2)), givens=q_target_transition_givens) """get Q target_vals Q'(s) for a batch of transitions batch size = batch_size * transition length""" self._q_target_vals = theano.function( [], q_target_vals.reshape( (batch_size, transition_length * 2, num_actions)), givens=q_target_transition_givens) diff = q_s_a - target if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part else: loss = 0.5 * diff**2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) """Q(s,a) target train()""" train_givens = { states: self.states_shared, actions: self.actions_shared, target: self.target_shared } self._train = theano.function([], [loss], updates=updates, givens=train_givens, on_unused_input='warn') self._train2 = theano.function([], [loss], updates=updates, givens=train_givens, on_unused_input='warn')
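# Small numeric illustration (not from the original) of the double-DQN
# branch above: the online network's Q-values pick the greedy action, while
# the target network's Q-values evaluate it.
import numpy as np

q_online = np.array([[1.0, 3.0], [2.0, 0.5]])   # Q from l_out
q_target = np.array([[1.2, 0.9], [1.5, 0.1]])   # Q from next_l_out
greedy = q_online.argmax(axis=1)                # [1, 0]
double_q = q_target[np.arange(2), greedy]       # [0.9, 1.5]
vanilla_max = q_target.max(axis=1)              # [1.2, 1.5] -- can differ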
def __init__(self, rng=None, Xd=None, \ g_net=None, i_net=None, pn_seq=None, \ data_dim=None, prior_dim=None, \ params=None): # setup a rng for this AEDPair self.rng = RandStream(rng.randint(100000)) if (params is None): self.params = {} else: self.params = params if 'match_type' in params: self.match_type = params['match_type'] else: self.match_type = 'grad_sign' # we can only try to match sign or direction... assert((self.match_type == 'grad_dir') or \ (self.match_type == 'grad_sign')) if self.match_type == 'grad_dir': # we match the direction of the gradient under the assumption # of gaussian observation noise self.mean_transform = lambda x: max_normalize(x, axis=1) assert(g_net.out_type == 'gaussian') else: # we match the sign of the gradient as if it were a collection # of independent binary variables self.mean_transform = lambda x: 2.0 * (x - 0.5) assert(g_net.out_type == 'bernoulli') # record the symbolic variables that will provide inputs to the # computation graph created to describe this AEDPair self.Xd = Xd self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq self.Xc = 0.0 * self.Xd self.Xm = 0.0 * self.Xd self.obs_count = T.cast(Xd.shape[0], 'floatX') # create a "shared-parameter" clone of the inferencer, set up to # receive input from the appropriate symbolic variables. self.IN = i_net.shared_param_clone(rng=rng, \ Xd=self.Xd, Xc=self.Xc, Xm=self.Xm) self.policy_mean = self.IN.output_mean self.policy_logvar = self.IN.output_logvar # capture a handle for samples from the variational posterior self.Xp = self.IN.output # create a "shared-parameter" clone of the generator, set up to # receive input from samples from the variational posterior self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output) # set up a var for controlling the max-norm bound on perturbations zero_ary = np.zeros((1,)).astype(theano.config.floatX) self.lam_mnb = theano.shared(value=zero_ary, \ name='adp_lam_mnb') self.set_lam_mnb(lam_mnb=0.1) # get the perturbations output by the generator network self.Pg = self.mean_transform(self.GN.output) if self.match_type == 'grad_dir': # samples because we're matching gradient via squared error self.Pg_samples = self.mean_transform(self.GN.output_samples) else: # no samples, because we're matching gradient sign self.Pg_samples = self.mean_transform(self.GN.output) # record and validate the data dimensionality parameters self.data_dim = data_dim self.prior_dim = prior_dim # output of the generator and input to the inferencer should both be # equal to self.data_dim assert(self.data_dim == self.GN.mlp_layers[-1].out_dim) assert(self.data_dim == self.IN.shared_layers[0].in_dim) # input of the generator and mu/sigma outputs of the inferencer should # both be equal to self.prior_dim assert(self.prior_dim == self.GN.mlp_layers[0].in_dim) assert(self.prior_dim == self.IN.mu_layers[-1].out_dim) assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim) # make a clone of the target PeaNetSeq that takes perturbed inputs self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \ seq_Xd=[self.Xd, self.Xd], seq_Yd=[self.Yd, self.Yd], \ no_funcs=True) self.grad_pea_Xd = T.grad(self.PNS.joint_cost, self.Xd) if self.match_type == 'grad_dir': # turn gradient into a unit max-normalized vector self.match_target = max_normalize(self.grad_pea_Xd) else: # transform gradient into binary indicators of sign self.match_target = (self.grad_pea_Xd > 0.0) # get the symbolic vars for passing inputs to self.PNS self.Xd_seq = self.PNS.Xd_seq self.Yd_seq = self.PNS.Yd_seq self.seq_inputs = 
self.Xd_seq + self.Yd_seq # shared var learning rate for generator and inferencer self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn') self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2') self.it_count = theano.shared(value=zero_ary, name='adp_it_count') # init parameters for controlling learning dynamics self.set_all_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv') self.set_lam_adv(lam_adv=1.0) # init shared vars for weighting a penalty on the norms of our learned # policies and a reward to encourage maximizing their entropy. self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld') self.set_lam_kld(lam_kld=0.1) # init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w') self.set_lam_l2w(1e-4) # Grab the full set of "optimizable" parameters from the generator # and inferencer networks that we'll be working with. self.in_params = [p for p in self.IN.mlp_params] self.gn_params = [p for p in self.GN.mlp_params] self.joint_params = self.in_params + self.gn_params ################################### # CONSTRUCT THE COSTS TO OPTIMIZE # ################################### self.adv_cost = self.lam_adv[0] * self._construct_adv_cost() self.kld_cost = self.lam_kld[0] * self._construct_kld_cost() self.other_reg_cost = self._construct_other_reg_cost() self.joint_cost = self.adv_cost + self.kld_cost + \ self.other_reg_cost # Get the gradient of the joint cost for all optimizable parameters self.joint_grads = OrderedDict() for p in self.joint_params: self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.1, 0.1) # Construct the updates for the generator and inferencer networks self.gn_updates = get_adam_updates(params=self.gn_params, \ grads=self.joint_grads, alpha=self.lr_gn, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8) self.in_updates = get_adam_updates(params=self.in_params, \ grads=self.joint_grads, alpha=self.lr_in, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8) self.joint_updates = OrderedDict() for k in self.gn_updates: self.joint_updates[k] = self.gn_updates[k] for k in self.in_updates: self.joint_updates[k] = self.in_updates[k] # Construct a function for jointly training the generator/inferencer self.train_joint = self._construct_train_joint() # Construct a function for computing the outputs of the generator # network for a batch of noise. Presumably, the noise will be drawn # from the same distribution that was used in training.... self.sample_from_gn = self.GN.sample_from_model self.sample_from_Xd = self._construct_sample_from_Xd() return
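# --- Illustrative sketch, not taken from the source above: how a 'grad_sign' or
# 'grad_dir'-style matching target can be built from d(cost)/d(input). The toy cost
# and the inline max-normalization below are assumptions standing in for the
# project's PNS.joint_cost and max_normalize() helpers.
import numpy as np
import theano
import theano.tensor as T

Xd = T.matrix('Xd')
W = theano.shared(np.random.randn(5, 3).astype(theano.config.floatX))
cost = T.sum(T.nnet.softmax(T.dot(Xd, W))[:, 0])   # toy scalar cost

grad_Xd = T.grad(cost, Xd)
sign_target = T.cast(grad_Xd > 0.0, theano.config.floatX)                     # 'grad_sign'
dir_target = grad_Xd / (T.max(abs(grad_Xd), axis=1, keepdims=True) + 1e-8)    # 'grad_dir'-style

f = theano.function([Xd], [sign_target, dir_target], allow_input_downcast=True)
s, d = f(np.random.randn(4, 5))
print(s.shape, d.shape)   # (4, 5) (4, 5)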
def __init__(self, num_actions): # remember parameters self.num_actions = num_actions self.batch_size = BATCH_SIZE self.discount_rate = DISCOUNT_RATE self.history_length = HISTORY_LENGTH self.screen_dim = DIMS self.img_height = SCREEN_HEIGHT self.img_width = SCREEN_WIDTH self.clip_error = CLIP_ERROR self.input_color_scale = COLOR_SCALE self.target_steps = TARGET_STEPS self.train_iterations = TRAIN_STEPS self.train_counter = 0 self.momentum = MOMENTUM self.update_rule = UPDATE_RULE self.learning_rate = LEARNING_RATE self.rms_decay = RMS_DECAY self.rms_epsilon = RMS_EPSILON self.rng = np.random.RandomState(RANDOM_SEED) # set seed lasagne.random.set_rng(self.rng) # prepare tensors once and reuse them states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') # terminals are bool for our case terminals = T.bcol('terminals') # create shared theano variables self.states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width), dtype=theano.config.floatX)) # !broadcast ? self.rewards_shared = theano.shared( np.zeros((self.batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((self.batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( #np.zeros((self.batch_size, 1), dtype='int32'), np.zeros((self.batch_size, 1), dtype='int8'), broadcastable=(False, True)) # can add multiple nets here self.l_primary = self.build_network() if self.target_steps > 0: self.l_secondary = self.build_network() self.copy_to_secondary() """ # input scale i.e. division can be applied to input directly also to normalize """ # define output symbols q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale) if self.target_steps > 0: q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale) else: # why this ? q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale) q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary) # target = r + max target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True)) """ # check what this does """ diff = target - q_vals[T.arange(self.batch_size), actions.reshape((-1,))].reshape((-1, 1)) # print shape ? if self.clip_error > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_error) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part else: loss = 0.5 * diff ** 2 loss = T.sum(loss) params = lasagne.layers.helper.get_all_params(self.l_primary) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } g_time = time.time() logger.info("graph compiling") if self.update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.learning_rate, self.rms_decay, self.rms_epsilon) elif self.update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.learning_rate, self.rms_decay, self.rms_epsilon) else: raise ValueError("Unrecognized update: {}".format(self.update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}) logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
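# --- A small standalone check (not part of the class above) of the clipped-error trick:
# splitting |diff| into a quadratic part inside the clip bound and a linear part outside
# keeps d(loss)/d(diff) equal to the clipped diff instead of dropping to zero past the bound.
import numpy as np

def clipped_loss(diff, clip):
    quadratic_part = np.minimum(np.abs(diff), clip)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip * linear_part

clip = 1.0
diff = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
eps = 1e-6
num_grad = (clipped_loss(diff + eps, clip) - clipped_loss(diff - eps, clip)) / (2 * eps)
print(num_grad)                      # ~[-1.0, -0.5, 0.0, 0.5, 1.0]
print(np.clip(diff, -clip, clip))    # matches the clipped diff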
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.gamma = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') #terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.terminals_shared = theano.shared( # np.zeros((batch_size, 1), dtype='int32'), # broadcastable=(False,True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + self.gamma * T.max( next_q_vals, axis=1, keepdims=True) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) if batch_accumulator == 'sum': loss = T.sum(diff**2) elif batch_accumulator == 'mean': loss = T.mean(diff**2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, #terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
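# --- A tiny numpy illustration (illustrative only) of the indexing used above to pick
# Q(s, a) for the chosen action out of the (batch_size, num_actions) network output,
# and of the one-step target r + gamma * max_a' Q(s', a').
import numpy as np

q_vals = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])            # (batch_size=2, num_actions=3)
actions = np.array([[2], [0]], dtype=np.int32)  # same (batch, 1) int32 column as T.icol
rewards = np.array([[1.0], [0.0]])
next_q = np.array([[0.5, 1.5, 1.0],
                   [2.0, 0.0, 0.5]])
gamma = 0.99

q_chosen = q_vals[np.arange(q_vals.shape[0]), actions.reshape(-1)].reshape(-1, 1)
target = rewards + gamma * np.max(next_q, axis=1, keepdims=True)
print(q_chosen.ravel())              # [3. 4.]
print((target - q_chosen).ravel())   # the diff fed into the loss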
def __init__(self, rng=None, Xd=None, \ g_net=None, i_net=None, pn_seq=None, \ data_dim=None, prior_dim=None, \ params=None): # setup a rng for this ADPair self.rng = RandStream(rng.randint(100000)) if (params is None): self.params = {} else: self.params = params if 'mean_transform' in self.params: # apply a user-defined transform to the GenNet output prior to # rescaling by self.lam_mnb... self.mean_transform = self.params['mean_transform'] else: # default transform is sigmoid -> shift -> scale so that # perturbations (for each dimension) are in range -1 --> 1. self.mean_transform = lambda x: 2.0 * (apply_sigmoid(x) - 0.5) # record the symbolic variables that will provide inputs to the # computation graph created to describe this ADPair self.Xd = Xd self.Yd = T.icol('adp_Yd') # labels to pass to the PeaNetSeq self.Xc = 0.0 * self.Xd self.Xm = 0.0 * self.Xd self.obs_count = T.cast(Xd.shape[0], 'floatX') # create a "shared-parameter" clone of the inferencer, set up to # receive input from the appropriate symbolic variables. self.IN = i_net.shared_param_clone(rng=rng, \ Xd=self.Xd, Xc=self.Xc, Xm=self.Xm) # capture a handle for samples from the variational posterior self.Xp = self.IN.output # create a "shared-parameter" clone of the generator, set up to # receive input from samples from the variational posterior self.GN = g_net.shared_param_clone(rng=rng, Xp=self.IN.output) assert(self.GN.out_type == 'gaussian') # check for right output # set up a var for controlling the max-norm bound on perturbations zero_ary = np.zeros((1,)).astype(theano.config.floatX) self.lam_mnb = theano.shared(value=zero_ary, \ name='adp_lam_mnb') self.set_lam_mnb(lam_mnb=0.1) # rescale the perturbations, to make them adjustably norm-bounded self.Xg = self.lam_mnb[0] * self.mean_transform(self.GN.output_mean) # record and validate the data dimensionality parameters self.data_dim = data_dim self.prior_dim = prior_dim # output of the generator and input to the inferencer should both be # equal to self.data_dim assert(self.data_dim == self.GN.mlp_layers[-1].out_dim) assert(self.data_dim == self.IN.shared_layers[0].in_dim) # input of the generator and mu/sigma outputs of the inferencer should # both be equal to self.prior_dim assert(self.prior_dim == self.GN.mlp_layers[0].in_dim) assert(self.prior_dim == self.IN.mu_layers[-1].out_dim) assert(self.prior_dim == self.IN.sigma_layers[-1].out_dim) # make a clone of the target PeaNetSeq that takes perturbed inputs self.PNS = pn_seq.shared_param_clone(rng=rng, seq_len=2, \ seq_Xd=[self.Xd, (self.Xd + self.Xg)]) # get the symbolic vars for passing inputs to self.PNS self.Xd_seq = self.PNS.Xd_seq self.Yd_seq = self.PNS.Yd_seq self.seq_inputs = self.Xd_seq + self.Yd_seq # shared var learning rate for generator and inferencer self.lr_gn = theano.shared(value=zero_ary, name='adp_lr_gn') self.lr_in = theano.shared(value=zero_ary, name='adp_lr_in') # shared var momentum parameters for generator and inferencer self.mom_1 = theano.shared(value=zero_ary, name='adp_mom_1') self.mom_2 = theano.shared(value=zero_ary, name='adp_mom_2') self.it_count = theano.shared(value=zero_ary, name='adp_it_count') # init parameters for controlling learning dynamics self.set_all_sgd_params() # init shared var for weighting nll of data given posterior sample self.lam_adv = theano.shared(value=zero_ary, name='adp_lam_adv') self.set_lam_adv(lam_adv=1.0) # init shared var for weighting Gaussian prior over the policy self.lam_kld = theano.shared(value=zero_ary, name='adp_lam_kld') self.set_lam_kld(lam_kld=1.0) 
# init shared var for controlling l2 regularization on params self.lam_l2w = theano.shared(value=zero_ary, name='adp_lam_l2w') self.set_lam_l2w(1e-4) # Grab the full set of "optimizable" parameters from the generator # and inferencer networks that we'll be working with. self.in_params = [p for p in self.IN.mlp_params] self.gn_params = [p for p in self.GN.mlp_params] self.joint_params = self.in_params + self.gn_params ################################### # CONSTRUCT THE COSTS TO OPTIMIZE # ################################### self.adv_cost = self.lam_adv[0] * self._construct_adv_cost() self.kld_cost = self.lam_kld[0] * self._construct_kld_cost() self.other_reg_cost = self._construct_other_reg_cost() self.joint_cost = self.adv_cost + self.kld_cost + \ self.other_reg_cost # Get the gradient of the joint cost for all optimizable parameters self.joint_grads = OrderedDict() for p in self.joint_params: self.joint_grads[p] = T.grad(self.joint_cost, p).clip(-0.05, 0.05) # Construct the updates for the generator and inferencer networks self.gn_updates = get_adam_updates(params=self.gn_params, \ grads=self.joint_grads, alpha=self.lr_gn, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8) self.in_updates = get_adam_updates(params=self.in_params, \ grads=self.joint_grads, alpha=self.lr_in, \ beta1=self.mom_1, beta2=self.mom_2, it_count=self.it_count, \ mom2_init=1e-3, smoothing=1e-8) self.joint_updates = OrderedDict() for k in self.gn_updates: self.joint_updates[k] = self.gn_updates[k] for k in self.in_updates: self.joint_updates[k] = self.in_updates[k] # Construct a function for jointly training the generator/inferencer self.train_joint = self._construct_train_joint() # Construct a function for computing the outputs of the generator # network for a batch of noise. Presumably, the noise will be drawn # from the same distribution that was used in training.... self.sample_from_gn = self.GN.sample_from_model self.sample_from_Xd = self._construct_sample_from_Xd() return
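# --- A quick check (illustrative, not from the source) that the default mean_transform,
# 2.0 * (sigmoid(x) - 0.5), maps any real activation into (-1, 1), so rescaling by
# lam_mnb bounds every perturbation dimension to (-lam_mnb, lam_mnb). apply_sigmoid
# below is a stand-in for the project's helper of the same name.
import numpy as np

def apply_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

lam_mnb = 0.1
x = np.linspace(-50.0, 50.0, 7)
perturb = lam_mnb * 2.0 * (apply_sigmoid(x) - 0.5)
print(perturb.min(), perturb.max())   # stays strictly inside (-0.1, 0.1)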
def __init__(self, environment, rho=0.9, rms_epsilon=0.0001, momentum=0, clip_delta=0, freeze_interval=1000, batch_size=32, network_type=None, update_rule="rmsprop", batch_accumulator="sum", random_state=np.random.RandomState(), double_Q=False, neural_network=NN): """ Initialize environment """ QNetwork.__init__(self,environment, batch_size) self._rho = rho self._rms_epsilon = rms_epsilon self._momentum = momentum self._clip_delta = clip_delta self._freeze_interval = freeze_interval self._double_Q = double_Q self._random_state = random_state self.update_counter = 0 states=[] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states=[] # idem than states at t+1 self.states_shared=[] # list of shared variable for each of the k element in the belief state self.next_states_shared=[] # idem that self.states_shared at t+1 for i, dim in enumerate(self._input_dimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) Q_net=neural_network(self._batch_size, self._input_dimensions, self._n_actions, self._random_state) self.q_vals, self.params, shape_after_conv = Q_net._buildDQN(states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_q_vals, self.next_params, shape_after_conv = Q_net._buildDQN(next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) if(self._double_Q==True): givens_next={} for i, x in enumerate(self.next_states_shared): givens_next[ states[i] ] = x self.next_q_vals_current_qnet=theano.function([], self.q_vals, givens=givens_next) next_q_curr_qnet = theano.clone(self.next_q_vals) argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True) max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1)) else: max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True) not_terminals=T.ones_like(terminals) - terminals target = rewards + not_terminals * thediscount * max_next_q_vals q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) # Note : Strangely (target - q_val) lead to problems with python 3.5, theano 0.8.0rc and floatX=float32... 
diff = - q_val + target if self._clip_delta > 0: # This loss function implementation is taken from # https://github.com/spragunr/deep_q_rl # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self._clip_delta) linear_part = abs(diff) - quadratic_part loss_ind = 0.5 * quadratic_part ** 2 + self._clip_delta * linear_part else: loss_ind = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss_ind) elif batch_accumulator == 'mean': loss = T.mean(loss_ind) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x gparams=[] for p in self.params: gparam = T.grad(loss, p) gparams.append(gparam) updates = [] if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self._rho, self._rms_epsilon) elif update_rule == 'rmsprop': for i,(p, g) in enumerate(zip(self.params, gparams)): acc = theano.shared(p.get_value() * 0.) acc_new = rho * acc + (1 - self._rho) * g ** 2 gradient_scaling = T.sqrt(acc_new + self._rms_epsilon) g = g / gradient_scaling updates.append((acc, acc_new)) updates.append((p, p - thelr * g)) elif update_rule == 'sgd': for i, (param, gparam) in enumerate(zip(self.params, gparams)): updates.append((param, param - thelr * gparam)) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if(self._double_Q==True): self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') else: self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], self.q_vals, givens=givens2, on_unused_input='warn')
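# --- A small numpy sketch (illustrative only) of the double-Q target built above:
# the online network picks argmax_a' Q(s', a'), the frozen/target network supplies the
# value of that action, and terminal transitions drop the bootstrap term.
import numpy as np

q_next_online = np.array([[1.0, 3.0, 2.0],
                          [0.5, 0.2, 0.1]])   # Q(s', .) from the online net
q_next_target = np.array([[0.9, 1.1, 4.0],
                          [0.3, 0.6, 0.2]])   # Q(s', .) from the frozen net
rewards = np.array([[1.0], [2.0]])
terminals = np.array([[0], [1]])
gamma = 0.99

argmax_a = np.argmax(q_next_online, axis=1)
max_next_q = q_next_target[np.arange(2), argmax_a].reshape(-1, 1)
target = rewards + (1 - terminals) * gamma * max_next_q
print(target.ravel())   # [1 + 0.99*1.1, 2.0]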
def __init__(self, num_actions, phi_length, width, height, discount=.9, learning_rate=.01, batch_size=32, approximator='none'): self._batch_size = batch_size self._num_input_features = phi_length self._phi_length = phi_length self._img_width = width self._img_height = height self._discount = discount self.num_actions = num_actions self.learning_rate = learning_rate self.scale_input_by = 255.0 print "neural net initialization, lr is: ", self.learning_rate, approximator # CONSTRUCT THE LAYERS self.q_layers = [] self.q_layers.append( layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_height, self._img_width, self.scale_input_by)) if approximator == 'cuda_conv': self.q_layers.append( cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1])) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=16, filter_size=8, stride=4, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1], n_filters=32, filter_size=4, stride=2, weights_std=.01, init_bias_value=0.1)) self.q_layers.append( cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1])) elif approximator == 'conv': self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=16, filter_width=8, filter_height=8, stride_x=4, stride_y=4, weights_std=.01, init_bias_value=0.01)) self.q_layers.append( layers.StridedConv2DLayer(self.q_layers[-1], n_filters=32, filter_width=4, filter_height=4, stride_x=2, stride_y=2, weights_std=.01, init_bias_value=0.01)) if approximator == 'cuda_conv' or approximator == 'conv': self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=256, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.rectify)) self.q_layers.append( layers.DenseLayer(self.q_layers[-1], n_outputs=num_actions, weights_std=0.01, init_bias_value=0.1, dropout=0, nonlinearity=layers.identity)) if approximator == 'none': self.q_layers.append(\ layers.DenseLayerNoBias(self.q_layers[-1], n_outputs=num_actions, weights_std=0.00, dropout=0, nonlinearity=layers.identity)) self.q_layers.append(layers.OutputLayer(self.q_layers[-1])) for i in range(len(self.q_layers) - 1): print self.q_layers[i].get_output_shape() # Now create a network (using the same weights) # for next state q values self.next_layers = copy_layers(self.q_layers) self.next_layers[0] = layers.Input2DLayer(self._batch_size, self._num_input_features, self._img_width, self._img_height, self.scale_input_by) self.next_layers[1].input_layer = self.next_layers[0] self.rewards = T.col() self.actions = T.icol() # Build the loss function ... print "building loss function" q_vals = self.q_layers[-1].predictions() next_q_vals = self.next_layers[-1].predictions() next_maxes = T.max(next_q_vals, axis=1, keepdims=True) target = self.rewards + discount * next_maxes target = theano.gradient.consider_constant(target) diff = target - q_vals # Zero out all entries for actions that were not chosen...
mask = build_mask(T.zeros_like(diff), self.actions, 1.0) diff_masked = diff * mask error = T.mean(diff_masked**2) self._loss = error * diff_masked.shape[1] self._parameters = layers.all_parameters(self.q_layers[-1]) # needed by the rmsprop updates below self._idx = T.lscalar('idx') # CREATE VARIABLES FOR INPUT AND OUTPUT self.states_shared = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.states_shared_next = theano.shared( np.zeros((1, 1, 1, 1), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (1, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'), broadcastable=(False, True)) self._givens = \ {self.q_layers[0].input_var: self.states_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.next_layers[0].input_var: self.states_shared_next[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :, :, :], self.rewards: self.rewards_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :], self.actions: self.actions_shared[self._idx*self._batch_size: (self._idx+1)*self._batch_size, :] } self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\ self._loss, self._parameters, learning_rate=self.learning_rate, rho=0.9, momentum=0.9, epsilon=1e-6) self._train = theano.function([self._idx], self._loss, givens=self._givens, updates=self._updates) self._compute_loss = theano.function([self._idx], self._loss, givens=self._givens) self._compute_q_vals = \ theano.function([self.q_layers[0].input_var], self.q_layers[-1].predictions(), on_unused_input='ignore')
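# --- An illustrative numpy stand-in for what build_mask() is used for above: zero out
# the TD error for every action that was not taken, so only the chosen action's Q-value
# receives gradient. build_mask itself is defined elsewhere in the original project;
# this is an assumption about its behaviour, not its actual implementation.
import numpy as np

diff = np.array([[0.3, -1.2, 0.7],
                 [0.1, 0.4, -0.9]])            # (batch, num_actions) TD errors
actions = np.array([[1], [2]], dtype=np.int32)

mask = np.zeros_like(diff)
mask[np.arange(diff.shape[0]), actions.reshape(-1)] = 1.0
diff_masked = diff * mask
print(diff_masked)
# [[ 0.  -1.2  0. ]
#  [ 0.   0.  -0.9]]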
def manifold_walk_regularization(): for t_num in range(10): out_file = open("MWR_TEST_RESULTS_{0:d}.txt".format(t_num), 'wb') # Initialize a source of randomness rng = np.random.RandomState(t_num) # Load some data to train/validate/test with sup_count = 600 dataset = 'data/mnist.pkl.gz' datasets = load_udm_ss(dataset, sup_count, rng, zero_mean=False) Xtr_su = datasets[0][0].get_value(borrow=False) Ytr_su = datasets[0][1].get_value(borrow=False).astype(np.int32) Xtr_un = datasets[1][0].get_value(borrow=False) Ytr_un = datasets[1][1].get_value(borrow=False).astype(np.int32) # get the joint labeled and unlabeled data Xtr_un = np.vstack([Xtr_su, Xtr_un]).astype(theano.config.floatX) Ytr_un = np.vstack([Ytr_su[:,np.newaxis], Ytr_un[:,np.newaxis]]) Ytr_un = 0 * Ytr_un # KEEP CATS FIXED OR FREE? YES/NO? Xtr_mean = np.mean(Xtr_un, axis=0, keepdims=True) # get the labeled data Xtr_su = Xtr_su.astype(theano.config.floatX) Ytr_su = Ytr_su[:,np.newaxis] # get observations and labels for the validation set Xva = datasets[2][0].get_value(borrow=False).astype(theano.config.floatX) Yva = datasets[2][1].get_value(borrow=False).astype(np.int32) Yva = Yva[:,np.newaxis] # numpy is dumb # get observations and labels for the test set Xte = datasets[3][0].get_value(borrow=False).astype(theano.config.floatX) Yte = datasets[3][1].get_value(borrow=False).astype(np.int32) Yte = Yte[:,np.newaxis] # numpy is dumb # get size information for the data and training batches un_samples = Xtr_un.shape[0] su_samples = Xtr_su.shape[0] va_samples = Xva.shape[0] data_dim = Xtr_su.shape[1] label_dim = 10 batch_size = 100 # Symbolic inputs Xd = T.matrix(name='Xd') Xc = T.matrix(name='Xc') Xm = T.matrix(name='Xm') Xt = T.matrix(name='Xt') Xp = T.matrix(name='Xp') Yd = T.icol('Yd') # Load inferencer and generator from saved parameters gn_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_GN.pkl" in_fname = "MNIST_WALKOUT_TEST_BIN/pt_walk_params_b150000_IN.pkl" IN = INet.load_infnet_from_file(f_name=in_fname, rng=rng, Xd=Xd) GN = GNet.load_gennet_from_file(f_name=gn_fname, rng=rng, Xp=Xp) IN.set_sigma_scale(1.3) prior_dim = GN.latent_dim MCS = MCSampler(rng=rng, Xd=Xd, i_net=IN, g_net=GN, chain_len=2, \ data_dim=data_dim, prior_dim=prior_dim) full_chain_len = MCS.chain_len + 1 # setup "chain" versions of the labeled/unlabeled/validate sets Xtr_su_chains = [Xtr_su.copy() for i in range(full_chain_len)] Xtr_un_chains = [Xtr_un.copy() for i in range(full_chain_len)] Ytr_su_chains = [Ytr_su for i in range(full_chain_len)] Ytr_un_chains = [Ytr_un for i in range(full_chain_len)] Xva_chains = [Xva for i in range(full_chain_len)] Yva_chains = [Yva for i in range(full_chain_len)] # downsample, to feed less into the PNS Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1) Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1) Ytr_su_short = downsample_chains(Ytr_su_chains, stride=1) Ytr_un_short = downsample_chains(Ytr_un_chains, stride=1) Xva_short = downsample_chains(Xva_chains, stride=1) Yva_short = downsample_chains(Yva_chains, stride=1) short_chain_len = len(Xtr_su_short) print("REGULARIZATION CHAIN STEPS: {0:d}".format(short_chain_len)) # choose some parameters for the categorical inferencer pn_params = {} pc0 = [data_dim, 800, 800, label_dim] pn_params['proto_configs'] = [pc0] # Set up some spawn networks sc0 = {'proto_key': 0, 'input_noise': 0.1, 'bias_noise': 0.1, 'do_dropout': True} pn_params['spawn_configs'] = [ sc0 ] pn_params['spawn_weights'] = [ 1.0 ] # Set remaining params pn_params['activation'] = relu_actfun 
pn_params['init_scale'] = 0.5 pn_params['lam_l2a'] = 1e-3 pn_params['vis_drop'] = 0.2 pn_params['hid_drop'] = 0.5 # Initialize the base network for this PNSeq PN = PeaNet(rng=rng, Xd=Xd, params=pn_params) PN.init_biases(0.1) print("Initializing PNS...") # Initialize the PeaNetSeq PNS = PeaNetSeq(rng=rng, pea_net=PN, seq_len=short_chain_len, \ seq_Xd=None, params=None) # set weighting parameters for the various costs... PNS.set_lam_class(1.0) PNS.set_lam_pea_su(0.0) PNS.set_lam_pea_un(2.0) PNS.set_lam_ent(0.0) PNS.set_lam_l2w(1e-5) learn_rate = 0.05 PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) for i in range(300000): if i < 5000: scale = float(i + 1) / 5000.0 if (((i+1) % 100000) == 0): learn_rate = learn_rate * 0.5 if ((i % 250) == 0): Xtr_su_chains = resample_chain_steps(MCS, Xtr_su_chains) Xtr_un_chains = resample_chain_steps(MCS, Xtr_un_chains) Xtr_su_short = downsample_chains(Xtr_su_chains, stride=1) Xtr_un_short = downsample_chains(Xtr_un_chains, stride=1) # get some data to train with su_idx = npr.randint(low=0,high=su_samples,size=(batch_size,)) xsuc = [(x.take(su_idx, axis=0) - Xtr_mean) for x in Xtr_su_short] ysuc = [y.take(su_idx, axis=0) for y in Ytr_su_short] un_idx = npr.randint(low=0,high=un_samples,size=(batch_size,)) xunc = [(x.take(un_idx, axis=0) - Xtr_mean) for x in Xtr_un_short] yunc = [y.take(un_idx, axis=0) for y in Ytr_un_short] Xb_chains = [np.vstack((xsu, xun)) for (xsu, xun) in zip(xsuc, xunc)] Yb_chains = [np.vstack((ysu, yun)) for (ysu, yun) in zip(ysuc, yunc)] # set learning parameters for this update PNS.set_pn_sgd_params(lr_pn=learn_rate, mom_1=0.9, mom_2=0.999) # do a minibatch update of all PeaNet parameters outputs = PNS.train_joint(*(Xb_chains + Yb_chains)) joint_cost = 1.0 * outputs[0] class_cost = 1.0 * outputs[1] pea_cost = 1.0 * outputs[2] ent_cost = 1.0 * outputs[3] other_reg_cost = 1.0 * outputs[4] assert(not (np.isnan(joint_cost))) if ((i % 500) == 0): o_str = "batch: {0:d}, joint: {1:.4f}, class: {2:.4f}, pea: {3:.4f}, ent: {4:.4f}, other_reg: {5:.4f}".format( \ i, joint_cost, class_cost, pea_cost, ent_cost, other_reg_cost) print(o_str) out_file.write(o_str+"\n") out_file.flush() # check classification error on training and validation set train_err = PNS.classification_error(Xtr_su-Xtr_mean, Ytr_su) va_err = PNS.classification_error(Xva-Xtr_mean, Yva) o_str = "    tr_err: {0:.4f}, va_err: {1:.4f}".format(train_err, va_err) print(o_str) out_file.write(o_str+"\n") out_file.flush() if ((i % 1000) == 0): # draw the main PeaNet's first-layer filters/weights file_name = "MWR_PN_WEIGHTS.png".format(i) utils.visualize_net_layer(PNS.PN.proto_nets[0][0], file_name) print("TESTING COMPLETE!")
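# --- A minimal sketch (toy data only, not from the source) of why the label arrays above
# are reshaped to (N, 1) and cast to int32: a T.icol input only accepts an int32 column,
# so the 1-D int64 label vectors produced by the loader must be reshaped and cast before
# being fed to a compiled function.
import numpy as np
import theano
import theano.tensor as T

Yd = T.icol('Yd')
count_class_zero = theano.function([Yd], T.sum(T.eq(Yd, 0)))  # toy use of the labels

labels = np.array([0, 1, 0, 2])                    # typical loader output: 1-D, int64
labels_col = labels[:, np.newaxis].astype(np.int32)
print(count_class_zero(labels_col))                # 2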
def __init__(self, input, n_in, n_out): hidden_size = 36 batch_size = 32 self._w_h = init_weights((n_in, hidden_size)) self._b_h = init_b_weights((1, hidden_size)) # self._b_h = init_b_weights((hidden_size,)) self._w_h2 = init_weights((hidden_size, hidden_size)) self._b_h2 = init_b_weights((1, hidden_size)) # self._b_h2 = init_b_weights((hidden_size,)) # self._w_o = init_tanh(hidden_size, n_out) self._w_o = init_weights((hidden_size, n_out)) self._b_o = init_b_weights((1, n_out)) # self._b_o = init_b_weights((n_out,)) self.updateTargetModel() self._w_h_old = init_weights((n_in, hidden_size)) self._w_h2_old = init_weights((hidden_size, hidden_size)) self._w_o_old = init_tanh(hidden_size, n_out) # print ("Initial W " + str(self._w_o.get_value()) ) self._learning_rate = 0.00025 self._discount_factor = 0.99 self._weight_update_steps = 5000 self._updates = 0 # data types for model State = T.dmatrix("State") State.tag.test_value = np.random.rand(batch_size, 2) ResultState = T.dmatrix("ResultState") ResultState.tag.test_value = np.random.rand(batch_size, 2) Reward = T.col("Reward") Reward.tag.test_value = np.random.rand(batch_size, 1) Action = T.icol("Action") Action.tag.test_value = np.zeros((batch_size, 1), dtype=np.dtype('int32')) # Q_val = T.fmatrix() # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1))) # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True) py_x = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0) y_pred = T.argmax(py_x, axis=1) q_func = T.mean((self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.0, 0.0))[T.arange(batch_size), Action.reshape((-1, ))].reshape( (-1, 1))) # q_val = py_x # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5) # L1 norm ; one regularization option is to enforce L1 norm to # be small self._L1 = (abs(self._w_h).sum() + abs(self._w_h2).sum() + abs(self._w_o).sum()) self._L1_reg = 0.0 self._L2_reg = 0.001 # L2 norm ; one regularization option is to enforce # L2 norm to be small self._L2 = ((self._w_h**2).sum() + (self._w_h2**2).sum() + (self._w_o**2).sum()) # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y)) # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State)) delta = ((Reward + (self._discount_factor * T.max(self.model( ResultState, self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5), axis=1, keepdims=True))) - (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5))[T.arange(Action.shape[0]), Action.reshape((-1, ))].reshape((-1, 1))) # bellman_cost = T.mean( 0.5 * ((delta) ** 2 )) bellman_cost = T.mean(0.5 * ((delta)**2)) + ( self._L2_reg * self._L2) + (self._L1_reg * self._L1) params = [ self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o ] # updates = sgd(bellman_cost, params, lr=self._learning_rate) # updates = rlTDSGD(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = RMSprop(bellman_cost, params, lr=self._learning_rate) # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate) # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01) updates = lasagne.updates.rmsprop(q_func, params, self._learning_rate * -T.mean(delta), 0.95, 0.01) self._train = theano.function( inputs=[State, Action, 
Reward, ResultState], outputs=bellman_cost, updates=updates, allow_input_downcast=True) self._predict = theano.function(inputs=[State], outputs=y_pred, allow_input_downcast=True) self._q_values = theano.function(inputs=[State], outputs=py_x, allow_input_downcast=True) self._bellman_error = theano.function( inputs=[State, Action, Reward, ResultState], outputs=delta, allow_input_downcast=True)
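# --- A hedged sketch of the idea behind scaling the learning rate by -T.mean(delta)
# above: semi-gradient TD(0) moves the parameters along +dQ(s,a)/dtheta in proportion
# to the TD error. This standalone version uses plain SGD on a linear Q function; the
# class above instead routes the same signal through lasagne's rmsprop, which also
# rescales by running gradient statistics, so the two are not exactly equivalent.
import numpy as np
import theano
import theano.tensor as T

theta = theano.shared(np.zeros(4, dtype=theano.config.floatX))
phi_s = T.vector('phi_s')        # features of the current (state, action)
phi_next = T.vector('phi_next')  # features of the greedy next (state, action)
reward = T.scalar('reward')
gamma, lr = 0.99, 0.1

q_sa = T.dot(theta, phi_s)
delta = reward + gamma * T.dot(theta, phi_next) - q_sa
updates = [(theta, theta + lr * delta * T.grad(q_sa, theta))]
td_step = theano.function([phi_s, phi_next, reward], delta,
                          updates=updates, allow_input_downcast=True)
print(td_step(np.ones(4), np.zeros(4), 1.0))   # delta = 1.0 on the first call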
def __init__(self, model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_): super(CACLA2, self).__init__(model, n_in, n_out, state_bounds, action_bounds, reward_bound, settings_) # create a small convolutional neural network self._Fallen = T.icol("Action") self._Fallen.tag.test_value = np.zeros((self._batch_size, 1), dtype=np.dtype('int32')) self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='int32'), broadcastable=(False, True)) self._target_shared = theano.shared(np.zeros((self._batch_size, 1), dtype='float64'), broadcastable=(False, True)) self._critic_regularization_weight = self.getSettings( )["critic_regularization_weight"] self._critic_learning_rate = self.getSettings()["critic_learning_rate"] # primary network self._model = model # Target network # self._modelTarget = copy.deepcopy(model) self._q_valsA = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) self._q_valsA_drop = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsTargetNextState = lasagne.layers.get_output( self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable()) # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable()) # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_valsActA = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=True) # self._q_valsActTarget = lasagne.layers.get_output(self._modelTarget.getActorNetwork(), self._model.getStateSymbolicVariable()) self._q_valsActA_drop = lasagne.layers.get_output( self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False) self._q_func = self._q_valsA # self._q_funcTarget = self._q_valsTarget self._q_func_drop = self._q_valsA_drop # self._q_funcTarget_drop = self._q_valsTarget_drop self._q_funcAct = self._q_valsActA self._q_funcAct_drop = self._q_valsActA_drop # self._q_funcAct = theano.function(inputs=[self._model.getStateSymbolicVariable()], outputs=self._q_valsActA, allow_input_downcast=True) # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.maximum(1.0, theano.tensor.ceil(self._model.getRewardSymbolicVariable())) # Did not understand how the maximum was working # self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState )) * theano.tensor.ceil(self._model.getRewardSymbolicVariable()) self._target = (self._model.getRewardSymbolicVariable() + (self._discount_factor * self._q_valsTargetNextState)) * self._Fallen # self._target = self._model.getTargetSymbolicVariable() self._diff = self._target_shared - self._q_func self._diff_drop = self._target_shared - self._q_func_drop loss = 0.5 * self._diff**2 self._loss = T.mean(loss) self._loss_drop = T.mean(0.5 * self._diff_drop**2) self._params = lasagne.layers.helper.get_all_params( self._model.getCriticNetwork()) self._actionParams = lasagne.layers.helper.get_all_params( self._model.getActorNetwork()) self._givens_ = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): 
self._model.getRewards(), # self._Fallen: self._fallen_shared # self._model.getActionSymbolicVariable(): self._actions_shared, } self._actGivens = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._next_states_shared, # self._model.getRewardSymbolicVariable(): self._rewards_shared, self._model.getActionSymbolicVariable(): self._model.getActions() } self._critic_regularization = ( self._critic_regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)) self._actor_regularization = ( self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getActorNetwork(), lasagne.regularization.l2)) # SGD update self._updates_ = lasagne.updates.rmsprop( self._loss + (self._regularization_weight * lasagne.regularization.regularize_network_params( self._model.getCriticNetwork(), lasagne.regularization.l2)), self._params, self._learning_rate, self._rho, self._rms_epsilon) # TD update # self._updates_ = lasagne.updates.rmsprop(T.mean(self._q_func) + self._critic_regularization, self._params, # self._critic_learning_rate * -T.mean(self._diff), self._rho, self._rms_epsilon) # actDiff1 = (self._model.getActionSymbolicVariable() - self._q_valsActTarget) #TODO is this correct? # actDiff = (actDiff1 - (self._model.getActionSymbolicVariable() - self._q_valsActA)) self._actDiff = ( (self._model.getActionSymbolicVariable() - self._q_valsActA) ) # Target network does not work well here? self._actDiff_drop = ( (self._model.getActionSymbolicVariable() - self._q_valsActA_drop) ) # Target network does not work well here? self._actLoss = 0.5 * self._actDiff**2 self._actLoss = T.sum(self._actLoss) / float(self._batch_size) self._actLoss_drop = (T.sum(0.5 * self._actDiff_drop**2) / float(self._batch_size) ) # because the number of rows can shrink self._actionUpdates = lasagne.updates.rmsprop( self._actLoss + self._actor_regularization, self._actionParams, self._learning_rate, self._rho, self._rms_epsilon) # actionUpdates = lasagne.updates.rmsprop(T.mean(self._q_funcAct_drop) + # (self._regularization_weight * lasagne.regularization.regularize_network_params( # self._model.getActorNetwork(), lasagne.regularization.l2)), actionParams, # self._learning_rate * 0.5 * (-T.sum(actDiff_drop)/float(self._batch_size)), self._rho, self._rms_epsilon) self._givens_grad = { self._model.getStateSymbolicVariable(): self._model.getStates(), # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(), # self._model.getRewardSymbolicVariable(): self._model.getRewards(), # self._model.getActionSymbolicVariable(): self._actions_shared, } CACLA2.compile(self)
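# --- A small numpy illustration (semantics assumed, not confirmed by the source) of the
# critic target above: multiplying (r + gamma * V(s')) by a 0/1 "not fallen" column
# zeroes the bootstrapped target for transitions that ended in a fall, while the actor
# loss simply regresses the policy mean toward the action that was executed.
import numpy as np

rewards = np.array([[1.0], [1.0]])
v_next = np.array([[10.0], [10.0]])
not_fallen = np.array([[1], [0]], dtype=np.int32)   # second sample ended in a fall
gamma = 0.99

critic_target = (rewards + gamma * v_next) * not_fallen
print(critic_target.ravel())        # [10.9, 0.0]

actions_taken = np.array([[0.2, -0.1], [0.5, 0.3]])
policy_mean = np.array([[0.0, 0.0], [0.4, 0.4]])
act_diff = actions_taken - policy_mean
act_loss = np.sum(0.5 * act_diff ** 2) / actions_taken.shape[0]
print(act_loss)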
def __init__(self, input_size, output_size, build_network=simple_network2, discount=0.99, learningRate=0.001, frozen_network_update_time=1000): print "Initializing new Q network" self.input_size = input_size self.output_size = output_size self.discount = discount self.learningRate = learningRate self.frozen_network_update_time = frozen_network_update_time self.frozen_timer = 0 self.epoch = 0 # logging variables self.log = { "batchMeanQValue": [], "batchMeanTargetQValue": [], "cost": [], 'performance': [], 'epoch': [] } # symbolic inputs sym_state = T.tensor4('state') #Batchsize, channels, X, Y sym_action = T.icol('action') sym_reward = T.col('reward') sym_isDone = T.bcol('isDone') sym_nextState = T.tensor4('nextState') # networks self.network = build_network(input_size, output_size) self.frozen_network = build_network(input_size, output_size) self.update_frozen_network() # forward pass print "Compiling forward passes" self.forward_pass = theano.function([sym_state], lasagne.layers.get_output( self.network, sym_state, deterministic=True)) self.frozen_forward_pass = theano.function([sym_state], lasagne.layers.get_output( self.frozen_network, sym_state, deterministic=True)) #clipped_reward = T.clip(sym_reward,-1,1) #cost function definition cost, error, q_action, q_target = self.build_cost_function( sym_state, sym_action, sym_reward, sym_isDone, sym_nextState) params = lasagne.layers.get_all_params(self.network, trainable=True) update_function = lasagne.updates.rmsprop( cost, params, learning_rate=self.learningRate) # training function print "Compiling training function" self._train = theano.function( [sym_state, sym_action, sym_reward, sym_isDone, sym_nextState], [cost, error, q_action, q_target], updates=update_function)
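# --- A minimal, self-contained sketch (the networks are toy stand-ins, not the project's
# simple_network2, and update_frozen_network here is hypothetical) of the frozen-network
# pattern used above: periodically copy the online network's parameters into the frozen
# network that produces the Q targets.
import numpy as np
import lasagne
import theano.tensor as T

def toy_network(input_var, output_size):
    l_in = lasagne.layers.InputLayer((None, 4), input_var=input_var)
    return lasagne.layers.DenseLayer(l_in, output_size,
                                     nonlinearity=lasagne.nonlinearities.identity)

sym_state = T.matrix('state')
network = toy_network(sym_state, 2)
frozen_network = toy_network(sym_state, 2)

def update_frozen_network(src, dst):
    # overwrite every parameter of dst with the current values from src
    lasagne.layers.set_all_param_values(dst, lasagne.layers.get_all_param_values(src))

update_frozen_network(network, frozen_network)
same = [np.allclose(a, b)
        for a, b in zip(lasagne.layers.get_all_param_values(network),
                        lasagne.layers.get_all_param_values(frozen_network))]
print(all(same))   # True once the copy has run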