def __init__(self, input_width, input_height, num_actions, num_frames,
             discount, learning_rate, rho, rms_epsilon, momentum, clip_delta,
             freeze_interval, batch_size, update_rule, batch_accumulator,
             state_count, input_scale=255.0):
    self.state_count = state_count
    self.input_width = input_width
    self.input_height = input_height
    self.num_actions = num_actions
    self.num_frames = num_frames
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.update_counter = 0

    self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                               num_actions, num_frames,
                                               batch_size)
    if self.freeze_interval > 0:
        self.next_l_out = self.build_nature_network_dnn(input_width,
                                                        input_height,
                                                        num_actions,
                                                        num_frames,
                                                        batch_size)
        self.reset_q_hat()

    states = T.matrix('states')
    next_states = T.matrix('next_states')
    rewards = T.col('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # Buffer of input states for the whole batch.
    self.states_shared = theano.shared(
        np.zeros((batch_size, state_count), dtype=theano.config.floatX))

    # Buffer of the states each transition ends up in.
    self.next_states_shared = theano.shared(
        np.zeros((batch_size, state_count), dtype=theano.config.floatX))

    # One reward per transition -- what about the individual actions?
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))

    # One chosen action per transition.
    self.actions_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    # Probably 0 or 1: whether this is the last transition of an episode.
    self.terminals_shared = theano.shared(
        np.zeros((batch_size, 1), dtype='int32'),
        broadcastable=(False, True))

    # Take the Q-values and next Q-values and form the TD differences for
    # the batch; this symbolic graph is built only once.
    q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

    if self.freeze_interval > 0:
        next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                next_states / input_scale)
    else:
        next_q_vals = lasagne.layers.get_output(self.l_out,
                                                next_states / input_scale)
        next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    target = (rewards +
              (T.ones_like(terminals) - terminals) *
              self.discount * T.max(next_q_vals, axis=1, keepdims=True))
    diff = target - q_vals[T.arange(batch_size),
                           actions.reshape((-1,))].reshape((-1, 1))

    # unclear
    if self.clip_delta > 0:
        diff = diff.clip(-self.clip_delta, self.clip_delta)

    if batch_accumulator == 'sum':
        loss = T.sum(diff ** 2)
    elif batch_accumulator == 'mean':
        loss = T.mean(diff ** 2)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    params = lasagne.layers.helper.get_all_params(self.l_out)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }

    if update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                   self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                          self.rms_epsilon)
    elif update_rule == 'adam':
        # Note: rho is reused here for both beta1 and beta2.
        updates = lasagne.updates.adam(loss, params, self.lr, self.rho,
                                       self.rho, self.rms_epsilon)
    elif update_rule == 'adagrad':
        updates = lasagne.updates.adagrad(loss, params, self.lr,
                                          self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, self.lr)
    elif update_rule == 'momentum':
        updates = lasagne.updates.momentum(loss, params, self.lr,
                                           self.momentum)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=givens)
    self._q_vals = theano.function([], q_vals,
                                   givens={states: self.states_shared})
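# Hedged sketch, added for illustration and not taken from the original
# files: one common way the `deepmind_rmsprop` update dispatched above is
# implemented (the Graves-style RMSProp used by the Nature DQN code, keeping
# running means of the gradient and of its square).  The real helper lives
# elsewhere in the repository and may differ in details such as where the
# epsilon is added; the name below is deliberately distinct.
import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict


def deepmind_rmsprop_sketch(loss, params, learning_rate, rho, epsilon):
    grads = theano.grad(loss, params)
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # Running averages of the gradient and of the squared gradient.
        acc_grad = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                 broadcastable=param.broadcastable)
        acc_sqr = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                broadcastable=param.broadcastable)
        new_acc_grad = rho * acc_grad + (1 - rho) * grad
        new_acc_sqr = rho * acc_sqr + (1 - rho) * grad ** 2
        updates[acc_grad] = new_acc_grad
        updates[acc_sqr] = new_acc_sqr
        # Normalise by a variance-like term, as in Graves (2013).
        rms = T.sqrt(new_acc_sqr - new_acc_grad ** 2 + epsilon)
        updates[param] = param - learning_rate * grad / rms
    return updates
# A function with this shape could be passed the same arguments as the
# deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) call
# in the constructor above.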
def __init__(self, stateSize, actionSize, numFrames, batchSize, discount,
             rho, momentum, learningRate, rmsEpsilon, rng, updateRule,
             batchAccumulator, freezeInterval):
    self.stateSize = stateSize
    self.actionSize = actionSize
    self.numFrames = numFrames
    self.batchSize = batchSize
    self.discount = discount
    self.rho = rho
    self.momentum = momentum
    self.learningRate = learningRate
    self.rmsEpsilon = rmsEpsilon
    self.rng = rng
    self.updateRule = updateRule
    self.batchAccumulator = batchAccumulator
    self.freezeInterval = freezeInterval

    lasagne.random.set_rng(self.rng)
    self.updateCounter = 0

    self.lOut = self.buildNetwork(self.stateSize, self.actionSize,
                                  self.numFrames, self.batchSize)
    if self.freezeInterval > 0:
        self.nextLOut = self.buildNetwork(self.stateSize, self.actionSize,
                                          self.numFrames, self.batchSize)
        self.resetQHat()

    states = T.ftensor3('states')
    nextStates = T.ftensor3('nextStates')
    rewards = T.fcol('rewards')
    actions = T.icol('actions')
    terminals = T.icol('terminals')

    # Shared variables for training from a minibatch of replayed
    # state transitions, each consisting of num_frames + 1 (due to
    # overlap) states, along with the chosen action and resulting
    # reward and terminal status.
    self.states_shared = theano.shared(
        numpy.zeros((self.batchSize, self.numFrames + 1, self.stateSize),
                    dtype=theano.config.floatX))

    self.rewards_shared = theano.shared(
        numpy.zeros((self.batchSize, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))

    self.actions_shared = theano.shared(
        numpy.zeros((self.batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    self.terminals_shared = theano.shared(
        numpy.zeros((self.batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    # Shared variable for a single state, to calculate qVals.
    self.state_shared = theano.shared(
        numpy.zeros((self.numFrames, self.stateSize),
                    dtype=theano.config.floatX))

    qVals = lasagne.layers.get_output(self.lOut, states)

    if self.freezeInterval > 0:
        nextQVals = lasagne.layers.get_output(self.nextLOut, nextStates)
    else:
        nextQVals = lasagne.layers.get_output(self.lOut, nextStates)
        nextQVals = theano.gradient.disconnected_grad(nextQVals)

    # Cast terminals to floatX.
    terminalsX = terminals.astype(theano.config.floatX)

    # T.eq(a, b) returns a variable representing the logical
    # equality (a == b).
    actionmask = T.eq(T.arange(self.actionSize).reshape((1, -1)),
                      actions.reshape((-1, 1))).astype(theano.config.floatX)

    target = (rewards +
              (T.ones_like(terminalsX) - terminalsX) *
              self.discount * T.max(nextQVals, axis=1, keepdims=True))
    output = (qVals * actionmask).sum(axis=1).reshape((-1, 1))
    diff = target - output

    # No clipping branch here, since clip-delta is 0.
    loss = diff ** 2

    if self.batchAccumulator == 'sum':
        loss = T.sum(loss)
    elif self.batchAccumulator == 'mean':
        loss = T.mean(loss)
    else:
        raise ValueError('Bad accumulator: {}'.format(self.batchAccumulator))

    params = lasagne.layers.helper.get_all_params(self.lOut)
    train_givens = {
        states: self.states_shared[:, :-1],
        nextStates: self.states_shared[:, 1:],
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared
    }

    if self.updateRule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.learningRate,
                                          self.rho, self.rmsEpsilon)
    elif self.updateRule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.learningRate,
                                   self.rho, self.rmsEpsilon)
    else:
        raise ValueError('Unrecognized update: {}'.format(self.updateRule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([], [loss], updates=updates,
                                  givens=train_givens)

    q_givens = {
        states: self.state_shared.reshape(
            (1, self.numFrames, self.stateSize))
    }
    self._q_vals = theano.function([], qVals[0], givens=q_givens)
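# A small NumPy illustration (added here, not from the original file) of what
# the T.eq(...) action mask above computes: a one-hot row per sample that
# picks out Q(s, a) for the chosen action.  Shapes and values are made up.
import numpy as np

q_vals = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])             # (batch, num_actions)
actions = np.array([[2], [0]], dtype=np.int32)   # chosen action per sample

action_mask = (np.arange(q_vals.shape[1]).reshape(1, -1) ==
               actions.reshape(-1, 1)).astype(q_vals.dtype)
chosen_q = (q_vals * action_mask).sum(axis=1).reshape(-1, 1)

print(action_mask)   # [[0. 0. 1.], [1. 0. 0.]]
print(chosen_q)      # [[3.], [4.]]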
    target = reward + self.discount * next_q_val[next_action]
    diff = target - q_vals[action]
    loss = 0.5 * diff ** 2

    params = lasagne.layers.helper.get_all_params(self.l_out)
    givens = {
        state: self.state_shared,
        action: self.action_shared,
        reward: self.reward_shared,
        next_state: self.next_state_shared,
        next_action: self.next_action_shared
    }

    if update_rule == 'deepmind_rmsprop':
        updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                   self.rms_epsilon)
    elif update_rule == 'rmsprop':
        updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                          self.rms_epsilon)
    elif update_rule == 'sgd':
        updates = lasagne.updates.sgd(loss, params, self.lr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None,
                                                 self.momentum)

    self._train = theano.function([], [loss, q_vals], updates=updates,
                                  givens=givens)
    self._q_vals = theano.function([], q_vals,
                                   givens={state: self.state_shared})

def build_network(self, network_type, input_width, input_height,
                  output_dim, num_frames):
    if network_type == "large":
def __init__(self, input_width, input_height, avail_actions, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, train_all, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.avail_actions = avail_actions self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.train_all = train_all lasagne.random.set_rng(self.rng) self.update_counter = 0 print "num_actions: " + str(num_actions) self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
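# Standalone NumPy sketch (illustrative only, added here) of the clipped loss
# built above: quadratic within +/- clip_delta and linear beyond it, so the
# gradient stays bounded instead of vanishing the way it would if the TD
# error were clipped directly before squaring.
import numpy as np


def clipped_td_loss(diff, clip_delta):
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part


diffs = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(clipped_td_loss(diffs, clip_delta=1.0))   # [2.5  0.125 0.  0.125 2.5]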
def main(game_name, network_type, updates_method, target_network_update_frequency, initial_epsilon, final_epsilon, test_epsilon, final_exploration_frame, replay_start_size, deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate, deepmind_rmsprop_rho, rmsprop_epsilon, rmsprop_learning_rate, rmsprop_rho, phi_type, phi_method, epoch_size, n_training_epochs, n_test_epochs, visualize, record_dir, show_mood, replay_memory_size, no_replay, repeat_action, skip_n_frames_after_lol, max_actions_per_game, weights_dir, algo_initial_state_file, log_frequency, theano_verbose): args = locals() if theano_verbose: theano.config.compute_test_value = 'warn' theano.config.exception_verbosity = 'high' theano.config.optimizer = 'fast_compile' if game_name == 'simple_breakout': game = simple_breakout.SimpleBreakout() class P(object): def __init__(self): self.screen_size = (12, 12) def __call__(self, frames): return frames phi = P() else: ale = ag.init(game=game_name, display_screen=(visualize == 'ale'), record_dir=record_dir) game = ag.ALEGame(ale) if phi_type == '4': phi = ag.Phi4(method=phi_method) elif phi_type == '1': phi = ag.Phi(method=phi_method) else: raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type)) if network_type == 'nature': build_network = network.build_nature elif network_type == 'nature_with_pad': build_network = network.build_nature_with_pad elif network_type == 'nips': build_network = network.build_nips elif network_type == 'nature_with_pad_he': build_network = network.build_nature_with_pad_he elif hasattr(network_type, '__call__'): build_network = network_type else: raise RuntimeError( "Unknown network: {network}".format(network=network_type)) if updates_method == 'deepmind_rmsprop': updates = \ lambda loss, params: u.deepmind_rmsprop(loss, params, learning_rate=deepmind_rmsprop_learning_rate, rho=deepmind_rmsprop_rho, epsilon=deepmind_rmsprop_epsilon) elif updates_method == 'rmsprop': updates = \ lambda loss, params: lasagne.updates.rmsprop(loss, params, learning_rate=rmsprop_learning_rate, rho=rmsprop_rho, epsilon=rmsprop_epsilon) else: raise RuntimeError( "Unknown updates: {updates}".format(updates=updates_method)) replay_memory = dqn.ReplayMemory( size=replay_memory_size) if not no_replay else None def create_algo(): algo = dqn.DQNAlgo(game.n_actions(), replay_memory=replay_memory, build_network=build_network, updates=updates, screen_size=phi.screen_size) algo.replay_start_size = replay_start_size algo.final_epsilon = final_epsilon algo.initial_epsilon = initial_epsilon algo.log_frequency = log_frequency algo.target_network_update_frequency = target_network_update_frequency algo.final_exploration_frame = final_exploration_frame return algo algo_train = create_algo() algo_test = create_algo() algo_test.final_epsilon = test_epsilon algo_test.initial_epsilon = test_epsilon algo_test.epsilon = test_epsilon import Queue algo_train.mood_q = Queue.Queue() if show_mood else None if show_mood is not None: import Queue algo_train.mood_q = Queue.Queue() if show_mood == 'plot': plot = Plot() elif show_mood == "log": plot = Log() def worker(): while True: item = algo_train.mood_q.get() plot.show(item) algo_train.mood_q.task_done() import threading t = threading.Thread(target=worker) t.daemon = True t.start() print(str(algo_train)) if visualize != 'q': visualizer = q.GameNoVisualizer() else: if game_name == 'simple_breakout': visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train) else: visualizer = ag.ALEGameVisualizer(phi.screen_size) teacher = q.Teacher(game=game, 
algo=algo_train, game_visualizer=visualizer, phi=phi, repeat_action=repeat_action, max_actions_per_game=max_actions_per_game, skip_n_frames_after_lol=skip_n_frames_after_lol, tester=False) tester = q.Teacher(game=game, algo=algo_test, game_visualizer=visualizer, phi=phi, repeat_action=repeat_action, max_actions_per_game=max_actions_per_game, skip_n_frames_after_lol=skip_n_frames_after_lol, tester=True) q.teach_and_test(teacher, tester, n_epochs=n_training_epochs, frames_to_test_on=n_test_epochs * epoch_size, epoch_size=epoch_size, state_dir=weights_dir, algo_initial_state_file=algo_initial_state_file)
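# Minimal self-contained sketch (assumed example, not from the original code)
# of the producer/consumer pattern used for `mood_q` above: a daemon thread
# drains a Queue and hands each item to a display object, so plotting or
# logging never blocks the training loop.
import threading
import Queue  # Python 2, as in the surrounding code


class LogDisplay(object):
    def show(self, item):
        print(item)


mood_q = Queue.Queue()
plot = LogDisplay()


def worker():
    while True:
        item = mood_q.get()
        plot.show(item)
        mood_q.task_done()


t = threading.Thread(target=worker)
t.daemon = True          # let the process exit even if the worker is blocked
t.start()

mood_q.put({'epsilon': 0.1, 'mean_q': 1.23})
mood_q.join()            # wait until the worker has processed the item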
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, eta, params_share=True, double_learning=False, annealing=False, temp=1.0, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.eta = eta self.params_share = params_share self.double_learning = double_learning self.annealing = annealing self.temp0 = temp lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out, self.l_feature, self.l_init = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out, self.next_l_feature, self.next_l_init = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat_share() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') exp_temp = T.scalar('exploration tuning') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) images, along with the chosen action and resulting # reward and terminal status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.exp_temp_shared = theano.shared(np.float32(self.temp0)) # default without annealing # Shared variable for a single state, to calculate q_vals. 
self.state_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) feature_vals = lasagne.layers.get_output(self.l_feature, states / input_scale) q_params = lasagne.layers.get_all_params(self.l_out) q_params_vals = lasagne.layers.get_all_param_values(self.l_out) if self.params_share: w_pi = q_params[-2] b_pi = q_params[-1] else: params_init = lasagne.layers.get_all_param_values(self.l_init) w_pi = theano.shared(params_init[-2]) b_pi = theano.shared(params_init[-1]) pi_vals = T.nnet.softmax(exp_temp * (T.dot(feature_vals, w_pi) + b_pi)) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) if self.double_learning: next_feature_vals = lasagne.layers.get_output(self.l_feature, next_states / input_scale) next_q_params = lasagne.layers.get_all_params(self.l_out) next_q_params_vals = lasagne.layers.get_all_param_values(self.l_out) if self.params_share: next_w_pi = next_q_params[-2] next_b_pi = next_q_params[-1] else: next_params_init = lasagne.layers.get_all_param_values(self.l_init) next_w_pi = theano.shared(next_params_init[-2]) next_b_pi = theano.shared(next_params_init[-1]) next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi)) next_pi_vals = theano.gradient.disconnected_grad(next_pi_vals) else: next_feature_vals = lasagne.layers.get_output(self.next_l_feature, next_states / input_scale) next_q_params = lasagne.layers.get_all_params(self.next_l_out) next_q_params_vals = lasagne.layers.get_all_param_values(self.next_l_out) if self.params_share: next_w_pi = next_q_params[-2] next_b_pi = next_q_params[-1] else: next_params_init = lasagne.layers.get_all_param_values(self.next_l_init) next_w_pi = theano.shared(next_params_init[-2]) next_b_pi = theano.shared(next_params_init[-1]) next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi)) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.sum(next_q_vals * next_pi_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) if self.params_share: params = lasagne.layers.helper.get_all_params(self.l_out) else: params = lasagne.layers.helper.get_all_params(self.l_out) params.append(next_w_pi) params.append(next_b_pi) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared, exp_temp: self.exp_temp_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)) } pi_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)), exp_temp: self.exp_temp_shared } self._q_vals = theano.function([], q_vals[0], givens=q_givens) self._pi_vals = theano.function([], pi_vals[0], givens=pi_givens) grad_fc_w = T.grad(loss, self.l_out.W) self._grad = theano.function([], outputs=grad_fc_w, givens=train_givens)
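# Illustrative NumPy sketch (made-up numbers, not from the original code) of
# the temperature-scaled softmax policy pi_vals built above: larger exp_temp
# values sharpen the distribution toward the greedy action, smaller values
# flatten it toward uniform.
import numpy as np


def softmax(x):
    z = x - x.max()
    e = np.exp(z)
    return e / e.sum()


logits = np.array([1.0, 2.0, 0.5])        # stand-in for dot(features, w) + b
for exp_temp in (0.1, 1.0, 10.0):
    print(exp_temp, softmax(exp_temp * logits))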
def __init__(self, input_width, input_height, num_channels, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, network_params, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_channels = num_channels self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.lstm = None self.next_lstm = None logging.debug('network parameters', network_params) self.network_params = network_params lasagne.random.set_rng(self.rng) self.update_counter = 0 networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(networks, tuple): self.l_out = networks[0] self.lstm = networks[1] else: self.l_out = networks # theano.compile.function_dump('network.dump', self.l_out) if self.freeze_interval > 0: next_networks = self.build_network(network_type, num_channels, input_width, input_height, num_actions, num_frames, None) if isinstance(next_networks, tuple): self.next_l_out = next_networks[0] self.next_lstm = next_networks[1] else: self.next_l_out = next_networks self.reset_q_hat() # This really really needs to be floats for now. # It makes sense if they use it for computations btensor5 = T.TensorType(theano.config.floatX, (False,) * 5) states = btensor5('states') next_states = btensor5('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Apparently needed for some layers with a variable input size # Weird, because the others just allow a None batch size, # but let's just play safe for now # For now, it should always look exactly like states # (n_batch, n_time_steps) # mask = T.imatrix('mask') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='states') self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, num_channels, input_height, input_width), dtype=theano.config.floatX), name='next_states') self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True), name='rewards') self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True), name='actions') self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.mask_shared = theano.shared(np.ones((batch_size, num_frames), # dtype='int32')) # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) # mask_input=mask) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale ) # mask_input=mask) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale ) # mask_input=mask) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(target.shape[0]), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the 
gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) # print params givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) updates = update_for(params) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) # # Super mega shady stuff # # Somehow an update sneaks in for cell and hid. Kill it with fire if self.lstm: delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']] # print delete_keys for key in delete_keys: del updates[key] self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
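# Tiny Theano check (illustrative, added here) of what
# theano.gradient.disconnected_grad does in the graphs above: the wrapped
# value is used in the forward pass but treated as a constant by T.grad, so
# the bootstrapped next-step Q-values never receive gradient from the TD loss.
import theano
import theano.tensor as T

x = T.scalar('x')
y_stopped = theano.gradient.disconnected_grad(x ** 2)

cost = x * y_stopped          # forward value: x ** 3
g = T.grad(cost, x)           # derivative treats y_stopped as a constant

f = theano.function([x], [cost, g])
print(f(2.0))                 # values 8.0 and 4.0; without the stop-gradient
                              # the derivative at x = 2 would be 12.0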
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, use_double, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.use_double = use_double self.rng = rng # Using Double DQN is pointless without periodic freezing if self.use_double: assert self.freeze_interval > 0 # pass lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: # Nature. If using periodic freezing next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: # NIPS next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) if self.use_double: maxaction = T.argmax(q_vals, axis=1, keepdims=False) temptargets = next_q_vals[T.arange(batch_size),maxaction].reshape((-1, 1)) target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * temptargets) else: target = (rewards + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) def inspect_inputs(i, node, fn): if ('maxand' not in str(node).lower() and '12345' not in str(node)): return print i, node, "input(s) value(s):", [input[0] for input in fn.inputs], raw_input('press enter') def inspect_outputs(i, node, fn): if ('maxand' not in str(node).lower() and '12345' not in str(node)): return if '12345' in str(node): print "output(s) value(s):", [np.asarray(output[0]) for output in fn.outputs] else: print "output(s) value(s):", [output[0] for output in fn.outputs] raw_input('press enter') if False: self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens, mode=theano.compile.MonitorMode( pre_func=inspect_inputs, post_func=inspect_outputs)) theano.printing.debugprint(target) else: self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) if False: self._q_vals = theano.function([], q_vals, givens={states: self.states_shared}, mode=theano.compile.MonitorMode( pre_func=inspect_inputs, post_func=inspect_outputs)) else: self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
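# NumPy sketch (illustrative, made-up numbers) of the Double DQN target used
# above: the online network picks the argmax action, the frozen target
# network supplies the value for that action, which damps the overestimation
# of the plain max target.
import numpy as np

q_online_next = np.array([[1.0, 5.0, 2.0]])    # online net, next state
q_target_next = np.array([[0.8, 3.0, 4.0]])    # frozen target net, next state
reward, terminal, discount = 1.0, 0, 0.99

a_star = q_online_next.argmax(axis=1)                    # -> [1]
double_q = q_target_next[np.arange(1), a_star]           # -> [3.0], not 4.0
double_target = reward + (1 - terminal) * discount * double_q

vanilla_target = reward + (1 - terminal) * discount * q_target_next.max(axis=1)
print(double_target, vanilla_target)                     # [3.97] [4.96]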
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network( network_type, input_width, input_height, num_actions, num_frames, batch_size ) if self.freeze_interval > 0: self.next_l_out = self.build_network( network_type, input_width, input_height, num_actions, num_frames, batch_size ) self.reset_q_hat( ) states, next_states = T.tensor4( 'states' ), T.tensor4( 'next_states' ) actions, rewards = T.icol( 'actions' ), T.col( 'rewards' ) terminals = T.icol( 'terminals' ) self.states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ), dtype = theano.config.floatX ) ) self.next_states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ), dtype = theano.config.floatX ) ) self.rewards_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = theano.config.floatX ), broadcastable = ( False, True ) ) self.actions_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ), broadcastable = ( False, True ) ) self.terminals_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ), broadcastable = ( False, True ) ) ## Get learned Q-values q_vals_test = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = True ) # q_vals_test = theano.gradient.disconnected_grad( q_vals_test ) q_vals_train = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = False ) if self.freeze_interval > 0: target_q_vals = lasagne.layers.get_output( self.next_l_out, next_states / input_scale, deterministic = True) else: target_q_vals = lasagne.layers.get_output( self.l_out, next_states / input_scale, deterministic = True) target_q_vals = theano.gradient.disconnected_grad( target_q_vals ) ## The traget depends on the received rewards and the discounted future ## reward stream for the given action in the current state. target = ( rewards + ( T.ones_like( terminals ) - terminals ) * self.discount * T.max( target_q_vals, axis = 1, keepdims = True ) ) ## target - b x 1, where b is batch size. ## q_vals - b x A, where A is the number of outputs of the Q-net ## Theano differentiates indexed (and reduced) arrays in a clever manner: ## it sets all left out gradients to zero. THIS IS CORRECT! ## \nabla_\theta diff = - 1_{a = a_j} \nabla Q( s, a_j, \theta) \,. diff = target - q_vals_train[ T.arange( batch_size ), actions.reshape( ( -1, ) ) ].reshape( ( -1, 1 ) ) if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], loss, updates=updates, givens=givens) self._q_vals = theano.function([], q_vals_test, givens={states: self.states_shared})
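# Brief illustrative sketch (assumed toy architecture, not the network built
# above): the constructor above requests deterministic outputs for the
# evaluation and target passes and stochastic outputs for the training pass.
# With Lasagne, get_output(..., deterministic=True) disables stochastic
# layers such as dropout, which is what you want for the frozen target pass
# and for greedy action selection.
import numpy as np
import theano
import theano.tensor as T
import lasagne

x = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 8), input_var=x)
l_drop = lasagne.layers.DropoutLayer(l_in, p=0.5)
l_out = lasagne.layers.DenseLayer(l_drop, num_units=3)

train_out = lasagne.layers.get_output(l_out, deterministic=False)  # stochastic
eval_out = lasagne.layers.get_output(l_out, deterministic=True)    # dropout off

f_train = theano.function([x], train_out)
f_eval = theano.function([x], eval_out)
data = np.ones((2, 8), dtype=theano.config.floatX)
print(f_eval(data))    # repeatable
print(f_train(data))   # varies run to run because of dropout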
def __init__(self, model, slen, gamma=0.995, n_hidden1=25, n_hidden2=25, learning_rate=0.0002, freeze_interval=1000, momentum=0.0, learner_type="DDQN", minibatch_size=20, train_interval=100): self.freeze_interval = freeze_interval self.freeze_counter = 0 self.slen = slen #train data self.minibatch_size = minibatch_size self.train_interval = train_interval self.train_set_x = theano.shared(numpy.zeros( [minibatch_size * train_interval, slen], dtype=theano.config.floatX), borrow=True) self.train_set_y = theano.shared(numpy.zeros( [minibatch_size * train_interval, slen], dtype=theano.config.floatX), borrow=True) #variables self.index = T.lscalar() self.s = T.matrix('s') # the data is presented as rasterized images self.sp = T.matrix('sp') #s prime self.rng = numpy.random.RandomState(None) if learner_type == "DDQN": self.classifier = MLP_DDQN(rng=self.rng, input1=self.s, input2=self.sp, n_in=slen, n_hidden1=n_hidden1, n_hidden2=n_hidden2, n_out=3, model=model, gamma=gamma) elif learner_type == "DQN": self.classifier = MLP_DQN(rng=self.rng, input1=self.s, input2=self.sp, n_in=slen, n_hidden1=n_hidden1, n_hidden2=n_hidden2, n_out=3, model=model, gamma=gamma) self.cost_v = self.classifier.cost_v self.cost = self.classifier.cost self.rmsprop = RMSProp(self.classifier.params) self.gparams = [ T.grad(self.cost, param) for param in self.classifier.params ] # self.updates_no_m = self.rmsprop.updates(self.classifier.params,self.gparams,learning_rate,0.0) # self.updates = self.rmsprop.updates(self.classifier.params,self.gparams,learning_rate,momentum) self.updates = deepmind_rmsprop(self.gparams, self.classifier.params, learning_rate, momentum, 1e-4) self.model = (self.classifier.Wh1.get_value(borrow=True), self.classifier.Wh2.get_value(borrow=True), self.classifier.bh1.get_value(borrow=True), self.classifier.bh2.get_value(borrow=True), self.classifier.OW.get_value(borrow=True), self.classifier.Ob.get_value(borrow=True)) self.model_to_save = (self.classifier.Wh1.get_value(borrow=True), self.classifier.Wh2.get_value(borrow=True), self.classifier.bh1.get_value(borrow=True), self.classifier.bh2.get_value(borrow=True), self.classifier.OW.get_value(borrow=True), self.classifier.Ob.get_value(borrow=True)) self.to_save_id = 0 self.saved = True self.train_model_prioritize = theano.function( inputs=[self.index], outputs=self.cost_v, updates=self.updates, givens={ self.s: self.train_set_x[self.index * minibatch_size:(self.index + 1) * minibatch_size], self.sp: self.train_set_y[self.index * minibatch_size:(self.index + 1) * minibatch_size] }) self.train_model = theano.function( inputs=[self.index], outputs=self.cost, updates=self.updates, givens={ self.s: self.train_set_x[self.index * minibatch_size:(self.index + 1) * minibatch_size], self.sp: self.train_set_y[self.index * minibatch_size:(self.index + 1) * minibatch_size] }) self.report_action = theano.function(inputs=[self.s], outputs=self.classifier.aidx, allow_input_downcast=True) self.action = theano.function(inputs=[self.s], outputs=T.argmax(self.classifier.Qs), allow_input_downcast=True)
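# Small self-contained Theano sketch (assumed example, not the original MLP)
# of the indexed-minibatch `givens` pattern used above: the whole training
# buffer lives in a shared variable and each call slices out one minibatch on
# the device, selected by an integer index.
import numpy as np
import theano
import theano.tensor as T

minibatch_size = 4
data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(10, 2))

index = T.lscalar('index')
x = T.matrix('x')
cost = T.sum(x ** 2)

batch_cost = theano.function(
    [index], cost,
    givens={x: data[index * minibatch_size:(index + 1) * minibatch_size]})

print(batch_cost(0))   # cost over rows 0..3
print(batch_cost(1))   # cost over rows 4..7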
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, lambda_reg, batch_accumulator, pretrained_net, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng self.lambda_reg = lambda_reg lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \ self.build_network(network_type, \ input_width, input_height, num_actions,\ num_frames, batch_size) if self.freeze_interval > 0: self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \ self.build_network(network_type, input_width, \ input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.imatrix('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed # state transitions, each consisting of num_frames + 1 (due to # overlap) images, along with the chosen action and resulting # reward and terminal status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames*2+1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, num_frames), dtype='int32') ) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals. self.state_shared = theano.shared( np.zeros((num_frames*2, input_height, input_width), dtype=theano.config.floatX)) q_vals, z_pred, z_true = lasagne.layers.get_output( [self.l_out, self.pred_z, self.true_z], inputs = {self.l_in: states / input_scale, self.l_act_in: actions} ) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output( self.next_l_out, {self.next_l_in: next_states / input_scale, self.next_l_act_in: actions} ) else: next_q_vals = lasagne.layers.get_output( self.l_out, {self.l_in: next_states / input_scale, self.l_act_in: actions} ) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output diff_reg = z_true - z_pred if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. 
quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1) if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z]) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames*2, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
elif o in ("--dqn.network",): if a == 'nature': d["dqn.network"] = network.build_nature if a == 'nature_with_pad': d["dqn.network"] = network.build_nature_with_pad elif a == 'nips': d["dqn.network"] = network.build_nips elif a == 'nature_dnn': d["dqn.network"] = network.build_nature_dnn elif a == 'nips_dnn': d["dqn.network"] = network.build_nips_dnn elif o in ("--dqn.updates",): import updates if a == 'deepmind_rmsprop': d["dqn.updates"] = \ lambda loss, params: updates.deepmind_rmsprop(loss, params, learning_rate=.00025, rho=.95, epsilon=.01) elif a == 'rmsprop': d["dqn.updates"] = \ lambda loss, params: lasagne.updates.rmsprop(loss, params, learning_rate=.0002, rho=.95, epsilon=1e-6) else: assert False, "unhandled option" import pprint pp = pprint.PrettyPrinter(depth=2) print(optlist) print(args) print(sys.argv) print("") pp.pprint(d) main(**d)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0, double=False, transition_length=4): if double: print 'USING DOUBLE DQN' self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states_t') actions = T.icol('actions_t') target = T.col('evaluation_t') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.target_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.states_transition_shared = theano.shared( np.zeros((batch_size, transition_length * 2, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.states_one_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) """get Q(s) batch_size = 1 """ q1_givens = { states: self.states_one_shared.reshape( (1, self.num_frames, self.input_height, self.input_width)) } self._q1_vals = theano.function([], q_vals[0], givens=q1_givens) """get Q(s) batch_size = batch size """ q_batch_givens = { states: self.states_shared.reshape((self.batch_size, self.num_frames, self.input_height, self.input_width)) } self._q_batch_vals = theano.function([], q_vals, givens=q_batch_givens) action_mask = T.eq( T.arange(num_actions).reshape((1, -1)), actions.reshape( (-1, 1))).astype(theano.config.floatX) q_s_a = (q_vals * action_mask).sum(axis=1).reshape((-1, 1)) """ get Q(s,a) batch_size = batch size """ q_s_a_givens = { states: self.states_shared.reshape((self.batch_size, self.num_frames, self.input_height, self.input_width)), actions: self.actions_shared } self._q_s_a_vals = theano.function([], q_s_a, givens=q_s_a_givens) if self.freeze_interval > 0: q_target_vals = lasagne.layers.get_output(self.next_l_out, states / input_scale) else: q_target_vals = lasagne.layers.get_output(self.l_out, states / input_scale) q_target_vals = theano.gradient.disconnected_grad(q_target_vals) if not double: q_target = T.max(q_target_vals, axis=1) else: greedy_actions = T.argmax(q_vals, axis=1) q_target_mask = T.eq( T.arange(num_actions).reshape((1, -1)), greedy_actions.reshape((-1, 1)).astype(theano.config.floatX)) q_target = (q_target_vals * q_target_mask).sum(axis=1).reshape( (-1, 1)) """get Q target Q'(s,a') for a batch of transitions batch size = batch_size * transition length""" q_target_transition_givens = { states: self.states_transition_shared.reshape( (batch_size * transition_length * 2, self.num_frames, self.input_height, self.input_width)) } self._q_target = theano.function([], 
q_target.reshape( (batch_size, transition_length * 2)), givens=q_target_transition_givens) """get Q target_vals Q'(s) for a batch of transitions batch size = batch_size * transition length""" self._q_target_vals = theano.function( [], q_target_vals.reshape( (batch_size, transition_length * 2, num_actions)), givens=q_target_transition_givens) diff = q_s_a - target if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part else: loss = 0.5 * diff**2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) """Q(s,a) target train()""" train_givens = { states: self.states_shared, actions: self.actions_shared, target: self.target_shared } self._train = theano.function([], [loss], updates=updates, givens=train_givens, on_unused_input='warn') self._train2 = theano.function([], [loss], updates=updates, givens=train_givens, on_unused_input='warn')
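# Hedged NumPy illustration (values made up) of the `double` branch above: the online
# network's argmax picks the action while the frozen target network evaluates it, in
# contrast to plain DQN, which takes the max over the target network directly.
import numpy as np

q_vals = np.array([[1.0, 3.0], [2.0, 0.5]])          # online net Q(s', .)
q_target_vals = np.array([[1.5, 2.0], [0.8, 1.2]])   # target net Q'(s', .)

vanilla_target = q_target_vals.max(axis=1)           # plain DQN: max_a Q'(s', a)
greedy_actions = q_vals.argmax(axis=1)               # double DQN: argmax from the online net
double_target = q_target_vals[np.arange(2), greedy_actions]

print(vanilla_target)   # [2.  1.2]
print(double_target)    # [2.  0.8] -> Q' evaluated at the online net's greedy action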
elif o in ("--dqn.no_replay",): d["dqn.no_replay"] = True elif o in ("--dqn.network",): if a == 'nature': d["dqn.network"] = network.build_nature elif a == 'nips': d["dqn.network"] = network.build_nips elif a == 'nature_dnn': d["dqn.network"] = network.build_nature_dnn elif a == 'nips_dnn': d["dqn.network"] = network.build_nips_dnn elif o in ("--dqn.updates",): import updates if a == 'deepmind_rmsprop': d["dqn.updates"] = \ lambda loss, params: updates.deepmind_rmsprop(loss, params, learning_rate=.00025, rho=.95, epsilon=.1) elif a == 'rmsprop': d["dqn.updates"] = \ lambda loss, params: lasagne.updates.rmsprop(loss, params, learning_rate=.0002, rho=.95, epsilon=1e-6) else: assert False, "unhandled option" import pprint pp = pprint.PrettyPrinter(depth=2) print(optlist) print(args) print(sys.argv) print("") pp.pprint(d) main(**d)
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.gamma = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') #terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared(np.zeros( (batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # self.terminals_shared = theano.shared( # np.zeros((batch_size, 1), dtype='int32'), # broadcastable=(False,True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = rewards + self.gamma * T.max( next_q_vals, axis=1, keepdims=True) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1, ))].reshape((-1, 1)) if batch_accumulator == 'sum': loss = T.sum(diff**2) elif batch_accumulator == 'mean': loss = T.mean(diff**2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, #terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
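# Hedged NumPy sketch (illustrative numbers) of what dropping the terminals mask in the
# variant above implies: a terminal transition still bootstraps from max_a Q(s', a),
# whereas the masked form used in the other variants does not.
import numpy as np

rewards = np.array([[1.0], [0.0]])
max_next_q = np.array([[5.0], [2.0]])
terminals = np.array([[1], [0]])      # the first transition ends its episode
gamma = 0.95

target_no_mask = rewards + gamma * max_next_q                      # as in this variant
target_with_mask = rewards + (1 - terminals) * gamma * max_next_q  # as in the masked variants

print(target_no_mask.ravel())    # [5.75 1.9 ]
print(target_with_mask.ravel())  # [1.   1.9 ]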
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, rng, input_scale=255.0): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.rng = rng # print "NETWORK---------------------------" # print "input width ", self.input_width # print "input height", self.input_height # print "num actiuons", self.num_actions # print "num frames", self.num_frames # print "batch size", self.batch_size # print "discount", self.discount # print "rho", self.rho # print "lr", self.lr # print "rms_epsilon", self.rms_epsilon # print "momentum", self.momentum # print "clip_delta", self.clip_delta # print "freeze_ intercal", self.freeze_interval # print "rng", self.rng lasagne.random.set_rng(self.rng) self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') # Shared variables for training from a minibatch of replayed state transitions, # each consisting of num_frames + 1 (due to overlap) images, along with # the chosen action and resulting reward and termnial status. self.imgs_shared = theano.shared( np.zeros((batch_size, num_frames + 1, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) # Shared variable for a single state, to calculate q_vals self.state_shared = theano.shared( np.zeros((num_frames, input_height, input_width), dtype=theano.config.floatX)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) terminalsX = terminals.astype(theano.config.floatX) actionmask = T.eq(T.arange(num_actions).reshape((1, -1)), actions.reshape((-1, 1))).astype(theano.config.floatX) target = (rewards + (T.ones_like(terminalsX) - terminalsX) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1)) diff = target - output if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. 
# # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) train_givens = { states: self.imgs_shared[:, :-1], next_states: self.imgs_shared[:, 1:], rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss], updates=updates, givens=train_givens) q_givens = { states: self.state_shared.reshape((1, self.num_frames, self.input_height, self.input_width)) } self._q_vals = theano.function([], q_vals[0], givens=q_givens)
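# Hedged NumPy version of the clipped-loss construction above: quadratic inside the clip
# bound, linear outside, so the gradient magnitude is capped at clip_delta rather than
# forced to zero. `clipped_loss` is a throwaway helper for illustration only.
import numpy as np

def clipped_loss(diff, clip_delta=1.0):
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

diff = np.array([0.3, 1.0, 4.0])
print(clipped_loss(diff))   # [0.045 0.5   3.5  ]
print(0.5 * diff ** 2)      # [0.045 0.5   8.   ] -> naive squared loss for comparison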
def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, update_rule, batch_accumulator, randomState, frame_scale=255.0): """ Initialize environment Arguments: environment - the environment (class Env) num_elements_in_batch - list of k integers for the number of each element kept as belief state num_actions - int discount - float learning_rate - float rho, rms_epsilon, momentum - float, float, float ... network_type - string ... """ self._environment = environment self._batchSize = batchSize self._inputDimensions = self._environment.inputDimensions() self._nActions = self._environment.nActions() self._df = 0 self.rho = rho self._lr = 0 self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self._randomState = randomState lasagne.random.set_rng(self._randomState) self.update_counter = 0 states=[] # list of symbolic variables for each of the k element in the belief state # --> [ T.tensor4 if observation of element=matrix, T.tensor3 if vector, T.tensor 2 if scalar ] next_states=[] # idem than states at t+1 self.states_shared=[] # list of shared variable for each of the k element in the belief state self.next_states_shared=[] # idem that self.states_shared at t+1 for i, dim in enumerate(self._inputDimensions): if len(dim) == 3: states.append(T.tensor4("%s_%s" % ("state", i))) next_states.append(T.tensor4("%s_%s" % ("next_state", i))) elif len(dim) == 2: states.append(T.tensor3("%s_%s" % ("state", i))) next_states.append(T.tensor3("%s_%s" % ("next_state", i))) elif len(dim) == 1: states.append( T.matrix("%s_%s" % ("state", i)) ) next_states.append( T.matrix("%s_%s" % ("next_state", i)) ) self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False)) print("Number of observations per state: {}".format(len(self.states_shared))) print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions)) rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX) thelr = T.scalar(name='thelr', dtype=theano.config.floatX) self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states) print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv)) self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states) self._resetQHat() self.rewards_shared = theano.shared( np.zeros((batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batchSize, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out) next_q_vals = lasagne.layers.get_output(self.next_l_out) max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True) T_ones_like=T.ones_like(T.ones_like(terminals) - terminals) target = rewards + T_ones_like * thediscount * max_next_q_vals q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1)) diff = target - q_val if self.clip_delta > 0: # If we simply take the squared clipped diff as our loss, # then the gradient will be zero whenever the diff exceeds # the clip bounds. 
To avoid this, we extend the loss # linearly past the clip point to keep the gradient constant # in that regime. # # This is equivalent to declaring d loss/d q_vals to be # equal to the clipped diff, then backpropagating from # there, which is what the DeepMind implementation does. quadratic_part = T.minimum(abs(diff), self.clip_delta) linear_part = abs(diff) - quadratic_part loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part else: loss = 0.5 * diff ** 2 if batch_accumulator == 'sum': loss = T.sum(loss) elif batch_accumulator == 'mean': loss = T.mean(loss) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) for conv_param in self.l_outs_conv: for p in lasagne.layers.helper.get_all_params(conv_param): params.append(p) givens = { rewards: self.rewards_shared, actions: self.actions_shared, ## actions not needed! terminals: self.terminals_shared } for i, x in enumerate(self.states_shared): givens[ states[i] ] = x for i, x in enumerate(self.next_states_shared): givens[ next_states[i] ] = x if update_rule == 'deepmind_rmsprop': grads = get_or_compute_grads(loss, params) updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, thelr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates, givens=givens, on_unused_input='warn') givens2={} for i, x in enumerate(self.states_shared): givens2[ states[i] ] = x self._q_vals = theano.function([], q_vals, givens=givens2, on_unused_input='warn')
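# Hedged usage sketch for the variant above, where the discount and learning rate are
# symbolic inputs (thediscount, thelr) of self._train rather than fixed attributes: any
# schedule can be fed in per call. The linear anneal below is an assumption made for
# illustration, not taken from this code.
def annealed_lr(step, lr_start=0.00025, lr_end=0.00005, anneal_steps=1000000):
    frac = min(float(step) / anneal_steps, 1.0)
    return lr_start + frac * (lr_end - lr_start)

for step in (0, 500000, 2000000):
    print(step, annealed_lr(step))
    # loss, q_vals = self._train(0.99, annealed_lr(step))  # hypothetical call inside an agent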
def __init__(self, input_width, input_height, num_actions, num_frames, discount, learning_rate, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, update_rule, batch_accumulator, input_scale=255.0, reward_bias=0.): self.input_width = input_width self.input_height = input_height self.num_actions = num_actions self.num_frames = num_frames self.batch_size = batch_size self.discount = discount self.rho = rho self.lr = learning_rate self.rms_epsilon = rms_epsilon self.momentum = momentum self.clip_delta = clip_delta self.freeze_interval = freeze_interval self.update_counter = 0 self.l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) if self.freeze_interval > 0: self.next_l_out = self.build_network(network_type, input_width, input_height, num_actions, num_frames, batch_size) self.reset_q_hat() states = T.tensor4('states') next_states = T.tensor4('next_states') rewards = T.col('rewards') actions = T.icol('actions') terminals = T.icol('terminals') self.states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.next_states_shared = theano.shared( np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX)) self.rewards_shared = theano.shared( np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)) self.actions_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) self.terminals_shared = theano.shared( np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False, True)) q_vals = lasagne.layers.get_output(self.l_out, states / input_scale) if self.freeze_interval > 0: next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states / input_scale) else: next_q_vals = lasagne.layers.get_output(self.l_out, next_states / input_scale) next_q_vals = theano.gradient.disconnected_grad(next_q_vals) target = (rewards + reward_bias + (T.ones_like(terminals) - terminals) * self.discount * T.max(next_q_vals, axis=1, keepdims=True)) diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1)) if self.clip_delta > 0: diff = diff.clip(-self.clip_delta, self.clip_delta) if batch_accumulator == 'sum': loss = T.sum(diff ** 2) elif batch_accumulator == 'mean': loss = T.mean(diff ** 2) else: raise ValueError("Bad accumulator: {}".format(batch_accumulator)) params = lasagne.layers.helper.get_all_params(self.l_out) givens = { states: self.states_shared, next_states: self.next_states_shared, rewards: self.rewards_shared, actions: self.actions_shared, terminals: self.terminals_shared } if update_rule == 'deepmind_rmsprop': updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'rmsprop': updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon) elif update_rule == 'sgd': updates = lasagne.updates.sgd(loss, params, self.lr) else: raise ValueError("Unrecognized update: {}".format(update_rule)) if self.momentum > 0: updates = lasagne.updates.apply_momentum(updates, None, self.momentum) self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens) self._q_vals = theano.function([], q_vals, givens={states: self.states_shared})
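# Hedged NumPy illustration (made-up numbers) of the reward_bias term in the variant
# above: the bias is added to every reward inside the target, uniformly shifting the
# bootstrapped values.
import numpy as np

rewards = np.array([0.0, 1.0])
reward_bias = -0.1
max_next_q = np.array([2.0, 3.0])
terminals = np.array([0, 0])
gamma = 0.99

target = (rewards + reward_bias) + (1 - terminals) * gamma * max_next_q
print(target)   # [1.88 3.87]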
def __init__(
    self,
    state_width,
    action_width,
    action_bound,
    num_frames,
    discount,
    learning_rate,
    u_lr,
    rho,
    rms_epsilon,
    momentum,
    clip_delta,
    freeze_interval,
    batch_size,
    network_type,
    update_rule,
    batch_accumulator,
    rng,
):
    self.state_width = state_width
    self.action_width = action_width
    self.action_bound = action_bound  # TODO: unused
    self.num_frames = num_frames  # this is phi_length
    self.batch_size = batch_size
    self.discount = discount
    self.rho = rho
    self.lr = learning_rate
    self.u_lr = u_lr
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.clip_delta = clip_delta
    self.freeze_interval = freeze_interval
    self.rng = rng

    lasagne.random.set_rng(self.rng)
    self.update_counter = 0

    ######init u_net######
    """Initialise the policy network u_net:
    build the symbolic and shared variables for the states etc.,
    build the network,
    produce the actions u_acts,
    produce the network parameters u_params.
    """
    states = T.tensor4("states")
    next_states = T.tensor4("next_states")
    rewards = T.col("rewards")
    actions = T.matrix("actions")
    terminals = T.icol("terminals")

    self.states_shared = theano.shared(
        np.zeros((batch_size, num_frames, state_width, 1), dtype=theano.config.floatX)
    )  # exactly these four dimensions
    self.next_states_shared = theano.shared(
        np.zeros((batch_size, num_frames, state_width, 1), dtype=theano.config.floatX)
    )
    self.rewards_shared = theano.shared(
        np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)
    )
    self.actions_shared = theano.shared(
        np.zeros((batch_size, action_width), dtype=theano.config.floatX), broadcastable=(False, True)
    )
    self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True))

    self.u_l_out = self.build_u_network(network_type, state_width, 1, action_width, num_frames, batch_size)
    u_acts = lasagne.layers.get_output(self.u_l_out, states)
    u_params = lasagne.layers.helper.get_all_params(self.u_l_out)
    ######------######

    ######init q_net#####
    """Initialise the critic network q_net:
    build the network,
    produce the evaluation q_vals,
    produce next_q_vals for the next time step,
    produce the TD error diff,
    produce the network parameters q_params.
    From these, via the intermediate q_loss, produce q_updates.
    """
    self.q_l_out, in_l1, in_l2 = self.build_q_network(
        network_type, state_width, 1, action_width, num_frames, batch_size
    )
    if self.freeze_interval > 0:  # what is this?
        self.next_q_l_out = self.build_q_network(network_type, state_width, 1, action_width, num_frames, batch_size)
        self.reset_q_hat()

    # The inputs are specified explicitly below; note that both states and actions are inputs.
    # The output should be (batch, 1); the action here must be the actual input action.
    q_vals = lasagne.layers.get_output(self.q_l_out, {in_l1: states, in_l2: actions})  # TODO: the open question is how to write this line

    if self.freeze_interval > 0:  # what is this?
        next_q_vals = lasagne.layers.get_output(self.next_q_l_out, {in_l1: next_states, in_l2: u_acts})
    else:
        next_q_vals = lasagne.layers.get_output(self.q_l_out, {in_l1: next_states, in_l2: u_acts})
    next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

    # delta_t of Eq. (16) in the DPG paper; quite different from DQN.
    diff = (rewards + (T.ones_like(terminals) - terminals) * self.discount * next_q_vals) - q_vals

    # Eqs. (17) and (18) have to be written out by hand; the two gradient terms in the
    # formulas are computed directly with T.grad. Also: where does Eq. (17) come from?
    if self.clip_delta > 0:
        # If we simply take the squared clipped diff as our loss,
        # then the gradient will be zero whenever the diff exceeds
        # the clip bounds. To avoid this, we extend the loss
        # linearly past the clip point to keep the gradient constant
        # in that regime.
        #
        # This is equivalent to declaring d loss/d q_vals to be
        # equal to the clipped diff, then backpropagating from
        # there, which is what the DeepMind implementation does.
        quadratic_part = T.minimum(abs(diff), self.clip_delta)
        linear_part = abs(diff) - quadratic_part
        q_loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
    else:
        q_loss = 0.5 * diff ** 2

    # The objective q_loss is essentially just diff, a function of theta; since we only take
    # partial derivatives, the reward is treated as a constant independent of theta.
    if batch_accumulator == "sum":
        q_loss = T.sum(q_loss)  # shape (1)
    elif batch_accumulator == "mean":
        q_loss = T.mean(q_loss)
    else:
        raise ValueError("Bad accumulator: {}".format(batch_accumulator))

    q_params = lasagne.layers.helper.get_all_params(self.q_l_out)
    givens = {
        states: self.states_shared,
        next_states: self.next_states_shared,
        rewards: self.rewards_shared,
        actions: self.actions_shared,
        terminals: self.terminals_shared,
    }

    if update_rule == "deepmind_rmsprop":
        q_updates = deepmind_rmsprop(q_loss, q_params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == "rmsprop":
        q_updates = lasagne.updates.rmsprop(q_loss, q_params, self.lr, self.rho, self.rms_epsilon)
    elif update_rule == "sgd":
        q_updates = lasagne.updates.sgd(q_loss, q_params, self.lr)
    else:
        raise ValueError("Unrecognized update: {}".format(update_rule))
    ######------######

    """First produce u_updates (placed here because u_updates depends on q_updates),
    then the combined updates, which makes training possible;
    produce the compiled function _train,
    and the compiled functions get_u_acts and get_q_vals for the network outputs.
    """
    # Ignore 124-136; rewrite the updates.
    # For example, differentiate q_loss with respect to q_params here.
    # opdac_rmsprop implements Eq. (18).
    if batch_accumulator == "sum":
        acm_u_acts = T.sum(u_acts)  # crude for now; fine while the actions are one-dimensional, shape ()
        acm_q = T.sum(q_vals)
    elif batch_accumulator == "mean":
        acm_u_acts = T.mean(u_acts)
        acm_q = T.mean(q_vals)
    u_updates = opdac_rmsprop(
        acm_q, actions, acm_u_acts, u_params, self.u_lr, False
    )  # TODO: should this take states, or states_shared?

    self.get_u_acts = theano.function([], u_acts, givens={states: self.states_shared})
    # get_q_vals may turn out to be unnecessary.
    self.get_q_vals = theano.function([], q_vals, givens={states: self.states_shared, actions: self.actions_shared})

    # Equivalent spelling: updates = OrderedDict(q_updates, **u_updates); both merge the two dicts.
    updates = OrderedDict(q_updates.items() + u_updates.items())
    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

    # This is Eqs. (16) and (17).
    self._train = theano.function(
        [], [q_loss, q_vals], updates=updates, givens=givens
    )  # note: swapping values in through givens lets each call use fresh data
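# Hedged NumPy toy (not the networks built by build_u_network / build_q_network) of the
# deterministic policy-gradient step that opdac_rmsprop above is meant to realise: the
# critic supplies dQ/da at a = u(s), and the actor parameters move along
# dQ/da * du/dtheta, i.e. the Eqs. (17)/(18) referenced in the comments above.
import numpy as np

theta = np.array([0.5, -0.2])       # toy actor parameters, u(s) = theta . s
w = np.array([1.0, 2.0, 0.3])       # toy critic parameters, Q(s, a) = w . [s_0, s_1, a]
s = np.array([0.4, 0.1])
u_lr = 0.01

a = theta.dot(s)                    # actor output u(s)
dQ_da = w[2]                        # critic gradient w.r.t. the action (constant for a linear critic)
du_dtheta = s                       # actor gradient w.r.t. its parameters
theta = theta + u_lr * dQ_da * du_dtheta   # gradient ascent on Q
print(theta)                        # [ 0.5012 -0.1997]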