def __init__(self, n_actions, replay_memory, initial_weights_file=None):
    self.mood_q = None
    self.last_q = 0
    self.n_parameter_updates = 0
    self.ignore_feedback = False
    self.alpha = 0.00025
    # update frequency ?
    # gradient momentum ? 0.95
    # squared gradient momentum ? 0.95
    # min squared gradient ? 0.01
    self.save_every_n_frames = 100000  # ~ once per hour
    self.final_exploration_frame = 1000000
    self.replay_start_size = 50000
    self.i_frames = 0
    self.state = None
    self.initial_epsilon = 1
    self.final_epsilon = 0.1
    self.epsilon = self.initial_epsilon
    self.gamma = 0.99
    self.replay_memory = replay_memory
    self.log_frequency = 50
    self.minibatch_size = 32
    # self.replay_memory_size = 1000000
    self.target_network_update_frequency = 10000

    # Symbolic inputs for one minibatch of transitions (s0, a0, r0, s1).
    s0_var = T.tensor4("s0", dtype=theano.config.floatX)
    a0_var = T.bmatrix("a0")
    r0_var = T.wcol("r0")
    s1_var = T.tensor4("s1", dtype=theano.config.floatX)
    future_reward_indicator_var = T.bcol("future_reward_indicator")

    self.n_actions = n_actions
    self.a_lookup = np.eye(self.n_actions, dtype=np.int8)  # one-hot rows, one per action

    # Online network, fed s0 scaled from [0, 255] down to [0, 1).
    self.network = build_cnn(n_actions=self.n_actions,
                             input_var=T.cast(s0_var, 'float32') / np.float32(256))
    print("Compiling forward.")
    self.forward = theano.function(
        [s0_var], lasagne.layers.get_output(self.network, deterministic=True))

    # Target ("stale") network, fed s1; synced from the online network
    # every target_network_update_frequency parameter updates.
    self.network_stale = build_cnn(n_actions=self.n_actions,
                                   input_var=T.cast(s1_var, 'float32') / np.float32(256))
    print("Compiling forward_stale.")
    self.forward_stale = theano.function(
        [s1_var], lasagne.layers.get_output(self.network_stale, deterministic=True))

    if initial_weights_file is not None:
        with np.load(initial_weights_file) as initial_weights:
            param_values = [initial_weights['arr_%d' % i]
                            for i in range(len(initial_weights.files))]
            lasagne.layers.set_all_param_values(self.network, param_values)
    self._update_network_stale()

    out = lasagne.layers.get_output(self.network)
    out_stale = lasagne.layers.get_output(self.network_stale)
    self.loss, self.err, __y, __q = build_loss(
        out=out,
        out_stale=out_stale,
        a0_var=a0_var,
        r0_var=r0_var,
        future_reward_indicator_var=future_reward_indicator_var,
        gamma=self.gamma)

    params = lasagne.layers.get_all_params(self.network, trainable=True)
    # Note: self.alpha (0.00025) is not used here; the learning rate is hard-coded.
    updates = lasagne.updates.rmsprop(self.loss, params,
                                      learning_rate=0.0002, rho=0.95, epsilon=1e-6)
    # TODO RMSPROP in the paper has slightly different definition (see Lua)

    print("Compiling train_fn.")
    self.train_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale],
        updates=updates)
    print("Compiling loss_fn.")
    self.loss_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        self.loss)
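# build_loss is called above but not defined in this snippet. The following is only an
# illustrative sketch, assuming the standard DQN target; the function name and the exact
# error handling (e.g. clipping) in the real build_loss may differ. It reuses the
# module-level imports this file already assumes (theano.tensor as T).
def build_loss_sketch(out, out_stale, a0_var, r0_var, future_reward_indicator_var, gamma):
    # out:       (batch, n_actions) Q-values of the online network for s0
    # out_stale: (batch, n_actions) Q-values of the stale/target network for s1
    # a0_var:    (batch, n_actions) one-hot encoding of the action taken (see a_lookup)
    # r0_var:    (batch, 1) reward
    # future_reward_indicator_var: (batch, 1), 0 where the episode ended, else 1
    y = r0_var + gamma * future_reward_indicator_var * T.max(out_stale, axis=1, keepdims=True)
    q = T.sum(out * a0_var, axis=1, keepdims=True)  # Q(s0, a0) picked out via the one-hot row
    err = y - q                                     # per-sample TD error
    loss = T.mean(err ** 2)                         # the DQN paper additionally clips the error
    return loss, err, y, q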
def __init__(self, n_actions, replay_memory, build_network, updates, screen_size,
             initial_weights_file=None):
    self.screen_width, self.screen_height = screen_size
    self.mood_q = None
    self.last_q = 0
    self.n_parameter_updates = 0
    self.alpha = 0.00025
    # update frequency ?
    # gradient momentum ? 0.95
    # squared gradient momentum ? 0.95
    # min squared gradient ? 0.01
    self.save_every_n_frames = 100000  # ~ once per hour
    self.final_exploration_frame = 1000000
    self.replay_start_size = 50000
    self.i_action = 0
    self.state = None
    self.initial_epsilon = 1
    self.final_epsilon = 0.1
    self.epsilon = self.initial_epsilon
    self.gamma = 0.99
    self.replay_memory = replay_memory
    self.log_frequency = 1
    self.minibatch_size = 32
    # self.replay_memory_size = 1000000
    self.target_network_update_frequency = 10000

    s0_var = T.tensor4("s0", dtype=theano.config.floatX)
    a0_var = T.bmatrix("a0")
    r0_var = T.wcol("r0")
    s1_var = T.tensor4("s1", dtype=theano.config.floatX)
    future_reward_indicator_var = T.bcol("future_reward_indicator")

    self.n_actions = n_actions
    self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

    self.network = build_network(
        n_actions=self.n_actions,
        input_var=T.cast(s0_var, 'float32') / np.float32(256),
        screen_size=(self.screen_height, self.screen_width))
    print("Compiling forward.")
    self.forward = theano.function(
        [s0_var], lasagne.layers.get_output(self.network, deterministic=True))

    self.network_stale = build_network(
        n_actions=self.n_actions,
        input_var=T.cast(s1_var, 'float32') / np.float32(256),
        screen_size=(self.screen_height, self.screen_width))
    print("Compiling forward_stale.")
    self.forward_stale = theano.function(
        [s1_var], lasagne.layers.get_output(self.network_stale, deterministic=True))

    self._update_network_stale()

    out = lasagne.layers.get_output(self.network)
    out_stale = lasagne.layers.get_output(self.network_stale)
    self.loss, self.err, __y, __q = build_loss(
        out=out,
        out_stale=out_stale,
        a0_var=a0_var,
        r0_var=r0_var,
        future_reward_indicator_var=future_reward_indicator_var,
        gamma=self.gamma)

    params = lasagne.layers.get_all_params(self.network, trainable=True)
    print("Compiling train_fn.")
    self.train_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale],
        updates=updates(self.loss, params))
    print("Compiling loss_fn.")
    self.loss_fn = theano.function(
        [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
        self.loss)
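# The refactored constructor above injects the network factory and the optimiser instead
# of hard-coding them. A hypothetical construction call: `Agent`, `build_cnn` and
# `ReplayMemory` are placeholder names, not necessarily what this repository defines;
# the rmsprop settings are the ones the earlier version hard-coded, and the screen size
# is the usual 84x84 DQN input, given here only as an example.
def rmsprop_updates(loss, params):
    return lasagne.updates.rmsprop(loss, params, learning_rate=0.0002, rho=0.95, epsilon=1e-6)

agent = Agent(n_actions=4,
              replay_memory=ReplayMemory(),
              build_network=build_cnn,
              updates=rmsprop_updates,
              screen_size=(84, 84))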
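# The exploration fields set in the constructor (initial_epsilon, final_epsilon,
# replay_start_size, final_exploration_frame) suggest the usual linearly annealed
# epsilon-greedy schedule. The real update lives elsewhere in the class; this is only a
# sketch of that schedule, under the assumption that annealing starts once the replay
# memory has been seeded.
def annealed_epsilon(i_action, initial_epsilon=1.0, final_epsilon=0.1,
                     replay_start_size=50000, final_exploration_frame=1000000):
    if i_action < replay_start_size:
        return initial_epsilon  # act fully at random while filling the replay memory
    # Linear decay down to final_epsilon, then held constant.
    fraction = min(1.0, float(i_action - replay_start_size) /
                   (final_exploration_frame - replay_start_size))
    return initial_epsilon + fraction * (final_epsilon - initial_epsilon)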