def generate_insurance_model(env=None, lr=.0001, memory_len=100, target_model_update=.09):
    ins_actor = Sequential()
    ins_actor.add(Flatten(input_shape=(1,) + (env.NUM_INSURANCES, 21)))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(NUM_HIDDEN_UNITS))
    ins_actor.add(Activation('relu'))
    ins_actor.add(Dense(1))
    ins_actor.add(Activation('softsign'))
    # print(ins_actor.summary())
    # print(ins_actor.layers[-1].activation)

    action_input = Input(shape=(1,), name='action_input')
    observation_input = Input(shape=(1,) + (env.NUM_INSURANCES, 21), name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(NUM_HIDDEN_UNITS)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('softsign')(x)
    ins_critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print(ins_critic.summary())

    ins_memory = SequentialMemory(limit=memory_len, window_length=1)
    # ins_random_process = OrnsteinUhlenbeckProcess(size=1, theta=.15, mu=0, sigma=.3)
    ins_random_process = GaussianWhiteNoiseProcess(mu=0, sigma=0.2, sigma_min=0.005, n_steps_annealing=5000)
    # ins_random_process = None
    ins_agent = DDPGAgent(nb_actions=1, actor=ins_actor, critic=ins_critic,
                          critic_action_input=action_input, memory=ins_memory,
                          nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                          random_process=ins_random_process, gamma=.99,
                          target_model_update=target_model_update)
    # ins_agent.processor = MultiInputProcessor(3)
    ins_agent.compile(Adam(lr=lr, clipnorm=1.), metrics=['mae'])
    print(type(ins_agent))
    return ins_agent
class DDPG(BaseAgent):
    def __init__(self, actor, critic, critic_action_input, processor, random_process, num_actions):
        # Replay memory
        memory = SequentialMemory(limit=opt.ddpg_replay_memory_size,
                                  window_length=opt.ddpg_window_length)
        self.agent = DDPGAgent(actor=actor, critic=critic,
                               critic_action_input=critic_action_input, memory=memory,
                               nb_actions=num_actions, processor=processor,
                               batch_size=opt.ddpg_batch_size,
                               nb_steps_warmup_actor=opt.ddpg_nb_steps_warmup_actor,
                               nb_steps_warmup_critic=opt.ddpg_nb_steps_warmup_critic,
                               target_model_update=opt.ddpg_target_model_update,
                               random_process=random_process,
                               train_interval=opt.ddpg_train_interval)
        self.agent.compile([keras.optimizers.Adam(lr=opt.ddpg_learning_rate_actor),
                            keras.optimizers.Adam(lr=opt.ddpg_learning_rate_critic)],
                           metrics=['mae'])

    def fit(self, env, num_steps, weights_path=None, visualize=False):
        callbacks = []
        if weights_path is not None:
            callbacks += [ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)]
        self.agent.fit(env=env, nb_steps=num_steps,
                       action_repetition=opt.ddpg_action_repetition, callbacks=callbacks,
                       log_interval=opt.log_interval, test_interval=opt.test_interval,
                       test_nb_episodes=opt.test_nb_episodes,
                       test_action_repetition=opt.ddpg_action_repetition,
                       visualize=visualize, test_visualize=visualize, verbose=2)

    def test(self, env, num_episodes, visualize=False):
        self.agent.test(env=env, nb_episodes=num_episodes,
                        action_repetition=opt.dqn_action_repetition, verbose=2,
                        visualize=visualize)

    def save(self, out_dir):
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        self.agent.load_weights(out_dir)
def test_multi_ddpg_input():
    nb_actions = 2

    actor_observation_input1 = Input(shape=(2, 3), name='actor_observation_input1')
    actor_observation_input2 = Input(shape=(2, 4), name='actor_observation_input2')
    actor = Sequential()
    x = Concatenate()([actor_observation_input1, actor_observation_input2])
    x = Flatten()(x)
    x = Dense(nb_actions)(x)
    actor = Model(inputs=[actor_observation_input1, actor_observation_input2], outputs=x)

    action_input = Input(shape=(nb_actions,), name='action_input')
    critic_observation_input1 = Input(shape=(2, 3), name='critic_observation_input1')
    critic_observation_input2 = Input(shape=(2, 4), name='critic_observation_input2')
    x = Concatenate()([critic_observation_input1, critic_observation_input2])
    x = Concatenate()([action_input, Flatten()(x)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, critic_observation_input1, critic_observation_input2], outputs=x)

    processor = MultiInputProcessor(nb_inputs=2)
    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_actions=2, nb_steps_warmup_critic=5,
                      nb_steps_warmup_actor=5, batch_size=4, processor=processor)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
def test_single_ddpg_input():
    nb_actions = 2

    actor = Sequential()
    actor.add(Flatten(input_shape=(2, 3)))
    actor.add(Dense(nb_actions))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(2, 3), name='observation_input')
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(1)(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=10, window_length=2)
    agent = DDPGAgent(actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_actions=2, nb_steps_warmup_critic=5,
                      nb_steps_warmup_actor=5, batch_size=4)
    agent.compile('sgd')
    agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
env = populate_env(gym.make("MountainCarContinuous-v0"))

# Build the actor and the critic
actor = simple_actor(env)
critic = simple_critic(env)

# Memory
memory = SimpleMemory(env=env, limit=1000000)

# Noise
random_process = OrnsteinUhlenbeckProcess(size=env.action_space.dim, theta=.15, mu=0., sigma=3.)

# Agent
agent = DDPGAgent(actor=actor, critic=critic, env=env, memory=memory,
                  random_process=random_process)
agent.compile()
agent.train(env=env, nb_episodes=1, visualize=False, verbose=2,
            nb_max_episode_steps=200, plots=False)
class DDPGLearner(BaseAgent):
    def __init__(self, name, env, grayscale, width, height):
        super(DDPGLearner, self).__init__(name=name, env=env)
        self.nb_actions = env.available_actions
        self.abs_max_reward = env.abs_max_reward
        self.mission_name = env.mission_name

        self.grayscale = grayscale
        self.width = width
        self.height = height

        self.recurrent = False  # Use LSTM
        self.batch_size = 32
        self.window_length = 4

        if tf:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            tensorflow_backend.set_session(session=sess)

        if not self.recurrent:
            self.actor, self.critic, self.action_input = Minecraft_DDPG(
                self.window_length, self.grayscale, self.width, self.height, self.nb_actions)
        else:
            self.actor, self.critic, self.action_input = Minecraft_DDPG_LSTM(
                self.window_length, self.grayscale, self.width, self.height, self.nb_actions)

        # Replay memory
        self.memory = SequentialMemory(limit=1000000, window_length=self.window_length)

        # Add random noise for exploration
        self.random_process = GaussianWhiteNoiseProcess(mu=0.0, sigma=0.5, size=self.nb_actions)
        '''
        # We can also generate exploration noise with different parameters for each action. This is
        # because we may want e.g. the agent to be more likely to explore moving forward than
        # backward. In that case, a list or tuple of random processes, one for each action, must be
        # passed to the agent. For example:

        self.random_process = []
        self.random_process.append(GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0))  # For moving
        self.random_process.append(GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0))  # For turning
        '''

        self.processor = MalmoProcessor(self.grayscale, self.window_length, self.recurrent,
                                        self.abs_max_reward)
        self.agent = DDPGAgent(actor=self.actor, critic=self.critic,
                               critic_action_input=self.action_input,
                               nb_actions=self.nb_actions, memory=self.memory,
                               batch_size=self.batch_size, processor=self.processor,
                               random_process=self.random_process, gamma=0.99,
                               nb_steps_warmup_actor=10000, nb_steps_warmup_critic=10000,
                               target_model_update=1e-3)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    def fit(self, env, nb_steps):
        weights_dir = 'weights/{}'.format(self.mission_name)
        if not os.path.exists(weights_dir):
            os.makedirs(weights_dir)
        weights_path = os.path.join(weights_dir, '{}'.format(self.name))
        callbacks = [
            ModelIntervalCheckpoint(weights_path, interval=10000, verbose=1)
        ]
        self.agent.fit(env, nb_steps, action_repetition=4, callbacks=callbacks, verbose=1,
                       log_interval=10000, test_interval=10000, test_nb_episodes=10,
                       test_action_repetition=4, test_visualize=False)

    def test(self, env, nb_episodes):
        self.agent.test(env, nb_episodes, action_repetition=4, callbacks=None, verbose=1,
                        visualize=False)

    def save(self, out_dir):
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        self.agent.load_weights(out_dir)
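# Illustrative sketch, not from the original source: the commented-out block above describes
# passing one random process per action so exploration can be biased differently per action
# dimension (a feature of that fork). With stock keras-rl, DDPGAgent takes a single
# random_process whose `size` matches nb_actions; a comparable effect can be approximated by
# sampling per-action noise yourself. The names forward_noise/turn_noise below are hypothetical.
import numpy as np
from rl.random import GaussianWhiteNoiseProcess

forward_noise = GaussianWhiteNoiseProcess(mu=1.5, sigma=1.0)  # bias exploration toward moving forward
turn_noise = GaussianWhiteNoiseProcess(mu=0.0, sigma=1.0)     # symmetric exploration for turning

def sample_exploration_noise():
    # One noise value per action dimension, each drawn from its own process.
    return np.concatenate([forward_noise.sample(), turn_noise.sample()])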
    (step_length * plan_horizon) * 5
)  # episode length / (times per action * min v)

# turn left agent
left_processor = WhiteningNormalizerProcessor()
left_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)
left_random_process = OrnsteinUhlenbeckProcess(size=lower_nb_actions,
                                               theta=RANDOM_PROCESS_THETA,
                                               mu=RANDOM_PROCESS_MU,
                                               sigma=RANDOM_PROCESS_SIGMA)
left_agent = DDPGAgent(processor=left_processor, nb_actions=lower_nb_actions,
                       actor=left_actor_model, critic=left_critic_model,
                       critic_action_input=critic_action_input, memory=left_memory,
                       nb_steps_warmup_critic=NB_STEPS_WARMUP_CRITIC,
                       nb_steps_warmup_actor=NB_STEPS_WARMUP_ACTOR,
                       random_process=left_random_process, gamma=GAMMA,
                       target_model_update=TARGET_MODEL_UPDATE,
                       batch_size=BATCH_SIZE_LOWER)
left_agent.compile(Adam(lr=OPTIMIZER_LR, clipnorm=OPTIMIZER_CLIPNORM), metrics=['mae'])

# go straight agent
straight_processor = WhiteningNormalizerProcessor()
straight_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH)
straight_random_process = OrnsteinUhlenbeckProcess(size=lower_nb_actions,
                                                   theta=RANDOM_PROCESS_THETA,
                                                   mu=RANDOM_PROCESS_MU,
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='tanh')(x)
x = Dense(32, activation='relu')(x)
x = Dense(1, activation='linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# memory = EpisodeParameterMemory(limit=1000000, window_length=1)
memory = SequentialMemory(limit=1000000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=400, nb_steps_warmup_actor=400,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(
    # Adam(lr=.001, clipnorm=1.),
    RMSprop(centered=True),
    metrics=['mae']
)

total_steps = 50000
if mode == 'train':
    if test_batch > 0:
        agent.load_weights('weights/{}{}_batch_{}_x_{}_params.h5f'.format(
            ENV_NAME, label, test_batch, total_steps))
    max_steps = 300 * ((test_batch / 2) + 1)
for experiment in my_expe.experiments(5):
    # Get the environment and populate it with useful metadata
    env = populate_env(gym.make("MountainCarContinuous-v0"))

    # Build the actor and the critic
    actor = simple_actor(env)
    critic = simple_critic(env)

    # Memory
    memory = SimpleMemory(env=env, limit=1000000)

    # Noise
    random_process = OrnsteinUhlenbeckProcess(size=env.action_space.dim, theta=.15, mu=0., sigma=3.)

    # Agent
    agent = DDPGAgent(
        actor=actor,
        critic=critic,
        env=env,
        memory=memory,
        random_process=random_process,
        experiment=experiment,
    )
    agent.compile()
    agent.train(episodes=1)
class CoopActionOtherDDPG(Agent):
    # Two agents, each able to observe the output of the other (based on the keras-rl Agent implementation).
    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1,
                 critic_action_input2, memory1, memory2, gamma=.99, batch_size=32,
                 nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, train_interval=1,
                 memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={},
                 target_model_update=.001, **kwargs):
        super(CoopActionOtherDDPG, self).__init__()
        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1,
                                gamma, batch_size, nb_steps_warmup_critic, nb_steps_warmup_actor,
                                train_interval, memory_interval, delta_range, delta_clip,
                                random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2,
                                gamma, batch_size, nb_steps_warmup_critic, nb_steps_warmup_actor,
                                train_interval, memory_interval, delta_range, delta_clip,
                                random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None,
            log_interval=10000, nb_max_episode_steps=None):
        """Trains the agent on the given environment.

        # Arguments
            env (`Env` instance): Environment that the agent interacts with. See [Env](#env) for details.
            nb_steps (integer): Number of training steps to be performed.
            action_repetition (integer): Number of times the agent repeats the same action without
                observing the environment again. Setting this to a value > 1 can be useful if a single
                action only has a very small effect on the environment.
            callbacks (list of `keras.callbacks.Callback` or `rl.callbacks.Callback` instances):
                List of callbacks to apply during training. See [callbacks](/callbacks) for details.
            verbose (integer): 0 for no logging, 1 for interval logging (compare `log_interval`),
                2 for episode logging.
            visualize (boolean): If `True`, the environment is visualized during training. However,
                this is likely going to slow down training significantly and is thus intended to be
                a debugging instrument.
            nb_max_start_steps (integer): Number of maximum steps that the agent performs at the
                beginning of each episode using `start_step_policy`. Notice that this is an upper
                limit since the exact number of steps to be performed is sampled uniformly from
                [0, max_start_steps] at the beginning of each episode.
            start_step_policy (`lambda observation: action`): The policy to follow if
                `nb_max_start_steps` > 0. If set to `None`, a random action is performed.
            log_interval (integer): If `verbose` = 1, the number of steps that are considered to be
                an interval.
            nb_max_episode_steps (integer): Number of steps per episode that the agent performs
                before automatically resetting the environment. Set to `None` if each episode should
                run (potentially indefinitely) until the environment signals a terminal state.

        # Returns
            A `keras.callbacks.History` instance that recorded the entire training process.
        """
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. '
                'Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        assert self.processor is None  # Processors removed here for simplification; not needed anyway.
        assert nb_max_start_steps == 0  # Removed here for simplification; not needed anyway.
        assert action_repetition == 1  # Removed here for simplification; not needed anyway.

        self.agent1.training = True
        self.agent2.training = True

        experience_for_plotting = deque()

        callbacks = [] if not callbacks else callbacks[:]
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation1 = observation2 = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation1 is None or observation2 is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    obs = env.reset()
                    observation1 = deepcopy(obs) + (0.,)
                    observation2 = deepcopy(obs) + (0.,)

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation1 is not None
                assert observation2 is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = np.ndarray.item(self.agent1.forward(observation1))
                action2 = np.ndarray.item(self.agent2.forward(observation2))
                action = (action1, action2)
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                accumulated_info = {}
                done = False

                callbacks.on_action_begin(action)
                # Use only one of the actions? Added actions?
                obs, r, done, info = env.step(action)
                if done:
                    raise AttributeError  # The episode was reset unexpectedly
                    # (see https://stackoverflow.com/questions/42787924/)
                observation1 = deepcopy(obs) + (info["u2_clipped"],)  # Add the other agent's action to the observation
                observation2 = deepcopy(obs) + (info["u1_clipped"],)
                for key, value in info.items():
                    if not np.isreal(value):
                        continue
                    if key not in accumulated_info:
                        accumulated_info[key] = np.zeros_like(value)
                    accumulated_info[key] += value
                callbacks.on_action_end(action)
                reward1 += info["r1"]
                reward2 += info["r2"]

                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action[0] + action[1],
                    'observation': observation1,
                    'reward': reward1 + reward2,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if len(obs) == 2:
                    experience_for_plotting.append((info["t"], obs,
                                                    (info["u1_clipped"], info["u2_clipped"]),
                                                    (0., 0.), r, (info["r1"], info["r2"])))

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation1)
                    self.agent2.forward(observation2)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation1 = None
                    observation2 = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return experience_for_plotting
class CoopDDPG(Agent):
    # Two agents that cannot observe the output of the other (based on the keras-rl Agent implementation).
    def forward(self, observation):
        raise NotImplementedError

    def backward(self, reward, terminal):
        raise NotImplementedError

    def load_weights(self, filepath):
        raise NotImplementedError

    def save_weights(self, filepath, overwrite=False):
        raise NotImplementedError

    @property
    def layers(self):
        raise NotImplementedError

    def __init__(self, nb_actions, actor1, actor2, critic1, critic2, critic_action_input1,
                 critic_action_input2, memory1, memory2, gamma=.99, batch_size=32,
                 nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, train_interval=1,
                 memory_interval=1, delta_range=None, delta_clip=np.inf,
                 random_process1=None, random_process2=None, custom_model_objects={},
                 target_model_update=.001, **kwargs):
        super(CoopDDPG, self).__init__()
        self.agent1 = DDPGAgent(nb_actions, actor1, critic1, critic_action_input1, memory1,
                                gamma, batch_size, nb_steps_warmup_critic, nb_steps_warmup_actor,
                                train_interval, memory_interval, delta_range, delta_clip,
                                random_process1, custom_model_objects, target_model_update,
                                **kwargs)
        self.agent2 = DDPGAgent(nb_actions, actor2, critic2, critic_action_input2, memory2,
                                gamma, batch_size, nb_steps_warmup_critic, nb_steps_warmup_actor,
                                train_interval, memory_interval, delta_range, delta_clip,
                                random_process2, custom_model_objects, target_model_update,
                                **kwargs)

    def compile(self, optimizer, metrics=[]):
        self.agent1.compile(clone_optimizer(optimizer), deepcopy(metrics))
        self.agent2.compile(clone_optimizer(optimizer), deepcopy(metrics))

    def fit(self, env, nb_steps, action_repetition=1, callbacks=None, verbose=1,
            visualize=False, nb_max_start_steps=0, start_step_policy=None,
            log_interval=10000, nb_max_episode_steps=None):
        if not (self.agent1.compiled and self.agent2.compiled):
            raise RuntimeError(
                'You tried to fit your agent but it hasn\'t been compiled yet. '
                'Please call `compile()` before `fit()`.')
        if action_repetition < 1:
            raise ValueError('action_repetition must be >= 1, is {}'.format(action_repetition))

        self.agent1.training = True
        self.agent2.training = True

        callbacks = [] if not callbacks else callbacks[:]
        if verbose == 1:
            callbacks += [TrainIntervalLogger(interval=log_interval)]
        elif verbose > 1:
            callbacks += [TrainEpisodeLogger()]
        if visualize:
            callbacks += [Visualizer()]
        history = History()
        callbacks += [history]
        callbacks = CallbackList(callbacks)
        if hasattr(callbacks, 'set_model'):
            callbacks.set_model(self)
        else:
            callbacks._set_model(self)
        callbacks._set_env(env)
        params = {
            'nb_steps': nb_steps,
        }
        if hasattr(callbacks, 'set_params'):
            callbacks.set_params(params)
        else:
            callbacks._set_params(params)
        self.agent1._on_train_begin()
        self.agent2._on_train_begin()
        callbacks.on_train_begin()

        episode = np.int16(0)
        self.agent1.step = np.int16(0)
        self.agent2.step = np.int16(0)
        observation = None
        episode_reward1 = None
        episode_reward2 = None
        episode_step = None
        did_abort = False
        try:
            while self.agent1.step < nb_steps:  # not individual for now
                if observation is None:  # start of a new episode
                    callbacks.on_episode_begin(episode)
                    episode_step = np.int16(0)
                    episode_reward1 = np.float32(0)
                    episode_reward2 = np.float32(0)

                    # Obtain the initial observation by resetting the environment.
                    self.agent1.reset_states()
                    self.agent2.reset_states()
                    observation = deepcopy(env.reset())
                    if self.agent1.processor is not None:  # not individual for now
                        observation = self.agent1.processor.process_observation(observation)
                    assert observation is not None

                    # Perform random starts at the beginning of the episode and do not record them
                    # into the experience. This slightly changes the start position between games.
                    nb_random_start_steps = 0 if nb_max_start_steps == 0 else np.random.randint(nb_max_start_steps)
                    for _ in range(nb_random_start_steps):
                        if start_step_policy is None:
                            action = env.action_space.sample()
                        else:
                            action = start_step_policy(observation)
                        if self.agent1.processor is not None:  # not individual for now; the action is not from an agent anyway
                            action = self.agent1.processor.process_action(action)
                        callbacks.on_action_begin(action)
                        observation, reward, done, info = env.step(action)
                        observation = deepcopy(observation)
                        if self.agent1.processor is not None:
                            observation, reward, done, info = self.agent1.processor.process_step(
                                observation, reward, done, info)
                        callbacks.on_action_end(action)
                        if done:
                            warnings.warn(
                                'Env ended before {} random steps could be performed at the start. '
                                'You should probably lower the `nb_max_start_steps` parameter.'.format(
                                    nb_random_start_steps))
                            observation = deepcopy(env.reset())
                            if self.agent1.processor is not None:
                                observation = self.agent1.processor.process_observation(observation)
                            break

                # At this point, we expect to be fully initialized.
                assert episode_reward1 is not None
                assert episode_reward2 is not None
                assert episode_step is not None
                assert observation is not None

                # Run a single step.
                callbacks.on_step_begin(episode_step)
                # This is where all of the work happens. We first perceive and compute the action
                # (forward step) and then use the reward to improve (backward step).
                action1 = self.agent1.forward(observation)
                action2 = self.agent2.forward(observation)
                if self.agent1.processor is not None:
                    action1 = self.agent1.processor.process_action(action1)
                if self.agent2.processor is not None:
                    action2 = self.agent2.processor.process_action(action2)
                action = (np.ndarray.item(action1), np.ndarray.item(action2))
                reward1 = np.float32(0)
                reward2 = np.float32(0)
                reward = np.float32(0)
                accumulated_info = {}
                done = False
                for _ in range(action_repetition):
                    callbacks.on_action_begin(action)
                    # Use only one of the actions? Added actions?
                    observation, r, done, info = env.step(action)
                    observation = deepcopy(observation)
                    if self.agent1.processor is not None:
                        observation, r, done, info = self.agent1.processor.process_step(
                            observation, r, done, info)
                    for key, value in info.items():
                        if not np.isreal(value):
                            continue
                        if key not in accumulated_info:
                            accumulated_info[key] = np.zeros_like(value)
                        accumulated_info[key] += value
                    callbacks.on_action_end(action)
                    reward1 += info["r1"]
                    reward2 += info["r2"]
                    reward += info["r1"] + info["r2"]
                    if done:
                        break
                if nb_max_episode_steps and episode_step >= nb_max_episode_steps - 1:
                    # Force a terminal state.
                    done = True
                metrics1 = self.agent1.backward(reward1, terminal=done)
                metrics2 = self.agent2.backward(reward2, terminal=done)
                episode_reward1 += reward1
                episode_reward2 += reward2

                step_logs = {
                    'action': action,
                    'observation': observation,
                    'reward': reward,
                    'metrics': metrics1,  # not individual for now
                    'episode': episode,
                    'info': accumulated_info,
                }
                callbacks.on_step_end(episode_step, step_logs)
                episode_step += 1
                self.agent1.step += 1
                self.agent2.step += 1

                if done:
                    # We are in a terminal state but the agent hasn't yet seen it. We therefore
                    # perform one more forward-backward call and simply ignore the action before
                    # resetting the environment. We need to pass in `terminal=False` here since
                    # the *next* state, that is the state of the newly reset environment, is
                    # always non-terminal by convention.
                    self.agent1.forward(observation)
                    self.agent2.forward(observation)
                    self.agent1.backward(0., terminal=False)
                    self.agent2.backward(0., terminal=False)

                    # This episode is finished, report and reset.
                    episode_logs = {
                        'episode_reward': episode_reward1 + episode_reward2,
                        'nb_episode_steps': episode_step,
                        'nb_steps': self.agent1.step,  # not individual for now
                    }
                    callbacks.on_episode_end(episode, episode_logs)

                    episode += 1
                    observation = None
                    episode_step = None
                    episode_reward1 = None
                    episode_reward2 = None
        except KeyboardInterrupt:
            # We catch keyboard interrupts here so that training can be safely aborted.
            # This is so common that we've built this right into this function, which ensures that
            # the `on_train_end` method is properly called.
            did_abort = True
        callbacks.on_train_end(logs={'did_abort': did_abort})
        self.agent1._on_train_end()
        self.agent2._on_train_end()

        return history
x = Activation('tanh')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())
####################### CRITIC ------- END ######################################

# Create the DDPG agent from the models defined above; set the target-network update rate and
# the discount factor. gamma = 0.9 is the discount factor applied to future rewards.
ddpg = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                 critic_action_input=action_input, memory=memory, batch_size=32,
                 nb_steps_warmup_critic=5000, nb_steps_warmup_actor=5000,
                 random_process=random_process, gamma=0.9, target_model_update=5e-3)

# .compile() configures the model with losses and metrics.
# The learning rates of the actor and the critic are passed as arguments below, respectively.
ddpg.compile([Adam(lr=5e-4), Adam(lr=5e-3)], metrics=['mae'])

# Show the metrics of the model that can be analysed in graphs.
print(ddpg.metrics_names)

# .fit() is used to train the DDPG model.
# 3000 max steps specified by Christos Kouppas
x = Concatenate()([action_input, flattened_observation])
x = Dense(1, activation='linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

memory = SequentialMemory(limit=1000, window_length=WINDOW_LENGTH)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.fit(train_env, nb_steps=1000, visualize=False, verbose=2, nb_max_episode_steps=100)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format("abc"), overwrite=True)
def train_agent(env, args):
    from src.Agents import create_ddpg_actor, create_ddpg_critic, ddpg_controls, EnvironmentWrapper
    from keras.optimizers import Adam
    from rl.agents.ddpg import DDPGAgent
    from rl.policy import EpsGreedyQPolicy
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    env = EnvironmentWrapper(ddpg_controls, env)
    nb_actions = 3
    actor = create_ddpg_actor(env)
    critic, action_input = create_ddpg_critic(env)

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer
    # and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory,
                      nb_steps_warmup_critic=2000, nb_steps_warmup_actor=2000,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile(Adam(lr=0.5e-2, clipnorm=1.), metrics=['mae'])
    try:
        agent.load_weights(args.ai_in)
    except OSError:
        pass

    # Okay, now it's time to learn something! Visualizing training slows it down quite a lot,
    # so it is disabled here. You can always safely abort training prematurely with Ctrl + C.
    agent.fit(env, nb_steps=20000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    agent.save_weights(args.ai_out, overwrite=True)

    # Finally, evaluate the algorithm for one episode.
    agent.test(env, nb_episodes=1, visualize=False)
env = populate_env(gym.make("MountainCarContinuous-v0"))

# Build the actor and the critic
actor = simple_actor(env)
critic = simple_critic(env)

# Memory
memory = SimpleMemory(env=env, limit=1000000)

# Noise
random_process = OrnsteinUhlenbeckProcess(size=env.action_space.dim, theta=.15, mu=0., sigma=3.)

# Agent
agent = DDPGAgent(experiment=experiment, actor=actor, critic=critic, env=env,
                  memory=memory, random_process=random_process)
agent.compile()
agent.train(env=env, episodes=10, render=True, verbosity=2,
            nb_max_episode_steps=1000, plots=False)
# agent
# With a model, memory, and policy defined, we're now ready to create an agent and pass it
# those objects. Keras-RL provides an agent class called DDPGAgent that we can use for this,
# as shown in the following code.
#
# nb_steps_warmup: Determines how long we wait before we start doing experience replay, which,
# if you recall, is when we actually start training the network. This lets us build up enough
# experience to form a proper minibatch. If you choose a value for this parameter that is
# smaller than your batch size, Keras-RL will sample with replacement.
#
# target_model_update: The Q function is recursive, and when the agent updates its network for
# Q(s, a), that update also impacts the prediction it will make for Q(s', a). This can make for
# a very unstable network. The way most deep Q network implementations address this limitation
# is by using a target network, which is a copy of the network that isn't trained, but rather
# replaced with a fresh copy every so often. The target_model_update parameter controls how
# often this happens.
ddpg = DDPGAgent(nb_actions=num_actions, actor=model, critic=critic,
                 critic_action_input=critic_action_input, memory=memory,
                 nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                 random_process=random_process, gamma=.99, target_model_update=1e-3)
ddpg.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mae'])

ddpg.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)
ddpg.test(env, nb_episodes=5, visualize=True)
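# Illustrative sketch, not from the original source: in keras-rl, a target_model_update value
# below 1 is treated as a soft ("Polyak") update coefficient applied every training step, while
# an integer value >= 1 triggers a hard copy of the weights every that-many steps. The toy numpy
# function below only illustrates the soft-update rule; it is not keras-rl code.
import numpy as np

def soft_update(target_weights, online_weights, tau=1e-3):
    # target <- tau * online + (1 - tau) * target, applied element-wise to each weight array.
    return [tau * w + (1.0 - tau) * tw for w, tw in zip(online_weights, target_weights)]

online = [np.ones((2, 2))]    # stand-in for the online network's weights
target = [np.zeros((2, 2))]   # stand-in for the target network's weights
target = soft_update(target, online, tau=1e-3)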