def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100)
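# A minimal import header for snippets like the one above -- a sketch only, assuming
# keras-rl 0.4.x on standalone Keras (not an exact copy of any one file's imports):
import random

import gym
import numpy as np
from keras.layers import Activation, Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam
from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess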
                                          theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, batch_size=64,
                  random_process=random_process, gamma=.98, target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(lr=5e-4), Adam(lr=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
save_data_path_local = ENV_NAME + '.json'
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1,
          save_data_path=save_data_path_local, file_interval=10000)

# After training is done, we save the final weights.
# agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
plot_af(file_path=ENV_NAME + '.json', save_file_name=ENV_NAME + '.png')
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# print(critic.summary())

# Set up the agent for training
memory = SequentialMemory(limit=1000000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1., batch_size=128)
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile([Nadam(lr=.0001, clipnorm=1.), Nadam(lr=.0001, clipnorm=1.)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    # ------------------------------------------------------------
    weights_filename = 'model/ddpg_final.h5f'
    checkpoint_weights_filename = 'model/ddpg_{step}.h5f'
    # log_filename = 'model/ddpg_log.json'.format('opensim')
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)]
    # callbacks += [FileLogger(log_filename, interval=10000)]
    # ------------------------------------------------------------
    agent.load_weights(args.model)
    agent.fit(env, callbacks=callbacks, nb_steps=nallsteps, visualize=False, verbose=1,
              nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
def create_agent(nb_actions, observation_shape):
    """Build the DDPG agent."""
    import os
    import sys
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    keras_rl = os.path.join(os.path.dirname(cur_dir), 'keras-rl')
    sys.path.insert(0, keras_rl)
    from rl.agents import DDPGAgent
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    # Build the actor
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + observation_shape))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('tanh'))
    print(actor.summary())

    # Build the critic
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + observation_shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    # Compile the agent
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.6, mu=0, sigma=0.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=10, nb_steps_warmup_actor=10, batch_size=64,
                      random_process=random_process, gamma=.999, target_model_update=1e-3)
    agent.compile([Adam(lr=.001, clipnorm=1.), Adam(lr=.001, clipnorm=1.)], metrics=['mae'])
    return agent
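# A usage sketch for create_agent() -- hypothetical, assuming any gym env with a Box action space:
env = gym.make('Pendulum-v0')
agent = create_agent(env.action_space.shape[0], env.observation_space.shape)
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)
agent.save_weights('ddpg_weights.h5f', overwrite=True)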
def get_agent(env) -> DDPGAgent:
    """
    Generate a `DDPGAgent` instance that represents an agent learned using
    Deep Deterministic Policy Gradient. The agent has 2 neural networks:
    an actor network and a critic network.

    Args:
    * `env`: An OpenAI `gym.Env` instance

    Returns:
    * a `DDPGAgent` instance.
    """
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    range_action_input = 0.5 * (env.action_space.high - env.action_space.low)
    constantBias = 1
    lowb = env.action_space.low

    # actor = Flatten(input_shape=(1,) + env.observation_space.shape)(observation_input)
    y = Flatten()(observation_input)
    y = Dense(16)(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = Dense(16)(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)

    pht = Dense(1)(y)
    pht = BatchNormalization()(pht)
    pht = Activation('tanh')(pht)
    pht = Lambda(lambda a: (a + K.constant(constantBias)) * K.constant(range_action_input[0]) + K.constant(lowb[0]))(pht)

    rht = Dense(1)(y)
    rht = BatchNormalization()(rht)
    rht = Activation('tanh')(rht)
    rht = Lambda(lambda a: (a + K.constant(constantBias)) * K.constant(range_action_input[1]) + K.constant(lowb[1]))(rht)

    axn = Concatenate()([pht, rht])
    actor = Model(inputs=observation_input, outputs=axn)

    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.5, size=nb_actions)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                      gamma=.99, target_model_update=1e-3, random_process=random_process)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
    return agent
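# Why the Lambda rescaling above works: the tanh output a lies in [-1, 1], so with
# range = 0.5 * (high - low) the map (a + 1) * range + low spans exactly [low, high].
# A quick standalone check with hypothetical bounds:
low, high = -2.0, 2.0
rng = 0.5 * (high - low)
for a in (-1.0, 0.0, 1.0):
    print((a + 1) * rng + low)  # -> -2.0, 0.0, 2.0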
                                          size=2)

# Create the agent
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, random_process=random_process,
                  nb_steps_warmup_actor=2048, nb_steps_warmup_critic=1024,
                  target_model_update=1000, gamma=0.9, batch_size=128, memory_interval=2)
agent.compile([Adam(lr=3e-5), Adam(lr=3e-3)])

# Start training for 75000 simulation steps
agent.fit(
    env,
    nb_steps=75000,
    nb_max_start_steps=0,
    nb_max_episode_steps=10000,
    visualize=True,
    action_repetition=1,
    verbose=2,
    log_interval=10000,
    callbacks=[],
)

# Test the agent
hist = agent.test(env,
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
# agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
agent.compile('adam', metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=2, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
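# What OrnsteinUhlenbeckProcess adds to the actor's action at each step -- a standalone
# numpy sketch of the mean-reverting update (keras-rl's process defaults dt to 1e-2):
def ou_step(x_prev, theta=0.15, mu=0.0, sigma=0.3, dt=1e-2, size=1):
    # x_t = x_{t-1} + theta * (mu - x_{t-1}) * dt + sigma * sqrt(dt) * N(0, 1)
    return x_prev + theta * (mu - x_prev) * dt + sigma * np.sqrt(dt) * np.random.normal(size=size)

noise = np.zeros(1)
for _ in range(5):
    noise = ou_step(noise)
    print(noise)  # temporally correlated exploration noise, reverting towards mu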
memory = SequentialMemory(limit=10_000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.0, sigma=0.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=nb_steps_warmup, nb_steps_warmup_actor=nb_steps_warmup,
                  random_process=random_process, gamma=0.9, target_model_update=1e-3)
agent.compile(SGD(lr=1e-5, clipvalue=0.001), metrics=['mae'])

callbacks = [
    ModelIntervalCheckpoint(weights_name + '_{step}.h5f', interval=10_000),
    TrainEpisodeLogger(),
    TensorBoard()
]
agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1, callbacks=callbacks)
agent.save_weights(weights_name + '_final.h5f', overwrite=True)
# agent.test(env, nb_episodes=1, visualize=False)
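# A hypothetical follow-up: restore the final (or any checkpointed) weights and evaluate.
agent.load_weights(weights_name + '_final.h5f')
agent.test(env, nb_episodes=5, visualize=False)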
def main(args):
    sigma, learning_rate, file_prefix = args
    env = ModifiedArmEnv(visualize=False)
    input_shape = (1,) + env.observation_space.shape
    nb_actions = env.action_space.shape[0]

    # Create actor and critic networks
    actor = Sequential()
    actor.add(Flatten(input_shape=input_shape))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('sigmoid'))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=input_shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    # Set up the agent for training
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=sigma, dt=env.stepsize, size=env.noutput)
    agent = DDPGAgent(
        nb_actions=nb_actions,
        actor=actor,
        critic=critic,
        critic_action_input=action_input,
        memory=memory,
        nb_steps_warmup_critic=100,
        nb_steps_warmup_actor=100,
        random_process=random_process,
        gamma=.99,
        target_model_update=1e-3,
        delta_clip=1.,
    )
    agent.compile(Adam(lr=learning_rate, clipnorm=1.), metrics=['mae'])

    # Train the model
    training_history = RewardsLogger()
    env.reset()
    agent.fit(
        env,
        nb_steps=100000,
        visualize=False,
        verbose=1,
        nb_max_episode_steps=200,
        log_interval=10000,
        callbacks=[training_history],
    )

    # Save weights and training history
    agent.save_weights(file_prefix + '_weights.h5f', overwrite=True)
    pickledump(training_history, file_prefix + '_training_history.pkl')

    # Set test parameters
    test_nb_episodes = 10
    test_nb_max_episode_steps = 1000

    # Run test
    test_history = ObservationsLogger()
    env.reset()
    agent.test(
        env,
        nb_episodes=test_nb_episodes,
        visualize=False,
        nb_max_episode_steps=test_nb_max_episode_steps,
        callbacks=[test_history],
    )

    # Save test history
    pickledump(test_history, file_prefix + '_test_history.pkl')
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=nb_actions)
agent = DDPGAgent(nb_actions=nb_actions, actor=modelA, critic=modelC, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  batch_size=64)  # , processor=CircleProcessor
agent.compile(Adam(lr=args.learnRate, clipnorm=1.), metrics=['mae'])

if args.reload:
    agent.load_weights(args.reload)

import rl.callbacks

class EpisodeLogger(rl.callbacks.Callback):
    def __init__(self, size_inX, size_outY, size_cells):
        self.ofile = open(os.path.join(args.saveFolder, "log.csv"), "w")
        self.cfile = csv.writer(self.ofile)
        self.cfile.writerow(["episode", "reward"]
                            + ["inX%d" % i for i in range(size_inX)]
                            + ["outY%d" % i for i in range(size_outY)]
                            + ["cell%d" % i for i in range(size_cells)])
    def create(self):
        """Create the agent"""
        assert len(self.agent_helper.env.action_space.shape) == 1
        nb_actions = int(self.agent_helper.env.action_space.shape[0])
        # set #nodes and #sfs based on env limits. used for splitting the output layer and action processor
        num_nodes = self.agent_helper.env.env_limits.MAX_NODE_COUNT
        num_sfcs = self.agent_helper.env.env_limits.MAX_SF_CHAIN_COUNT
        num_sfs = self.agent_helper.env.env_limits.MAX_SERVICE_FUNCTION_COUNT

        # create the actor NN
        observation_input = Input(shape=(1,) + self.agent_helper.env.observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        prev_layer = flattened_observation
        # create hidden layers according to config
        for num_hidden in self.agent_helper.config['actor_hidden_layer_nodes']:
            hidden_layer = Dense(num_hidden,
                                 activation=self.agent_helper.config['actor_hidden_layer_activation'])(prev_layer)
            prev_layer = hidden_layer
        # split output layer into separate parts for each node and SF and apply softmax individually
        out_parts = [Dense(num_nodes, activation='softmax')(prev_layer) for _ in range(num_nodes * num_sfs)]
        out = Concatenate()(out_parts)
        # normal output layer
        # out = Dense(nb_actions, activation='tanh')(prev_layer)
        actor = Model(inputs=observation_input, outputs=out)

        # create the critic NN
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + self.agent_helper.env.observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        prev_layer = Concatenate()([action_input, flattened_observation])
        # create hidden layers according to config
        for num_hidden in self.agent_helper.config['critic_hidden_layer_nodes']:
            hidden_layer = Dense(num_hidden,
                                 activation=self.agent_helper.config['critic_hidden_layer_activation'])(prev_layer)
            prev_layer = hidden_layer
        out_critic = Dense(1, activation='linear')(prev_layer)
        critic = Model(inputs=[action_input, observation_input], outputs=out_critic)

        # write NN summary to string
        actor_summary_lst = []
        actor.summary(print_fn=actor_summary_lst.append)
        actor_summary = "".join(actor_summary_lst)
        actor.summary(print_fn=logger.debug)
        # write NN summary to string
        critic_summary_lst = []
        critic.summary(print_fn=critic_summary_lst.append)
        critic_summary = "".join(critic_summary_lst)
        critic.summary(print_fn=logger.debug)

        # Assigning the config directly was causing aliasing issues (e.g. 'nb_observation'
        # leaked back into the config), hence the copy.
        self.agent_helper.result.agent_config = copy.copy(self.agent_helper.config)
        # Set agent params in result file
        self.agent_helper.result.agent_config['nb_observation'] = self.agent_helper.env.observation_space.shape[0]
        self.agent_helper.result.agent_config['nb_actions'] = nb_actions
        self.agent_helper.result.agent_config['actor'] = {}
        self.agent_helper.result.agent_config['actor']['summary'] = actor_summary
        self.agent_helper.result.agent_config['critic'] = {}
        self.agent_helper.result.agent_config['critic']['summary'] = critic_summary
        self.agent_helper.result.agent_config['metrics'] = ['mae']

        # creating the Agent
        processor = ActionScheduleProcessor(num_nodes=num_nodes, num_sfcs=num_sfcs, num_sfs=num_sfs)
        memory = SequentialMemory(limit=self.agent_helper.config['mem_limit'],
                                  window_length=self.agent_helper.config['mem_window_length'])
        random_process = GaussianWhiteNoiseProcess(sigma=self.agent_helper.config['rand_sigma'],
                                                   mu=self.agent_helper.config['rand_mu'],
                                                   size=nb_actions)
        agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                          critic_action_input=action_input, memory=memory,
                          nb_steps_warmup_critic=self.agent_helper.config['nb_steps_warmup_critic'],
                          nb_steps_warmup_actor=self.agent_helper.config['nb_steps_warmup_actor'],
                          random_process=random_process,
                          gamma=self.agent_helper.config['gamma'],
                          target_model_update=self.agent_helper.config['target_model_update'],
                          processor=processor, batch_size=64)
        agent.compile(Adam(lr=self.agent_helper.config['learning_rate'],
                           decay=self.agent_helper.config['learning_rate_decay']),
                      metrics=['mae'])
        self.agent = agent
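# A hypothetical config matching the keys read above (values are illustrative only):
config = {
    'actor_hidden_layer_nodes': [64, 64],
    'actor_hidden_layer_activation': 'relu',
    'critic_hidden_layer_nodes': [64, 64],
    'critic_hidden_layer_activation': 'relu',
    'mem_limit': 100000,
    'mem_window_length': 1,
    'rand_sigma': 0.3,
    'rand_mu': 0.0,
    'nb_steps_warmup_critic': 100,
    'nb_steps_warmup_actor': 100,
    'gamma': 0.99,
    'target_model_update': 1e-3,
    'learning_rate': 1e-3,
    'learning_rate_decay': 0.0,
}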
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                  random_process=random_process, gamma=1, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1., decay=0.9999), metrics=['mae'])

#%%
'''
the test before warm_up
'''
history = agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=1000)
# sio.savemat('test-before-train-' + ENV_NAME + '-' + nowtime + '.mat', history.history)
before = history.history['episode_reward']

'''
warm_up
'''
history = agent.warm_fit(env, policy, policy_list, nb_steps=3e6, visualize=False,
                         log_interval=1000, verbose=2, nb_max_episode_steps=2000)
sio.savemat('warm-up-' + ENV_NAME + '-' + nowtime + '.mat', history.history)
agent.save_weights('ddpg_{}_weights_after_warm_start.h5f'.format(ENV_NAME), overwrite=True)
'''
def main(layers1=[200], layers2=[200], leaky_alpha=0.10, ENV_NAME='EnvPong', show=False,
         wall_reward=-0.1, touch_reward=0.3, n_steps=80000, n_alternances=10, L_R=0.0001,
         only_test=False, opp_aware=[1, 1], myopie=[0.00, 0.00], ball_speed=1.0,
         weights1_name='', weights2_name=''):
    ENV_NAME = ENV_NAME
    conf_name = "{}_layers1={}__layers2={}__leaky={}__lr={}__opp={}__myopia={}__speed={}".format(
        ENV_NAME, layers1, layers2, leaky_alpha, L_R, opp_aware, myopie, ball_speed)
    # gym.undo_logger_setup()

    # Get the environment and extract the number of actions.
    if ENV_NAME == 'Env2D':
        env = Game2D(2.)
    elif ENV_NAME == 'Env2DSoloSpin':
        env = Game2DSolo(2., spinRacket=True)
    elif ENV_NAME == 'Env3DSolo':
        env = Game3DSolo(2., 9.8, 0.5, 7., 3.)
    elif ENV_NAME == 'EnvPong':
        env = Pong(PongPlayer(None, opp_aware=(opp_aware[0] == 1)),
                   PongPlayer(None, opp_aware=(opp_aware[1] == 1)))
    np.random.seed(123)
    # env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space_1.shape))
    # actor.add(keras.layers.normalization.BatchNormalization())
    for size in layers1:
        actor.add(Dense(size, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None)))
        # actor.add(keras.layers.core.Dropout(0.2))
        actor.add(LeakyReLU(leaky_alpha))
        # actor.add(keras.layers.normalization.BatchNormalization())
    actor.add(Dense(nb_actions,
                    kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None),
                    bias_regularizer=regularizers.l2(0.01)))
    # actor.add(keras.layers.core.Dropout(0.2))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space_1.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    # x = keras.layers.normalization.BatchNormalization()(x)
    for size in layers1:
        x = Dense(size)(x)
        # x = keras.layers.core.Dropout(0.2)(x)
        x = LeakyReLU(alpha=leaky_alpha)(x)
        # x = keras.layers.normalization.BatchNormalization()(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    actor2 = Sequential()
    actor2.add(Flatten(input_shape=(1,) + env.observation_space_2.shape))
    # actor2.add(keras.layers.normalization.BatchNormalization())
    for size in layers2:
        actor2.add(Dense(size, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None)))
        # actor2.add(keras.layers.core.Dropout(0.2))
        actor2.add(LeakyReLU(alpha=leaky_alpha))
    actor2.add(Dense(nb_actions,
                     kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None),
                     bias_regularizer=regularizers.l2(0.01)))
    # actor2.add(keras.layers.core.Dropout(0.2))
    actor2.add(Activation('linear'))
    print(actor2.summary())

    action_input2 = Input(shape=(nb_actions,), name='action_input')
    observation_input2 = Input(shape=(1,) + env.observation_space_2.shape, name='observation_input')
    flattened_observation2 = Flatten()(observation_input2)
    x2 = Concatenate()([action_input2, flattened_observation2])
    # x2 = keras.layers.normalization.BatchNormalization()(x2)
    for size in layers2:
        x2 = Dense(size)(x2)
        # x2 = keras.layers.core.Dropout(0.2)(x2)
        x2 = LeakyReLU(alpha=leaky_alpha)(x2)
    x2 = Dense(1)(x2)
    x2 = Activation('linear')(x2)
    critic2 = Model(inputs=[action_input2, observation_input2], outputs=x2)
    print(critic2.summary())
    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory1 = SequentialMemory(limit=50000, window_length=1)
    if opp_aware[0] != opp_aware[1]:
        memory2 = SequentialMemory(limit=50000, window_length=1)
    else:
        memory2 = memory1
    random_process1 = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.1, mu=0., sigma=.15,
                                               sigma_min=0., n_steps_annealing=n_steps / 4)
    # Explores less at the end?
    random_process2 = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.1, mu=0., sigma=.15,
                                               sigma_min=0., n_steps_annealing=4 * n_steps)
    agent1 = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                       critic_action_input=action_input, memory=memory1,
                       nb_steps_warmup_critic=5000, nb_steps_warmup_actor=5000,
                       random_process=random_process1, gamma=.99, target_model_update=1e-3,
                       batch_size=100)
    agent2 = DDPGAgent(nb_actions=nb_actions, actor=actor2, critic=critic2,
                       critic_action_input=action_input2, memory=memory2,
                       nb_steps_warmup_critic=5000, nb_steps_warmup_actor=5000,
                       random_process=random_process2, gamma=.99, target_model_update=1e-3,
                       batch_size=100)
    # agent.compile(Adam(lr=L_R, clipnorm=1., clipvalue=0.5), metrics=['mae'])
    agent1.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])
    agent2.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])

    player1 = PongPlayer(agent1, myopie=myopie[0], opp_aware=(opp_aware[0] == 1))
    player2 = PongPlayer(agent2, myopie=myopie[1], opp_aware=(opp_aware[1] == 1))

    # Grid -4
    # Add -1 when lost
    # CEM method
    directory_log = "logs/ddpg/{}".format(conf_name)
    directory_weights = "weights/ddpg/{}".format(conf_name)
    if not os.path.exists(directory_log):
        os.makedirs(directory_log)
    if not os.path.exists(directory_weights):
        os.makedirs(directory_weights)

    if only_test:
        '''if weights1_name == '':
            weights1_name = "{}/player1_final".format(directory_weights)
        if weights2_name == '':
            weights2_name = "{}/player2_final".format(directory_weights)
        # if os.path.isfile(weights1_name) and os.path.isfile(weights2_name):
        agent1.load_weights(weights1_name)
        agent2.load_weights(weights2_name)'''
        agent1.load_weights("{}/player1_{}".format(directory_weights, "final"))
        agent2.load_weights("{}/player1_{}".format(directory_weights, "final"))
        env = makeEnv(player1, player2, ENV_NAME, ball_speed=ball_speed)
        for i in range(10):
            playPong(env)
        confrontPlayers(env)
        plotStrategy(env)
    else:
        for i in range(n_alternances):
            print("Alternation {} \n".format(i))

            def learning_rate_schedule(epoch):
                return L_R

            if ENV_NAME == 'Env2D':
                env = Game2D(agent2, wall_reward=wall_reward, touch_reward=touch_reward)
            elif ENV_NAME == 'EnvPong':
                env = Pong(player1, player2, wall_reward=wall_reward, touch_reward=touch_reward,
                           ball_speed=ball_speed)
            agent1.fit(env, nb_steps=n_steps, visualize=False, verbose=1,
                       until_score=True, score_to_reach=0.5, last_episodes=500,
                       nb_max_episode_steps=None,
                       callbacks=[FileLogger("{}/player1_{}.h5f".format(directory_log, i)),
                                  keras.callbacks.LearningRateScheduler(learning_rate_schedule)])
            agent1.test(env, nb_episodes=100, visualize=False, nb_max_episode_steps=500, verbose=1)
            agent1.save_weights("{}/player1_{}".format(directory_weights, i), overwrite=True)
            agent1.memory = SequentialMemory(limit=500000, window_length=1)
            wall_reward = wall_reward * 0.8
            touch_reward = touch_reward * 0.8
            agent2.load_weights("{}/player1_{}".format(directory_weights, i))
        print("End of {}".format(conf_name))
        env = Pong(player1, player2, wall_reward=wall_reward, touch_reward=touch_reward,
                   ball_speed=ball_speed)
        # agent1.fit(env, nb_steps=150000, visualize=False, verbose=2, nb_max_episode_steps=None,
        #            callbacks=[FileLogger("logs/ddpg/{}_weights_steps_leaky_reg_bias_drop_lr{}.h5f".format(ENV_NAME, L_R), interval=100)])
        agent1.save_weights("{}/player1_final".format(directory_weights), overwrite=True)
        agent2.save_weights("{}/player2_final".format(directory_weights), overwrite=True)
        agent1.test(env, nb_episodes=15, visualize=False, nb_max_episode_steps=500, verbose=2)

    if show == True:
        if ENV_NAME == 'Env2D':
            for i in range(10):
                play2D(player1=agent1, player2=agent1)
        elif ENV_NAME == 'EnvPong':
            for i in range(10):
                playPong(left=agent1, right=agent2)
class RLAgent:
    def __init__(self):
        ENV_NAME = 'drone'
        # Get the environment and extract the number of actions.
        # env = gym.make(ENV_NAME)
        env = drone_sim()
        np.random.seed(123)
        env.seed(123)
        assert len(env.action_space.shape) == 1
        nb_actions = env.action_space.shape[0]

        # Next, we build a very simple model.
        self.actor = Sequential()
        self.actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(Dense(nb_actions, activation='tanh', kernel_initializer=RandomUniform()))
        self.actor.add(Lambda(lambda x: x * 60.0))
        print(self.actor.summary())

        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Concatenate()([action_input, flattened_observation])
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
        self.agent = DDPGAgent(nb_actions=nb_actions, actor=self.actor, critic=critic,
                               critic_action_input=action_input, memory=memory,
                               nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                               random_process=random_process, gamma=.99, target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_range=(-100., 100.))
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
# agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
agent.compile([RMSprop(lr=.001), RMSprop(lr=.001)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=True, verbose=1,
              nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.output, overwrite=True)
if not args.train:
    agent.load_weights(args.output)
    # Finally, evaluate our algorithm for 5 episodes.
    if args.env != "Arm":
        agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=HYP.THETA, mu=HYP.MU, sigma=HYP.SIGMA)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, batch_size=HYP.BATCH_SIZE,
                  nb_steps_warmup_actor=HYP.WARMUP_ACTOR, nb_steps_warmup_critic=HYP.WARMUP_CRITIC,
                  random_process=random_process, gamma=HYP.GAMMA, target_model_update=HYP.TAU)
agent.compile(Adam(lr=HYP.LEARN_R, clipnorm=HYP.CLIPNORM), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for
# show, but this slows down training quite a lot. You can always safely abort
# the training prematurely using Ctrl + C.
agent.fit(env, nb_steps=HYP.NB_STEPS, visualize=False, callbacks=[file_logger], verbose=HYP.VERBOSE)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env,
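# A hypothetical HYP namespace with the attributes referenced above (values illustrative only):
from types import SimpleNamespace

HYP = SimpleNamespace(
    THETA=0.15, MU=0.0, SIGMA=0.3,                       # Ornstein-Uhlenbeck noise
    BATCH_SIZE=64, WARMUP_ACTOR=100, WARMUP_CRITIC=100,
    GAMMA=0.99, TAU=1e-3,                                # discount and soft target update rate
    LEARN_R=1e-3, CLIPNORM=1.0,
    NB_STEPS=50000, VERBOSE=1,
)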
class PartnerApproximatingLearner(Controller):
    """ A controller that learns how the other agent behaves and adapts to that behavior """

    def __init__(self, first_player: bool, stop_ident_time=1e9, do_rl=False, learning_rate=0.01,
                 activation_fcn='relu', learn_time_delta=0.2, rl_time_delta=0.1, epochs=2,
                 fit_batch_size=20, learn_stack=LearningStack(), real_env=CoopPendulum(),
                 rl_memory_span=50, wolf=0., win_lr_reduction=1, wolf_stop_rl=False):
        """ Sets various parameters, configures the ident, actor and critic NN and compiles the agent """
        super(PartnerApproximatingLearner, self).__init__(first_player)  # Call to __init__ of parent class Controller
        self.learn_stack = learn_stack  # Controller specific LearningStack in which to save the experiences
        self.loosing_lr = learning_rate
        self.rl_lr = .001  # hyper-parameter
        self.win_lr_reduction = win_lr_reduction
        self.wolf = wolf
        self.wolf_stop_rl = wolf_stop_rl
        seed = np.random.randint(0, int(1e6)) + int(first_player) * 100  # -> first player gets different seed than second

        # Configure neural network for identification:
        num_hidden_layer_ident = 3
        num_neurons_per_layer_ident = 16
        act_space_shape = real_env.action_space.shape
        obs_space_shape = real_env.observation_space.shape
        ident_nn = Sequential()
        ident_nn.add(Dense(num_neurons_per_layer_ident,
                           kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed),
                           input_shape=obs_space_shape))
        for i in range(num_hidden_layer_ident - 1):  # Add the layers to the identification NN
            ident_nn.add(Dense(num_neurons_per_layer_ident,
                               kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + i)))
            ident_nn.add(Activation(activation_fcn))
        ident_nn.add(Dense(act_space_shape[0],
                           kernel_initializer=RandomUniform(minval=-0.0001, maxval=0.0001, seed=seed + 9)))
        ident_nn.add(Activation('linear'))
        opt = Adam(lr=learning_rate)  # hyper-parameter
        ident_nn.compile(optimizer=opt, loss='mse')  # hyper-parameter
        # Use the neural network inside a NNController for easy evaluation of the output:
        self.ident_ctrl = StaticNNController(first_player=(not self.first_player), neural_net=ident_nn)

        # Set other identification parameters
        self.ident_time_delta = learn_time_delta  # simulation time between training the other_model with experience
        self.last_ident_time = 0  # last time ident NN was trained
        self.epochs = epochs  # number of training epochs when its time to identify again
        self.fit_batch_size = fit_batch_size  # size of mini batch that the batch is split into for training by Keras
        self.stop_ident_time = stop_ident_time  # Time at which no training should occur anymore. Used for testing

        self.do_rl = do_rl
        if do_rl:
            self.rl_env = deepcopy(real_env)
            self.last_rl_time = -1
            self.rl_time_delta = rl_time_delta
            self.rl_env.set_ctrl_other(self.ident_ctrl)
            try:
                self.u_limit = self.rl_env.action_space_u1 if first_player else self.rl_env.action_space_u2
            except AttributeError:  # rl_env does not have individual limits
                self.u_limit = self.rl_env.action_space

            # Configure the Neural Networks of the RL-agent
            # 1. Actor:
            rl_num_hidden_layer_actor = 3
            rl_num_neurons_per_layer_actor = 16
            rl_actor = Sequential()  # Actor is a Sequential Neural Network (MLP)
            rl_actor.add(Flatten(input_shape=(1,) + obs_space_shape))
            for i in range(rl_num_hidden_layer_actor):  # Add the layers to the actor NN
                rl_actor.add(Dense(rl_num_neurons_per_layer_actor,
                                   kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + 10 + i)))
                rl_actor.add(Activation(activation_fcn))
            rl_actor.add(Dense(act_space_shape[0],
                               kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + 19)))
            rl_actor.add(Activation('linear'))

            # 2. Critic:
            rl_num_hidden_layer_critic = 3
            rl_num_neurons_per_layer_critic = 32
            action_input = Input(shape=act_space_shape, name='action_input')
            observation_input = Input(shape=(1,) + obs_space_shape, name='observation_input')
            flattened_observation = Flatten()(observation_input)
            rl_critic_nn = Concatenate()([action_input, flattened_observation])
            for i in range(rl_num_hidden_layer_critic):
                rl_critic_nn = Dense(rl_num_neurons_per_layer_critic,
                                     kernel_initializer=RandomUniform(minval=-1, maxval=1,
                                                                      seed=seed + 20 + i))(rl_critic_nn)
                rl_critic_nn = Activation(activation_fcn)(rl_critic_nn)
            rl_critic_nn = Dense(1, kernel_initializer=RandomUniform(minval=-1, maxval=1,
                                                                     seed=seed + 29))(rl_critic_nn)
            rl_critic_nn = Activation('linear')(rl_critic_nn)
            rl_critic = Model(inputs=[action_input, observation_input], outputs=rl_critic_nn)

            # 3. Set training parameters for the Agent and compile it
            rl_frames_per_train = 200
            rl_mem_size = int(rl_memory_span * (round(1 / self.rl_time_delta) * rl_frames_per_train))
            rl_memory = SequentialMemory(limit=rl_mem_size, window_length=1)
            random_process = OrnsteinUhlenbeckProcess(size=act_space_shape[0], theta=.15, mu=0., sigma=.3)
            self.rl_agent = DDPGAgent(nb_actions=act_space_shape[0], actor=rl_actor, critic=rl_critic,
                                      critic_action_input=action_input, memory=rl_memory,
                                      nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                                      random_process=random_process, gamma=.99, target_model_update=1e-3)
            self.rl_agent.compile(Adam(lr=self.rl_lr, clipnorm=1.), metrics=['mae'])
            self.rl_actor_ctrl = StaticNNController(first_player=self.first_player, neural_net=rl_actor)

    def ident_other(self):
        """ Updates Identification of the partner """
        batch = self.learn_stack.pick_random()  # get a batch from the LearningStack
        batch_t, batch_x, batch_u = zip(*batch)
        batch_u1, batch_u2 = zip(*batch_u)
        inputs = np.asarray(batch_x)
        if self.first_player:  # -> player 2 has to be identified
            outputs = np.reshape(np.array(batch_u2), (-1, 1))
        else:  # -> player 1 has to be identified
            outputs = np.reshape(np.array(batch_u1), (-1, 1))
        self.ident_ctrl.neural_net.fit(inputs, outputs, batch_size=self.fit_batch_size,
                                       epochs=self.epochs, verbose=0, shuffle=True, validation_split=0.)

    def u(self, t, x) -> float:
        """ Calculates the control variable u of the learning controller (only for the "real" environment)
        The action inside the internal simulation is calculated by the actor NN and clipped by the env """
        if self.do_rl:
            u_self = self.rl_actor_ctrl.u(t, x)
            u_self = min(max(u_self, np.ndarray.item(self.u_limit.low)), np.ndarray.item(self.u_limit.high))
        else:
            u_self = 0.
        return u_self

    def get_other_pred(self, t, x):
        """ Returns the expected output of the other controller for the given input """
        u_other_pred = self.ident_ctrl.u(t, x)
        return u_other_pred

    def calc_error_on_learning_stack(self):
        stack = self.learn_stack.get_all_experiences()
        stack_t, stack_x, stack_u = zip(*stack)
        stack_u1, stack_u2 = zip(*stack_u)
        predictions = list()
        for i in range(len(stack_x)):
            predictions.append(self.get_other_pred(stack_t[i], stack_x[i]))
        assert len(predictions) == len(stack_u1)
        assert len(predictions) == len(stack_u2)
        if self.first_player:  # Predicting Player 2
            error = [(stack_u2[j] - predictions[j]) ** 2 for j in range(len(predictions))]
        else:  # Predicting Player 1
            error = [(stack_u1[j] - predictions[j]) ** 2 for j in range(len(predictions))]
        mse = sum(error) / len(error)
        return mse

    def new_exp(self, exp):
        """ Saves the new experience (time, state, control variables) on the stack
        and triggers rl/ident if enough time passed """
        self.learn_stack.add(exp[0:3])
        t_now = exp[0]
        winning = False
        if len(exp) > 3:  # if "real" reward is supplied: check if within winning limits
            winning = exp[3] > self.wolf  # hyper-parameter
        if self.do_rl and round(t_now - self.last_rl_time, 5) >= self.rl_time_delta:
            # enough time passed since last RL
            if winning:
                K.set_value(self.rl_agent.actor_optimizer.lr, self.rl_lr / self.win_lr_reduction)
                K.set_value(self.rl_agent.critic.optimizer.optimizer.lr, self.rl_lr / self.win_lr_reduction)
            else:
                K.set_value(self.rl_agent.actor_optimizer.lr, self.rl_lr)
                K.set_value(self.rl_agent.critic.optimizer.optimizer.lr, self.rl_lr)
            if not (self.wolf_stop_rl and winning):
                self.improve_policy()
            self.last_rl_time = t_now
        if round(t_now - self.last_ident_time, 5) >= self.ident_time_delta and t_now < self.stop_ident_time:
            if winning:
                K.set_value(self.ident_ctrl.neural_net.optimizer.lr, self.loosing_lr / self.win_lr_reduction)
            else:
                K.set_value(self.ident_ctrl.neural_net.optimizer.lr, self.loosing_lr)
            self.ident_other()  # train my model of the other controller on data from my LearningStack
            self.last_ident_time = t_now

    def improve_policy(self):
        """ Does an episode of RL to improve critic and actor of the rl_agent """
        self.rl_agent.fit(self.rl_env, nb_steps=200, visualize=False, verbose=0, nb_max_episode_steps=200)
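# How target_model_update=1e-3 behaves in keras-rl's DDPGAgent: values below 1 select a
# soft (Polyak) update of the target networks at every training step. A standalone
# sketch of that rule, not the library code itself:
def soft_update(target_weights, online_weights, tau=1e-3):
    # w_target <- tau * w_online + (1 - tau) * w_target, applied per weight tensor
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online_weights, target_weights)]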
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.1, sigma=0.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=0.001), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=473780)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    # print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print(critic.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)

    if REWARD == "normal":
        ddpg_normal = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                                critic_action_input=action_input, memory=memory,
                                nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                                random_process=random_process, gamma=.99, target_model_update=1e-3)
        ddpg_normal.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        history_normal = ddpg_normal.fit(env, nb_steps=150000, visualize=False, verbose=2,
                                         nb_max_episode_steps=200)
        # After training is done, we save the final weights.
        ddpg_normal.save_weights(os.path.join(LOG_DIR, 'ddpg_normal_{}_weights.h5f'.format(ENV_NAME)),
                                 overwrite=True)
        # Finally, evaluate our algorithm for 5 episodes.
        ddpg_normal.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))
    elif REWARD == "noisy":
        processor_noisy = PendulumSurrogateProcessor(weight=WEIGHT, surrogate=False, noise_type=NOISE_TYPE)
        ddpg_noisy = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                               critic_action_input=action_input, memory=memory,
                               nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                               random_process=random_process, gamma=.99, target_model_update=1e-3,
                               processor=processor_noisy)
        ddpg_noisy.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_noisy = ddpg_noisy.fit(env, nb_steps=150000, visualize=False, verbose=2,
                                       nb_max_episode_steps=200)
        ddpg_noisy.save_weights(os.path.join(LOG_DIR, 'ddpg_noisy_{}_weights.h5f'.format(ENV_NAME)),
                                overwrite=True)
        ddpg_noisy.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200)
        pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
    elif REWARD == "surrogate":
        processor_surrogate = PendulumSurrogateProcessor(weight=WEIGHT, surrogate=True, noise_type=NOISE_TYPE)
        ddpg_surrogate = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                                   critic_action_input=action_input, memory=memory,
                                   nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                                   random_process=random_process, gamma=.99, target_model_update=1e-3,
                                   processor=processor_surrogate)
        ddpg_surrogate.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_surrogate = ddpg_surrogate.fit(env, nb_steps=150000, visualize=False, verbose=2,
                                               nb_max_episode_steps=200)
        ddpg_surrogate.save_weights(os.path.join(LOG_DIR, 'ddpg_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                                    overwrite=True)
        ddpg_surrogate.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200)
        pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
    else:
        raise NotImplementedError
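# Hypothetical module-level settings that train() reads (names taken from the code above,
# values illustrative only):
ENV_NAME = 'Pendulum-v0'
REWARD = 'normal'          # one of: 'normal', 'noisy', 'surrogate'
WEIGHT = 0.1
NOISE_TYPE = 'gaussian'    # whatever PendulumSurrogateProcessor expects
LOG_DIR = 'logs'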
    def __init__(self, env: gym.Env, logger=Logger(), n_layers_actor=3, n_units_actor=16,
                 n_layers_critic=3, n_units_critic=32, sigma_decay=1, sigma=0.3):
        nb_actions = env.action_space.shape[0]

        ###
        # obs_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        # x = Flatten()(obs_input)
        # x = Dense(units=256, activation='relu')(x)
        #
        # action_input = Input(shape=(nb_actions,), name='action_input')
        # x_c = Concatenate()([x, action_input])
        #
        # x_critic = Dense(units=128, activation='relu')(x_c)
        # q_value = Dense(units=1)(x_critic)
        #
        # x_actor = Dense(units=128, activation='relu')(x)
        # action = Dense(units=nb_actions, activation='tanh')(x_actor)
        #
        # actor = Model(inputs=obs_input, outputs=action)
        # critic = Model(inputs=[action_input, obs_input], outputs=q_value)

        obs_input_actor = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)

        action_input = Input(shape=(nb_actions,), name='action_input')
        x_cr = Concatenate()([x_cr, action_input])
        x_critic = Dense(units=128, activation='relu')(x_cr)
        q_value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=[action_input, obs_input_critic], outputs=q_value)

        # actor = Sequential()
        # actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
        # for i in range(n_layers_actor):
        #     actor.add(Dense(n_units_actor))
        #     # actor.add(BatchNormalization())
        #     actor.add(Activation('relu'))
        #     # actor.add(LeakyReLU())
        # actor.add(Dense(nb_actions))
        # actor.add(Activation('tanh'))
        #
        # action_input = Input(shape=(nb_actions,), name='action_input')
        # observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        # flattened_observation = Flatten()(observation_input)
        # x = Concatenate()([action_input, flattened_observation])
        # for i in range(n_layers_critic):
        #     x = Dense(n_units_critic)(x)
        #     # x = BatchNormalization()(x)
        #     x = Activation('relu')(x)
        #     # x = LeakyReLU()(x)
        # x = Dense(1)(x)
        # x = Activation('linear')(x)
        # critic = Model(inputs=[action_input, observation_input], outputs=x)
        #
        # action_input = Input(shape=(nb_actions,), name='action_input')
        # observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        # flattened_observation = Flatten()(observation_input)
        # xo = Dense(n_units_critic, activation='relu')(flattened_observation)
        # # xo = Dense(n_units_critic, activation='relu')(xo)
        # x = Concatenate()([xo, action_input])
        # for i in range(n_layers_critic - 1):
        #     x = Dense(n_units_critic, activation='relu')(x)
        # x = Dense(1)(x)
        # x = Activation('linear')(x)
        # critic = Model(inputs=[action_input, observation_input], outputs=x)

        memory = SequentialMemory(limit=1000000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=sigma)
        agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                          critic_action_input=action_input, memory=memory,
                          nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                          random_process=random_process, gamma=.99, target_model_update=1e-3,
                          batch_size=64, train_interval=4)
        agent.compile([Adam(lr=.0001, clipnorm=1.), Adam(lr=.0001)], metrics=['mae'])
        self.agent = agent
        self.env = env
        self.sigma_decay = sigma_decay
        super().__init__(env, logger)
class KerasDDPGAgent(KerasAgent):
    """
    A DDPG agent using the Keras library with Keras-RL.
    For more details about the Deep Deterministic Policy Gradient algorithm, check
    "Continuous control with deep reinforcement learning" by Lillicrap et al.
    https://arxiv.org/abs/1509.02971
    """

    def __init__(self, observation_space, action_space, filename='KerasDDPGAgent.h5f'):
        nb_actions = action_space.shape[0]

        # Actor network
        actor = Sequential()
        actor.add(Flatten(input_shape=(1,) + observation_space.shape))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('sigmoid'))
        print(actor.summary())

        # Critic network
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + observation_space.shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = concatenate([action_input, flattened_observation])
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Set up Keras-RL's DDPGAgent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=nb_actions)
        self.agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                               critic_action_input=action_input, memory=memory,
                               nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                               random_process=random_process, gamma=.99, target_model_update=1e-3,
                               delta_clip=1.)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
        self.filename = filename
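# A hypothetical instantiation, assuming a gym-style env exposes both spaces. Note the
# actor's 'sigmoid' output above: every action component is kept in [0, 1], which suits
# environments with normalized actuations.
ddpg = KerasDDPGAgent(env.observation_space, env.action_space, filename='ddpg_run.h5f')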
x = Dense(400)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(learning_rate=1e-4), Adam(learning_rate=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1)

# After training is done, we save the final weights.
agent.save_weights(f'ddpg_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
random_process = GaussianWhiteNoiseProcess(mu=0.0, sigma=0.8, sigma_min=0.05, n_steps_annealing=650000)

# Create the agent
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, random_process=random_process,
                  nb_steps_warmup_actor=32, nb_steps_warmup_critic=32,
                  target_model_update=1e-4, gamma=0.9, batch_size=32)
agent.compile(Adam(lr=1e-4), metrics=['mae'])

# Start training for 7.5M simulation steps (1.5M training steps with actions repeated 5 times)
agent.fit(env, nb_steps=1500000, visualize=False, action_repetition=5, verbose=2,
          nb_max_start_steps=0, log_interval=10000, callbacks=[])

# Test the agent
hist = agent.test(env, nb_episodes=10, action_repetition=1, visualize=True)
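# How the sigma annealing above plays out -- a sketch of the linear schedule that
# keras-rl's annealed noise processes apply (sigma decays from 0.8 down to the 0.05
# floor over 650000 steps, then stays there):
def current_sigma(step, sigma=0.8, sigma_min=0.05, n_steps_annealing=650000):
    return max(sigma_min, sigma - (sigma - sigma_min) * step / n_steps_annealing)

for step in (0, 325000, 650000, 1000000):
    print(step, current_sigma(step))  # -> 0.8, 0.425, 0.05, 0.05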
                                          size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1.)  # warmup? delta_clip?
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])  # critic learning rate?

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=False, verbose=1,
              nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.model, overwrite=True)

# If TEST and TOKEN, submit to crowdAI
# Set up the agent, using the Keras models defined above along with the policy and actions.
# Discrete actions:
policy = EpsGreedyQPolicy()
testPolicy = GreedyQPolicy()
# agent = DQNAgent(model=actorModel, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
#                  policy=policy, test_policy=testPolicy)

# Continuous actions:
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(actor=actorModel, critic=criticModel, nb_actions=nb_actions, memory=memory,
                  nb_steps_warmup_actor=100, nb_steps_warmup_critic=100,
                  critic_action_input=action_input, random_process=random_process)

# Compile the agent
agent.compile(Nadam(lr=1e-3, clipnorm=0.1), metrics=['mae'])

# Okay, now it's time to learn something!
# We visualize the training here for show, but this slows down training quite a lot.
agent.fit(env, nb_steps=50000, visualize=True, verbose=2)

# Test!
# blockingVar = input('Press a key!: ')
agent.test(env, nb_episodes=5, visualize=True)
if args.PER == False:
    memory = NonSequentialMemory(limit=args.memory_size, window_length=1)
elif args.PER == True:
    # 'proportional' priority replay implementation
    memory = PrioritisedNonSequentialMemory(limit=args.memory_size, alpha=args.alpha,
                                            beta=args.beta, window_length=1)
else:
    print("\nRun vanilla_keras_rl/keras-rl/examples/ddpg_mujoco.py for no PER or HER!")
    sys.exit(1)

random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)

# WARNING: make sure memory_interval is 1 for HER to work
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, pretanh_model=pretanh_model,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  batch_size=args.batch_size, delta_clip=args.delta_clip,
                  random_process=random_process, gamma=args.gamma, target_model_update=args.soft_update,
                  do_HER=args.HER, K=args.K, HER_strategy=args.her_strategy,
                  do_PER=args.PER, epsilon=1e-4, processor=MujocoProcessor(),
                  pretanh_weight=args.pretanh_weight)
agent.compile([Adam(lr=args.actor_lr, clipnorm=args.actor_gradient_clip),
               Adam(lr=args.critic_lr, clipnorm=args.critic_gradient_clip)], metrics=['mae'])

if args.HER == True and args.PER == False:
    print("\nTraining with Hindsight Experience Replay\n")
    save_data_path_local = 'HER/' + args.ENV_NAME + '.json'
elif args.HER == False and args.PER == True:
    print("\nTraining with Prioritised Experience Replay\n")
    save_data_path_local = 'PER/' + args.ENV_NAME + '.json'
elif args.HER == True and args.PER == True:
    print("\nTraining with Prioritised Hindsight Experience Replay\n")
    save_data_path_local = 'PHER/' + args.ENV_NAME + '.json'

if args.train:
    # Start training (you can always safely abort the training prematurely using Ctrl + C, *once*)
    agent.fit(env, nb_steps=args.nb_train_steps, visualize=False, verbose=1,
              save_data_path=save_data_path_local, file_interval=args.file_interval,
              nb_max_episode_steps=args.max_step_episode)
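# A hypothetical argparse block covering the flags referenced above (defaults illustrative only):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--ENV_NAME', type=str, default='FetchReach-v1')
parser.add_argument('--train', action='store_true')
parser.add_argument('--PER', action='store_true')
parser.add_argument('--HER', action='store_true')
parser.add_argument('--memory_size', type=int, default=1000000)
parser.add_argument('--alpha', type=float, default=0.6)
parser.add_argument('--beta', type=float, default=0.4)
parser.add_argument('--K', type=int, default=4)
parser.add_argument('--her_strategy', type=str, default='future')
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--delta_clip', type=float, default=1.0)
parser.add_argument('--gamma', type=float, default=0.98)
parser.add_argument('--soft_update', type=float, default=1e-3)
parser.add_argument('--actor_lr', type=float, default=1e-4)
parser.add_argument('--critic_lr', type=float, default=1e-3)
parser.add_argument('--actor_gradient_clip', type=float, default=1.0)
parser.add_argument('--critic_gradient_clip', type=float, default=1.0)
parser.add_argument('--pretanh_weight', type=float, default=0.0)
parser.add_argument('--nb_train_steps', type=int, default=1000000)
parser.add_argument('--file_interval', type=int, default=10000)
parser.add_argument('--max_step_episode', type=int, default=1000)
args = parser.parse_args()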
def main():
    set_gpu_option()

    # OPTIONS
    ENV_NAME = 'DDPGEnv-v0'
    TIME_STEP = 30

    # Get the environment and extract the number of actions.
    PATH_TRAIN = '/home/data/training_x_150.h5'
    PATH_TEST = '/home/data/test_x_150.h5'
    """
    env = OhlcvEnv(TIME_STEP, path=PATH_TRAIN)
    env_test = OhlcvEnv(TIME_STEP, path=PATH_TEST)
    """
    store = pd.HDFStore(PATH_TRAIN, mode='r')
    varieties_list = store.keys()
    print('varieties_list: ', varieties_list)
    print('num varieties: ', len(varieties_list))
    variety = 'RB'
    print('variety: ', variety)

    # Get the selected features. (The CSV filename translates to
    # "features selected by mutual information, sorted by importance".)
    SELECTED_FACTOR_PATH = '~/feature_selection/根据互信息选出的特征,根据重要性排序.csv'
    selected_factor_df = pd.read_csv(SELECTED_FACTOR_PATH, index_col=0)
    selected_factor_list = selected_factor_df[variety].to_list()

    env = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TRAIN,
                  selected_factor_list=selected_factor_list)
    # env_test = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TEST,
    #                    selected_factor_list=selected_factor_list)

    # Random seed
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.shape[0]
    print('nb_actions: ', nb_actions)
    print('env.observation_space.shape: ', env.observation_space.shape)
    print('env.observation_space: ', env.observation_space)

    # Create the actor.
    actor = create_actor(input_shape=env.shape, nb_actions=nb_actions)

    # Create the critic.
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=env.shape, name='observation_input')
    critic = create_critic(action_input, observation_input)

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=TIME_STEP)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
    ddpg = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                     critic_action_input=action_input, memory=memory,
                     nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                     random_process=random_process, gamma=.99,
                     target_model_update=1e-3, processor=DDPGProcessor())
    ddpg.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir,
                                                       histogram_freq=1,
                                                       write_grads=True)

    for _ in range(3):
        ddpg.fit(env, nb_steps=140000, nb_max_episode_steps=140000,
                 visualize=False, verbose=2)
    """
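# Once training has run, the commented-out test environment above gives an
# out-of-sample check. A minimal sketch, assuming the same variables as in
# main() (the weights filename pattern is hypothetical):
env_test = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TEST,
                   selected_factor_list=selected_factor_list)
ddpg.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
ddpg.test(env_test, nb_episodes=1, visualize=False)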
    theta=0.15, mu=0.0, sigma=0.3)
agent = DDPGAgent(
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    nb_steps_warmup_critic=1000,
    nb_steps_warmup_actor=1000,
    random_process=random_process,
    gamma=0.99,
    target_model_update=1e-3,
)
agent.compile(Adam(lr=0.001, clipnorm=1.0), metrics=["mae"])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=100000, visualize=False, verbose=1, nb_max_episode_steps=288)

# After training is done, we save the final weights.
agent.save_weights("ddpg_{}_weights.h5f".format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=288)
    sigma_min=0.01, n_steps_annealing=2900000)

# Define the DDPG agent.
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, batch_size=64, memory=memory,
                  nb_steps_warmup_critic=3000, nb_steps_warmup_actor=3000,
                  random_process=random_process, gamma=GAMMA, target_model_update=1e-4)

# Compile the model.
agent.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mse'])

callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp)

# ------------------------------------------------------------------------------
# Training phase: fitting the agent.
# agent.fit(env, nb_steps=3000000, visualize=False, callbacks=callbacks, verbose=1,
#           gamma=GAMMA, nb_max_episode_steps=STEPS_PER_EPISODE,
#           process_noise_std=process_noise_std)

# After training is done, we save the final weights.
# agent.save_weights(log_filename_pre + filename_exp + '/ddpg_{}_weights.h5f'.format(ENV_NAME),
#                    overwrite=True)
# common_func.save_process_noise(ENV_NAME, log_filename_pre, filename_exp,
#                                process_noise_std, theta)
# ------------------------------------------------------------------------------
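# The sigma_min / n_steps_annealing pair above makes exploration noise decay
# linearly over training: keras-rl's OrnsteinUhlenbeckProcess anneals sigma
# from its initial value down to sigma_min across n_steps_annealing steps. A
# standalone sketch of that schedule (the initial sigma=0.3 is an assumption,
# since the constructor call is truncated above):
def annealed_sigma(step, sigma=0.3, sigma_min=0.01, n_steps_annealing=2900000):
    """Linear annealing: sigma at step 0, clamped to sigma_min after n_steps_annealing."""
    m = -(sigma - sigma_min) / float(n_steps_annealing)
    return max(sigma_min, m * step + sigma)

print(annealed_sigma(0))        # 0.3, the assumed starting noise scale
print(annealed_sigma(1450000))  # ~0.155, halfway through annealing
print(annealed_sigma(2900000))  # 0.01, the floor for the rest of training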
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
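# The target_model_update=1e-3 above selects keras-rl's soft-update mode: for
# values < 1 the target networks track the online networks as
#     target_weights = tau * online_weights + (1 - tau) * target_weights
# on every step, while a value >= 1 means a hard copy every that-many steps.
# A standalone sketch of the soft update:
import numpy as np

def soft_update(online_weights, target_weights, tau=1e-3):
    """Polyak-average each target weight array toward its online counterpart."""
    return [tau * w + (1. - tau) * tw for w, tw in zip(online_weights, target_weights)]

online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
target = soft_update(online, target)  # each entry moves 0.1% of the way per step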
def get_agent(env, agent_id, model=1):
    global observation_size

    # Count number of actions.
    if not ingy:
        nb_actions = env.action_space['action_movement'][0].shape[0] + 2
        # Count number of observations for input.
        if observation_size == 0:
            observation_size += env.observation_space['observation_self'].shape[0]
            observation_size += env.observation_space['agent_qpos_qvel'].shape[0] * \
                env.observation_space['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space['box_obs'].shape[0] * \
                env.observation_space['box_obs'].shape[1]
            observation_size += env.observation_space['ramp_obs'].shape[0] * \
                env.observation_space['ramp_obs'].shape[1]
            # TODO: Not sure whether to include mask_a*_obs and mask_ab_obs_spoof
            # in this observation input -AH
    else:
        nb_actions = env.action_space.spaces['action_movement'].spaces[0].shape[0][0] + 2
        # Count number of observations for input.
        if observation_size == 0:
            observation_size += env.observation_space.spaces['observation_self'].shape[0]
            if 'lidar' in env.observation_space.spaces:
                observation_size += env.observation_space.spaces['lidar'].shape[0]
            observation_size += env.observation_space.spaces['agent_qpos_qvel'].shape[0] * \
                env.observation_space.spaces['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space.spaces['box_obs'].shape[0] * \
                env.observation_space.spaces['box_obs'].shape[1]
            observation_size += env.observation_space.spaces['ramp_obs'].shape[0] * \
                env.observation_space.spaces['ramp_obs'].shape[1]

    if model == 1:
        # Build the actor model.
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, observation_size,)))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('sigmoid'))  # Return values from 0 to 1
        # print(actor.summary())

        # Build the critic model.
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1, observation_size,), name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent.
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=2.15, mu=0, sigma=3)
        agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                          critic_action_input=action_input, memory=memory,
                          nb_steps_warmup_critic=4000, nb_steps_warmup_actor=4000,
                          random_process=random_process, gamma=.9,
                          target_model_update=1e-3, processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])
    elif model == 2:
        # Build the actor model.
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, observation_size,)))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Dropout(0.3))
        actor.add(Activation('relu'))
        actor.add(Dense(100))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(50))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('softmax'))  # Outputs are non-negative and sum to 1
        # print(actor.summary())

        # Build the critic model.
        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1, observation_size,), name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(100)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(50)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(1)(x)
        x = Activation('tanh')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent.
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=2.8, mu=0, sigma=3.5)
        agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                          critic_action_input=action_input, memory=memory,
                          nb_steps_warmup_critic=500, nb_steps_warmup_actor=500,
                          random_process=random_process, gamma=.9,
                          target_model_update=5e-2, processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=5e-1, decay=0.9), Adam(lr=5e-1, decay=0.9)], metrics=['mae'])
    return agent
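# A hypothetical usage sketch for get_agent: the MujocoProcessor(agent_id)
# above suggests one DDPG agent per agent_id in the multi-agent environment.
# n_agents and the model choice are illustrative assumptions:
n_agents = 2
agents = [get_agent(env, agent_id, model=1) for agent_id in range(n_agents)]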
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=2 * NUM_STEPS, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
# random_process = OrnsteinUhlenbeckProcess(size=nb_actions, dt=env.tau, theta=1.0, mu=0.0,
#                                           sigma=0.5, sigma_min=0.3, n_steps_annealing=NUM_STEPS)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.999,
                  target_model_update=1e-3, delta_clip=1.0)
agent.compile(Adam(lr=.001, clipnorm=1.0), metrics=['mae'])

# Optionally, we can reload a previous model's weights and continue training from there.
# Remove the _actor or _critic from the filename; the load method automatically
# appends these.
WEIGHTS_FILENAME = 'weights/ddpg_planar_crane_continuous-v0_weights.h5f'
# agent.load_weights(WEIGHTS_FILENAME)

callbacks = []
checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
# callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
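# To actually log and checkpoint, the callbacks list above can be populated and
# handed to fit(). A sketch assuming keras-rl's stock callbacks (FileLogger
# writes per-episode metrics to log_filename as JSON; the logging interval
# here is an assumption):
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
callbacks += [FileLogger(log_filename, interval=100)]
agent.fit(env, nb_steps=NUM_STEPS, visualize=False, verbose=1, callbacks=callbacks)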
x = Dense(400)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
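# For a later evaluation-only run, the saved weights can be reloaded instead of
# retraining. A minimal sketch, assuming the same agent construction as above
# (pass the base filename; keras-rl appends _actor/_critic itself):
agent.load_weights('ddpg_{}_weights.h5f'.format(ENV_NAME))
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)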