def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    
    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100)
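    # A hedged follow-up (not part of the original test): keras-rl's Agent.test
    # returns a Keras History object whose history dict holds one entry per test
    # episode (Example #12 below reads history.history['episode_reward'] the same
    # way), so a minimal sanity check could look like this. The commented reward
    # threshold is a placeholder assumption, not a verified bound for Pendulum-v0.
    assert len(h.history['episode_reward']) == 2
    # assert np.mean(h.history['episode_reward']) > -1000.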
Example #2
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=1000,
                  nb_steps_warmup_actor=1000,
                  batch_size=64,
                  random_process=random_process,
                  gamma=.98,
                  target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(lr=5e-4), Adam(lr=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! Visualization is disabled here because it
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
save_data_path_local = ENV_NAME + '.json'
agent.fit(env,
          nb_steps=1000000,
          visualize=False,
          verbose=1,
          save_data_path=save_data_path_local,
          file_interval=10000)

# After training is done, we save the final weights.
# agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
plot_af(file_path=ENV_NAME + '.json', save_file_name=ENV_NAME + '.png')
Example #3
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
#print(critic.summary())

# Set up the agent for training
memory = SequentialMemory(limit=1000000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1., batch_size=128)
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile([Nadam(lr=.0001, clipnorm=1.), Nadam(lr=.0001, clipnorm=1.)], metrics=['mae'])

# Okay, now it's time to learn something! Visualization is disabled here because it
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    #------------------------------------------------------------
    weights_filename = 'model/ddpg_final.h5f'
    checkpoint_weights_filename = 'model/ddpg_{step}.h5f'
    #log_filename = 'model/ddpg_log.json'.format('opensim')
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)]
    #callbacks += [FileLogger(log_filename, interval=10000)]
    #------------------------------------------------------------
    agent.load_weights(args.model)
    agent.fit(env, callbacks=callbacks, nb_steps=nallsteps, visualize=False, verbose=1, nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
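    # A hedged sketch of that save step (the original snippet is cut off here),
    # reusing the weights_filename defined above:
    # agent.save_weights(weights_filename, overwrite=True)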
Example #4
def create_agent(nb_actions, observation_shape):
    """构造 ddpg agent"""

    import os
    import sys
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    keras_rl = os.path.join(os.path.dirname(cur_dir), 'keras-rl')
    sys.path.insert(0, keras_rl)

    from rl.agents import DDPGAgent
    from rl.memory import SequentialMemory
    from rl.random import OrnsteinUhlenbeckProcess

    # Build the actor
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + observation_shape))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('tanh'))
    print(actor.summary())

    # Build the critic
    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + observation_shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    # Compile the model
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=0.6,
                                              mu=0,
                                              sigma=0.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=10,
                      nb_steps_warmup_actor=10,
                      batch_size=64,
                      random_process=random_process,
                      gamma=.999,
                      target_model_update=1e-3)
    agent.compile([Adam(lr=.001, clipnorm=1.),
                   Adam(lr=.001, clipnorm=1.)],
                  metrics=['mae'])

    return agent
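
# A hedged usage sketch (not from the original source): wire the returned agent to a
# Gym environment whose spaces match the shapes passed in, e.g. the Pendulum-v0 env
# from Example #1 (the actor's tanh output would still need scaling to the action range):
# env = gym.make('Pendulum-v0')
# agent = create_agent(env.action_space.shape[0], env.observation_space.shape)
# agent.fit(env, nb_steps=50000, visualize=False, verbose=1)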
Example #5
def get_agent(env) -> DDPGAgent:
    """
    Generate a `DDPGAgent` instance that represents an agent learned using
    Deep Deterministic Policy Gradient. The agent has 2 neural networks: an actor
    network and a critic network.

    Args:
    * `env`: An OpenAI `gym.Env` instance

    Returns:
    * a `DDPGAgent` instance.
    """
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')

    range_action_input = 0.5 * (env.action_space.high - env.action_space.low)
    constantBias = 1
    lowb = env.action_space.low

    # actor = Flatten(input_shape=(1,) + env.observation_space.shape)(observation_input)
    y = Flatten()(observation_input)
    y = Dense(16)(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    y = Dense(16)(y)
    y = BatchNormalization()(y)
    y = Activation('relu')(y)
    pht = Dense(1)(y)
    pht = BatchNormalization()(pht)
    pht = Activation('tanh')(pht)
    pht = Lambda(lambda a: (a + K.constant(constantBias)) * K.constant(range_action_input[0])
                           + K.constant(lowb[0]))(pht)
    rht = Dense(1)(y)
    rht = BatchNormalization()(rht)
    rht = Activation('tanh')(rht)
    rht = Lambda(lambda a: (a + K.constant(constantBias)) * K.constant(range_action_input[1])
                           + K.constant(lowb[1]))(rht)
    axn = Concatenate()([pht, rht])
    actor = Model(inputs=observation_input, outputs=axn)

    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)

    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.5, size=nb_actions)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                      gamma=.99, target_model_update=1e-3, random_process=random_process)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
    return agent
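
# A hedged usage sketch (not from the original source): get_agent builds exactly two
# action heads (pht and rht), so the env must have a 2-dimensional Box action space;
# the environment below is a hypothetical choice:
# env = gym.make('LunarLanderContinuous-v2')
# agent = get_agent(env)
# agent.fit(env, nb_steps=100000, visualize=False, verbose=1)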
Example #6
                                              size=2)

    # Create the agent
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      random_process=random_process,
                      nb_steps_warmup_actor=2048,
                      nb_steps_warmup_critic=1024,
                      target_model_update=1000,
                      gamma=0.9,
                      batch_size=128,
                      memory_interval=2)
    agent.compile([Adam(lr=3e-5), Adam(lr=3e-3)])

    # Start training for 75000 simulation steps
    agent.fit(
        env,
        nb_steps=75000,
        nb_max_start_steps=0,
        nb_max_episode_steps=10000,
        visualize=True,
        action_repetition=1,
        verbose=2,
        log_interval=10000,
        callbacks=[],
    )
    # Test the agent
    hist = agent.test(env,
Example #7
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3)
#agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
agent.compile('adam', metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env,
          nb_steps=50000,
          visualize=True,
          verbose=2,
          nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Example #8
memory = SequentialMemory(limit=10_000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=0.15,
                                          mu=0.0,
                                          sigma=0.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=nb_steps_warmup,
                  nb_steps_warmup_actor=nb_steps_warmup,
                  random_process=random_process,
                  gamma=0.9,
                  target_model_update=1e-3)
agent.compile(SGD(lr=1e-5, clipvalue=0.001), metrics=['mae'])

callbacks = [
    ModelIntervalCheckpoint(weights_name + '_{step}.h5f', interval=10_000),
    TrainEpisodeLogger(),
    TensorBoard()
]

agent.fit(env,
          nb_steps=nb_steps,
          visualize=False,
          verbose=1,
          callbacks=callbacks)
agent.save_weights(weights_name + '_final.h5f', overwrite=True)

# agent.test(env, nb_episodes=1, visualize=False)
Example #9
def main(args):
    sigma, learning_rate, file_prefix = args

    env = ModifiedArmEnv(visualize=False)
    input_shape = (1, ) + env.observation_space.shape
    nb_actions = env.action_space.shape[0]

    # Create actor and critic networks
    actor = Sequential()
    actor.add(Flatten(input_shape=input_shape))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(32))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('sigmoid'))

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=input_shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(64)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    # Set up the agent for training
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                              mu=0.,
                                              sigma=sigma,
                                              dt=env.stepsize,
                                              size=env.noutput)
    agent = DDPGAgent(
        nb_actions=nb_actions,
        actor=actor,
        critic=critic,
        critic_action_input=action_input,
        memory=memory,
        nb_steps_warmup_critic=100,
        nb_steps_warmup_actor=100,
        random_process=random_process,
        gamma=.99,
        target_model_update=1e-3,
        delta_clip=1.,
    )
    agent.compile(Adam(lr=learning_rate, clipnorm=1.), metrics=['mae'])

    # Train the model
    training_history = RewardsLogger()
    env.reset()
    agent.fit(
        env,
        nb_steps=100000,
        visualize=False,
        verbose=1,
        nb_max_episode_steps=200,
        log_interval=10000,
        callbacks=[training_history],
    )

    # Save weights and training history
    agent.save_weights(file_prefix + '_weights.h5f', overwrite=True)
    pickledump(training_history, file_prefix + '_training_history.pkl')

    # Set test parameters
    test_nb_episodes = 10
    test_nb_max_episode_steps = 1000

    # Run test
    test_history = ObservationsLogger()
    env.reset()
    agent.test(
        env,
        nb_episodes=test_nb_episodes,
        visualize=False,
        nb_max_episode_steps=test_nb_max_episode_steps,
        callbacks=[test_history],
    )
    # Save test history
    pickledump(test_history, file_prefix + '_test_history.pkl')
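
# RewardsLogger, ObservationsLogger, and pickledump are project-specific helpers not
# shown in this snippet. A minimal sketch of pickledump, assuming it simply wraps
# pickle.dump:
# import pickle
# def pickledump(obj, path):
#     with open(path, 'wb') as f:
#         pickle.dump(obj, f)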
Example #10
random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                          mu=0.,
                                          sigma=.2,
                                          size=nb_actions)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=modelA,
                  critic=modelC,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3,
                  batch_size=64)  #,processor=CircleProcessor)
agent.compile(Adam(lr=args.learnRate, clipnorm=1.), metrics=['mae'])

if args.reload:
    agent.load_weights(args.reload)

import rl.callbacks


class EpisodeLogger(rl.callbacks.Callback):
    def __init__(self, size_inX, size_outY, size_cells):
        self.ofile = open(os.path.join(args.saveFolder, "log.csv"), "w")
        self.cfile = csv.writer(self.ofile)
        self.cfile.writerow(["episode", "reward"] +
                            ["inX%d" % i for i in range(size_inX)] +
                            ["outY%d" % i for i in range(size_outY)] +
                            ["cell%d" % i for i in range(size_cells)])
Example #11
    def create(self):
        """Create the agent"""
        assert len(self.agent_helper.env.action_space.shape) == 1
        nb_actions = int(self.agent_helper.env.action_space.shape[0])

        # set #nodes and #sfs based on env limits. used for splitting the output layer and action processor
        num_nodes = self.agent_helper.env.env_limits.MAX_NODE_COUNT
        num_sfcs = self.agent_helper.env.env_limits.MAX_SF_CHAIN_COUNT
        num_sfs = self.agent_helper.env.env_limits.MAX_SERVICE_FUNCTION_COUNT

        # create the actor NN
        observation_input = Input(
            shape=(1, ) + self.agent_helper.env.observation_space.shape,
            name='observation_input')
        flattened_observation = Flatten()(observation_input)
        prev_layer = flattened_observation
        # create hidden layers according to config
        for num_hidden in self.agent_helper.config['actor_hidden_layer_nodes']:
            hidden_layer = Dense(
                num_hidden,
                activation=self.agent_helper.
                config['actor_hidden_layer_activation'])(prev_layer)
            prev_layer = hidden_layer
        # split output layer into separate parts for each node and SF and apply softmax individually
        out_parts = [
            Dense(num_nodes, activation='softmax')(prev_layer)
            for _ in range(num_nodes * num_sfs)
        ]
        out = Concatenate()(out_parts)
        # normal output layer
        # out = Dense(nb_actions, activation='tanh')(prev_layer)
        actor = Model(inputs=observation_input, outputs=out)

        # create the critic NN
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(
            shape=(1, ) + self.agent_helper.env.observation_space.shape,
            name='observation_input')
        flattened_observation = Flatten()(observation_input)
        prev_layer = Concatenate()([action_input, flattened_observation])
        # create hidden layers according to config
        for num_hidden in self.agent_helper.config[
                'critic_hidden_layer_nodes']:
            hidden_layer = Dense(
                num_hidden,
                activation=self.agent_helper.
                config['critic_hidden_layer_activation'])(prev_layer)
            prev_layer = hidden_layer
        out_critic = Dense(1, activation='linear')(prev_layer)
        critic = Model(inputs=[action_input, observation_input],
                       outputs=out_critic)

        # write NN summary to string
        actor_summary_lst = []
        actor.summary(print_fn=actor_summary_lst.append)
        actor_summary = "".join(actor_summary_lst)
        actor.summary(print_fn=logger.debug)

        # write NN summary to string
        critic_summary_lst = []
        critic.summary(print_fn=critic_summary_lst.append)
        critic_summary = "".join(critic_summary_lst)
        critic.summary(print_fn=logger.debug)

        # Note: the following shallow copy has caused aliasing issues, e.g. 'nb_observation' being added back into agent_config
        self.agent_helper.result.agent_config = copy.copy(
            self.agent_helper.config)  # Set agent params in result file
        self.agent_helper.result.agent_config[
            'nb_observation'] = self.agent_helper.env.observation_space.shape[
                0]
        self.agent_helper.result.agent_config['nb_actions'] = nb_actions

        self.agent_helper.result.agent_config['actor'] = {}
        self.agent_helper.result.agent_config['actor'][
            'summary'] = actor_summary

        self.agent_helper.result.agent_config['critic'] = {}
        self.agent_helper.result.agent_config['critic'][
            'summary'] = critic_summary
        self.agent_helper.result.agent_config['metrics'] = ['mae']

        # creating the Agent
        processor = ActionScheduleProcessor(num_nodes=num_nodes,
                                            num_sfcs=num_sfcs,
                                            num_sfs=num_sfs)
        memory = SequentialMemory(
            limit=self.agent_helper.config['mem_limit'],
            window_length=self.agent_helper.config['mem_window_length'])
        random_process = GaussianWhiteNoiseProcess(
            sigma=self.agent_helper.config['rand_sigma'],
            mu=self.agent_helper.config['rand_mu'],
            size=nb_actions)

        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=self.agent_helper.
                          config['nb_steps_warmup_critic'],
                          nb_steps_warmup_actor=self.agent_helper.
                          config['nb_steps_warmup_actor'],
                          random_process=random_process,
                          gamma=self.agent_helper.config['gamma'],
                          target_model_update=self.agent_helper.
                          config['target_model_update'],
                          processor=processor,
                          batch_size=64)
        agent.compile(Adam(
            lr=self.agent_helper.config['learning_rate'],
            decay=self.agent_helper.config['learning_rate_decay']),
                      metrics=['mae'])
        self.agent = agent
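
        # A hedged usage note (not from the original source): the compiled agent is
        # stored on self rather than returned, so training presumably happens later
        # along the lines of:
        # self.agent.fit(self.agent_helper.env, nb_steps=..., visualize=False, verbose=1)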
Example #12
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50, random_process=random_process,
                  gamma=1, target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1., decay=0.9999), metrics=['mae'])

#%%
'''
the test before warm_up
'''
history = agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=1000)
# sio.savemat('test-before-train-' + ENV_NAME + '-' + nowtime + '.mat', history.history)
before = history.history['episode_reward']
'''
warm_up
'''
history = agent.warm_fit(env, policy, policy_list, nb_steps=3e6, visualize=False, log_interval=1000, verbose=2, nb_max_episode_steps=2000)
sio.savemat('warm-up-' + ENV_NAME + '-' + nowtime + '.mat', history.history)
agent.save_weights('ddpg_{}_weights_after_warm_start.h5f'.format(ENV_NAME), overwrite=True)
'''
Example #13
def main(layers1=[200],
         layers2=[200],
         leaky_alpha=0.10,
         ENV_NAME='EnvPong',
         show=False,
         wall_reward=-0.1,
         touch_reward=0.3,
         n_steps=80000,
         n_alternances=10,
         L_R=0.0001,
         only_test=False,
         opp_aware=[1, 1],
         myopie=[0.00, 0.00],
         ball_speed=1.0,
         weights1_name='',
         weights2_name=''):

    ENV_NAME = ENV_NAME

    conf_name = "{}_layers1={}__layers2={}__leaky={}__lr={}__opp={}__myopia={}__speed={}".format(
        ENV_NAME, layers1, layers2, leaky_alpha, L_R, opp_aware, myopie,
        ball_speed)
    #gym.undo_logger_setup()
    # Get the environment and extract the number of actions.

    if ENV_NAME == 'Env2D':
        env = Game2D(2.)
    elif ENV_NAME == 'Env2DSoloSpin':
        env = Game2DSolo(2., spinRacket=True)
    elif ENV_NAME == 'Env3DSolo':
        env = Game3DSolo(2., 9.8, 0.5, 7., 3.)
    elif ENV_NAME == 'EnvPong':
        env = Pong(PongPlayer(None, opp_aware=(opp_aware[0] == 1)),
                   PongPlayer(None, opp_aware=(opp_aware[1] == 1)))
    np.random.seed(123)
    #env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space_1.shape))
    #actor.add(keras.layers.normalization.BatchNormalization())
    for size in layers1:
        actor.add(
            Dense(size,
                  kernel_initializer=RandomUniform(minval=-0.005,
                                                   maxval=0.005,
                                                   seed=None)))
        #actor.add(keras.layers.core.Dropout(0.2))
        actor.add(LeakyReLU(leaky_alpha))
    #actor.add(keras.layers.normalization.BatchNormalization())
    actor.add(
        Dense(nb_actions,
              kernel_initializer=RandomUniform(minval=-0.005,
                                               maxval=0.005,
                                               seed=None),
              bias_regularizer=regularizers.l2(0.01)))
    #actor.add(keras.layers.core.Dropout(0.2))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space_1.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    #x = keras.layers.normalization.BatchNormalization()(x)
    for size in layers1:
        x = Dense(size)(x)
        #x = keras.layers.core.Dropout(0.2)(x)
        x = LeakyReLU(alpha=leaky_alpha)(x)
    #x = keras.layers.normalization.BatchNormalization()(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    actor2 = Sequential()
    actor2.add(Flatten(input_shape=(1, ) + env.observation_space_2.shape))
    #actor2.add(keras.layers.normalization.BatchNormalization())
    for size in layers2:
        actor2.add(
            Dense(size,
                  kernel_initializer=RandomUniform(minval=-0.005,
                                                   maxval=0.005,
                                                   seed=None)))
        #actor2.add(keras.layers.core.Dropout(0.2))
        actor2.add(LeakyReLU(alpha=leaky_alpha))
    actor2.add(
        Dense(nb_actions,
              kernel_initializer=RandomUniform(minval=-0.005,
                                               maxval=0.005,
                                               seed=None),
              bias_regularizer=regularizers.l2(0.01)))
    #actor2.add(keras.layers.core.Dropout(0.2))
    actor2.add(Activation('linear'))
    print(actor2.summary())

    action_input2 = Input(shape=(nb_actions, ), name='action_input')
    observation_input2 = Input(shape=(1, ) + env.observation_space_2.shape,
                               name='observation_input')
    flattened_observation2 = Flatten()(observation_input2)
    x2 = Concatenate()([action_input2, flattened_observation2])
    #x2 = keras.layers.normalization.BatchNormalization()(x2)
    for size in layers2:
        x2 = Dense(size)(x2)
        #x2 = keras.layers.core.Dropout(0.2)(x2)
        x2 = LeakyReLU(alpha=leaky_alpha)(x2)
    x2 = Dense(1)(x2)
    x2 = Activation('linear')(x2)
    critic2 = Model(inputs=[action_input2, observation_input2], outputs=x2)
    print(critic2.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory1 = SequentialMemory(limit=50000, window_length=1)
    if opp_aware[0] != opp_aware[1]:
        memory2 = SequentialMemory(limit=50000, window_length=1)
    else:
        memory2 = memory1
    random_process1 = OrnsteinUhlenbeckProcess(size=nb_actions,
                                               theta=.1,
                                               mu=0.,
                                               sigma=.15,
                                               sigma_min=0.,
                                               n_steps_annealing=n_steps /
                                               4)  # Explores less at the end ?
    random_process2 = OrnsteinUhlenbeckProcess(size=nb_actions,
                                               theta=.1,
                                               mu=0.,
                                               sigma=.15,
                                               sigma_min=0.,
                                               n_steps_annealing=4 * n_steps)
    agent1 = DDPGAgent(nb_actions=nb_actions,
                       actor=actor,
                       critic=critic,
                       critic_action_input=action_input,
                       memory=memory1,
                       nb_steps_warmup_critic=5000,
                       nb_steps_warmup_actor=5000,
                       random_process=random_process1,
                       gamma=.99,
                       target_model_update=1e-3,
                       batch_size=100)
    agent2 = DDPGAgent(nb_actions=nb_actions,
                       actor=actor2,
                       critic=critic2,
                       critic_action_input=action_input2,
                       memory=memory2,
                       nb_steps_warmup_critic=5000,
                       nb_steps_warmup_actor=5000,
                       random_process=random_process2,
                       gamma=.99,
                       target_model_update=1e-3,
                       batch_size=100)

    #agent.compile(Adam(lr=L_R, clipnorm=1., clipvalue=0.5), metrics=['mae'])
    agent1.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])
    agent2.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])

    player1 = PongPlayer(agent1,
                         myopie=myopie[0],
                         opp_aware=(opp_aware[0] == 1))
    player2 = PongPlayer(agent2,
                         myopie=myopie[1],
                         opp_aware=(opp_aware[1] == 1))

    # Grid -4
    # Add -1 when lost
    # CEM method

    directory_log = "logs/ddpg/{}".format(conf_name)
    directory_weights = "weights/ddpg/{}".format(conf_name)

    if not os.path.exists(directory_log):
        os.makedirs(directory_log)
    if not os.path.exists(directory_weights):
        os.makedirs(directory_weights)

    if only_test:
        '''if weights1_name =='':
            weights1_name = "{}/player1_final".format(directory_weights)
        if weights2_name == '':
            weights2_name = "{}/player2_final".format(directory_weights)
        #if os.path.isfile(weights1_name) and os.path.isfile(weights2_name):
        agent1.load_weights(weights1_name)
        agent2.load_weights(weights2_name)'''

        agent1.load_weights("{}/player1_{}".format(directory_weights, "final"))
        agent2.load_weights("{}/player1_{}".format(directory_weights, "final"))

        env = makeEnv(player1, player2, ENV_NAME, ball_speed=ball_speed)
        for i in range(10):
            playPong(env)
        confrontPlayers(env)
        plotStrategy(env)

    else:

        for i in range(n_alternances):

            print "Alternance n {} \n".format(i)

            def learning_rate_schedule(epoch):
                return L_R

            if ENV_NAME == 'Env2D':
                env = Game2D(agent2,
                             wall_reward=wall_reward,
                             touch_reward=touch_reward)
            elif ENV_NAME == 'EnvPong':
                env = Pong(player1,
                           player2,
                           wall_reward=wall_reward,
                           touch_reward=touch_reward,
                           ball_speed=ball_speed)
            agent1.fit(env,
                       nb_steps=n_steps,
                       visualize=False,
                       verbose=1,
                       until_score=True,
                       score_to_reach=0.5,
                       last_episodes=500,
                       nb_max_episode_steps=None,
                       callbacks=[
                           FileLogger("{}/player1_{}.h5f".format(
                               directory_log, i)),
                           keras.callbacks.LearningRateScheduler(
                               learning_rate_schedule)
                       ])
            agent1.test(env,
                        nb_episodes=100,
                        visualize=False,
                        nb_max_episode_steps=500,
                        verbose=1)
            agent1.save_weights("{}/player1_{}".format(directory_weights, i),
                                overwrite=True)
            agent1.memory = SequentialMemory(limit=500000, window_length=1)
            wall_reward = wall_reward * 0.8
            touch_reward = touch_reward * 0.8
            agent2.load_weights("{}/player1_{}".format(directory_weights, i))

        print "Fin de {}".format(conf_name)
        env = Pong(player1,
                   player2,
                   wall_reward=wall_reward,
                   touch_reward=touch_reward,
                   ball_speed=ball_speed)

        #agent1.fit(env, nb_steps=150000, visualize=False, verbose=2, nb_max_episode_steps=None,callbacks=[FileLogger("logs/ddpg/{}_weights_steps_leaky_reg_bias_drop_lr{}.h5f".format(ENV_NAME,L_R), interval=100)])
        agent1.save_weights("{}/player1_final".format(directory_weights),
                            overwrite=True)
        agent2.save_weights("{}/player2_final".format(directory_weights),
                            overwrite=True)

        agent1.test(env,
                    nb_episodes=15,
                    visualize=False,
                    nb_max_episode_steps=500,
                    verbose=2)

    if show == True:

        if ENV_NAME == 'Env2D':
            for i in range(10):
                play2D(player1=agent1, player2=agent1)
        elif ENV_NAME == 'EnvPong':
            for i in range(10):
                playPong(left=agent1, right=agent2)
Example #14
class RLAgent:
    def __init__(self):
        ENV_NAME = 'drone'
        # Get the environment and extract the number of actions.
        #env = gym.make(ENV_NAME)
        env = drone_sim()
        np.random.seed(123)
        env.seed(123)
        assert len(env.action_space.shape) == 1
        nb_actions = env.action_space.shape[0]

        # Next, we build a very simple model.
        self.actor = Sequential()
        self.actor.add(Flatten(input_shape=(1, ) +
                               env.observation_space.shape))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(Dense(16))
        self.actor.add(Activation('relu'))
        self.actor.add(
            Dense(nb_actions,
                  activation='tanh',
                  kernel_initializer=RandomUniform()))
        self.actor.add(Lambda(lambda x: x * 60.0))
        print(self.actor.summary())

        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + env.observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Concatenate()([action_input, flattened_observation])
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=.3)
        self.agent = DDPGAgent(nb_actions=nb_actions,
                               actor=self.actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
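
# A hedged usage sketch (not from the original source): the constructor builds and
# compiles the agent but keeps only self.agent, so a caller would create its own
# environment again, roughly:
# controller = RLAgent()
# controller.agent.fit(drone_sim(), nb_steps=50000, visualize=False, verbose=1)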
Example #15
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=100.)  # delta_range is deprecated in keras-rl; delta_clip takes a single scalar
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
#agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
agent.compile([RMSprop(lr=.001), RMSprop(lr=.001)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=True, verbose=1, nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.output, overwrite=True)

if not args.train:
    agent.load_weights(args.output)
    # Finally, evaluate our algorithm for 5 episodes.

    if args.env != "Arm":
        agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)
Example #16
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=HYP.THETA,
                                          mu=HYP.MU,
                                          sigma=HYP.SIGMA)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  batch_size=HYP.BATCH_SIZE,
                  nb_steps_warmup_actor=HYP.WARMUP_ACTOR,
                  nb_steps_warmup_critic=HYP.WARMUP_CRITIC,
                  random_process=random_process,
                  gamma=HYP.GAMMA,
                  target_model_update=HYP.TAU)
agent.compile(Adam(lr=HYP.LEARN_R, clipnorm=HYP.CLIPNORM), metrics=['mae'])

# Okay, now it's time to learn something! Visualization is disabled here because
# it slows down training quite a lot. You can always safely abort the training
# prematurely using Ctrl + C.
agent.fit(env,
          nb_steps=HYP.NB_STEPS,
          visualize=False,
          callbacks=[file_logger],
          verbose=HYP.VERBOSE)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env,
Example #17
class PartnerApproximatingLearner(Controller):
    """ A controller that learns how the other agent behaves and adapts to that behavior """
    def __init__(self,
                 first_player: bool,
                 stop_ident_time=1e9,
                 do_rl=False,
                 learning_rate=0.01,
                 activation_fcn='relu',
                 learn_time_delta=0.2,
                 rl_time_delta=0.1,
                 epochs=2,
                 fit_batch_size=20,
                 learn_stack=LearningStack(),
                 real_env=CoopPendulum(),
                 rl_memory_span=50,
                 wolf=0.,
                 win_lr_reduction=1,
                 wolf_stop_rl=False):
        """ Sets various parameters, configures the ident, actor and critic NN and compiles the agent"""
        super(PartnerApproximatingLearner, self).__init__(
            first_player)  # Call to __init__ of parent class Controller
        self.learn_stack = learn_stack  # Controller specific LearningStack in which to save the experiences
        self.loosing_lr = learning_rate
        self.rl_lr = .001  # hyper-parameter
        self.win_lr_reduction = win_lr_reduction
        self.wolf = wolf
        self.wolf_stop_rl = wolf_stop_rl
        seed = np.random.randint(0, int(1e6)) + int(
            first_player
        ) * 100  # -> first player gets different seed than second

        # Configure neural network for identification:
        num_hidden_layer_ident = 3
        num_neurons_per_layer_ident = 16
        act_space_shape = real_env.action_space.shape
        obs_space_shape = real_env.observation_space.shape
        ident_nn = Sequential()
        ident_nn.add(
            Dense(num_neurons_per_layer_ident,
                  kernel_initializer=RandomUniform(minval=-1,
                                                   maxval=1,
                                                   seed=seed),
                  input_shape=obs_space_shape))
        for i in range(num_hidden_layer_ident -
                       1):  # Add the layers to the identification NN
            ident_nn.add(
                Dense(num_neurons_per_layer_ident,
                      kernel_initializer=RandomUniform(minval=-1,
                                                       maxval=1,
                                                       seed=seed + i)))
            ident_nn.add(Activation(activation_fcn))
        ident_nn.add(
            Dense(act_space_shape[0],
                  kernel_initializer=RandomUniform(minval=-0.0001,
                                                   maxval=0.0001,
                                                   seed=seed + 9)))
        ident_nn.add(Activation('linear'))
        opt = Adam(lr=learning_rate)  # hyper-parameter
        ident_nn.compile(optimizer=opt, loss='mse')  # hyper-parameter

        # Use the neural network inside a NNController for easy evaluation of the output:
        self.ident_ctrl = StaticNNController(
            first_player=(not self.first_player), neural_net=ident_nn)

        # Set other identification parameters
        self.ident_time_delta = learn_time_delta  # simulation time between training the other_model with experience
        self.last_ident_time = 0  # last time ident NN was trained
        self.epochs = epochs  # number of training epochs when its time to identify again
        self.fit_batch_size = fit_batch_size  # size of mini batch that the batch is split into for training by Keras
        self.stop_ident_time = stop_ident_time  # Time at which no training should occur anymore. Used for testing
        self.do_rl = do_rl
        if do_rl:
            self.rl_env = deepcopy(real_env)
            self.last_rl_time = -1
            self.rl_time_delta = rl_time_delta
            self.rl_env.set_ctrl_other(self.ident_ctrl)
            try:
                self.u_limit = self.rl_env.action_space_u1 if first_player else self.rl_env.action_space_u2
            except AttributeError:  # rl_env does not have individual limits
                self.u_limit = self.rl_env.action_space

            # Configure the Neural Networks of the RL-agent
            # 1. Actor:
            rl_num_hidden_layer_actor = 3
            rl_num_neurons_per_layer_actor = 16
            rl_actor = Sequential(
            )  # Actor is a Sequential Neural Network (MLP)
            rl_actor.add(Flatten(input_shape=(1, ) + obs_space_shape))
            for i in range(rl_num_hidden_layer_actor
                           ):  # Add the layers to the actor NN
                rl_actor.add(
                    Dense(rl_num_neurons_per_layer_actor,
                          kernel_initializer=RandomUniform(minval=-1,
                                                           maxval=1,
                                                           seed=seed + 10 +
                                                           i)))
                rl_actor.add(Activation(activation_fcn))
            rl_actor.add(
                Dense(act_space_shape[0],
                      kernel_initializer=RandomUniform(minval=-1,
                                                       maxval=1,
                                                       seed=seed + 19)))
            rl_actor.add(Activation('linear'))

            # 2. Critic:
            rl_num_hidden_layer_critic = 3
            rl_num_neurons_per_layer_critic = 32
            action_input = Input(shape=act_space_shape, name='action_input')
            observation_input = Input(shape=(1, ) + obs_space_shape,
                                      name='observation_input')
            flattened_observation = Flatten()(observation_input)
            rl_critic_nn = Concatenate()([action_input, flattened_observation])
            for i in range(rl_num_hidden_layer_critic):
                rl_critic_nn = Dense(rl_num_neurons_per_layer_critic,
                                     kernel_initializer=RandomUniform(
                                         minval=-1,
                                         maxval=1,
                                         seed=seed + 20 + i))(rl_critic_nn)
                rl_critic_nn = Activation(activation_fcn)(rl_critic_nn)
            rl_critic_nn = Dense(
                1,
                kernel_initializer=RandomUniform(minval=-1,
                                                 maxval=1,
                                                 seed=seed + 29))(rl_critic_nn)
            rl_critic_nn = Activation('linear')(rl_critic_nn)
            rl_critic = Model(inputs=[action_input, observation_input],
                              outputs=rl_critic_nn)

            # 3. Set training parameters for the Agent and compile it
            rl_frames_per_train = 200
            rl_mem_size = int(
                rl_memory_span *
                (round(1 / self.rl_time_delta) * rl_frames_per_train))
            rl_memory = SequentialMemory(limit=rl_mem_size, window_length=1)
            random_process = OrnsteinUhlenbeckProcess(size=act_space_shape[0],
                                                      theta=.15,
                                                      mu=0.,
                                                      sigma=.3)
            self.rl_agent = DDPGAgent(nb_actions=act_space_shape[0],
                                      actor=rl_actor,
                                      critic=rl_critic,
                                      critic_action_input=action_input,
                                      memory=rl_memory,
                                      nb_steps_warmup_critic=100,
                                      nb_steps_warmup_actor=100,
                                      random_process=random_process,
                                      gamma=.99,
                                      target_model_update=1e-3)
            self.rl_agent.compile(Adam(lr=self.rl_lr, clipnorm=1.),
                                  metrics=['mae'])
            self.rl_actor_ctrl = StaticNNController(
                first_player=self.first_player, neural_net=rl_actor)

    def ident_other(self):
        """ Updates Identification of the partner """
        batch = self.learn_stack.pick_random(
        )  # get a batch from the LearningStack
        batch_t, batch_x, batch_u = zip(*batch)
        batch_u1, batch_u2 = zip(*batch_u)
        inputs = np.asarray(batch_x)
        if self.first_player:  # -> player 2 has to be identified
            outputs = np.reshape(np.array(batch_u2), (-1, 1))
        else:  # -> player 1 has to be identified
            outputs = np.reshape(np.array(batch_u1), (-1, 1))
        self.ident_ctrl.neural_net.fit(inputs,
                                       outputs,
                                       batch_size=self.fit_batch_size,
                                       epochs=self.epochs,
                                       verbose=0,
                                       shuffle=True,
                                       validation_split=0.)

    def u(self, t, x) -> float:
        """ Calculates the control variable u of the learning controller (only for the "real" environment)
            The action inside the internal simulation is calculated by the actor NN and clipped by the env """
        if self.do_rl:
            u_self = self.rl_actor_ctrl.u(t, x)
            u_self = min(max(u_self, np.ndarray.item(self.u_limit.low)),
                         np.ndarray.item(self.u_limit.high))
        else:
            u_self = 0.
        return u_self

    def get_other_pred(self, t, x):
        """ Returns the expected output of the other controller for the given input """
        u_other_pred = self.ident_ctrl.u(t, x)
        return u_other_pred

    def calc_error_on_learning_stack(self):
        stack = self.learn_stack.get_all_experiences()
        stack_t, stack_x, stack_u = zip(*stack)
        stack_u1, stack_u2 = zip(*stack_u)
        predictions = list()
        for i in range(len(stack_x)):
            predictions.append(self.get_other_pred(stack_t[i], stack_x[i]))

        assert len(predictions) == len(stack_u1)
        assert len(predictions) == len(stack_u2)

        if self.first_player:  # Predicting Player 2
            error = [(stack_u2[j] - predictions[j])**2
                     for j in range(len(predictions))]
        else:  # Predicting Player 1
            error = [(stack_u1[j] - predictions[j])**2
                     for j in range(len(predictions))]

        mse = sum(error) / len(error)
        return mse

    def new_exp(self, exp):
        """ Saves the new experience (time, state, control variables) on the stack and
            triggers rl/ident if enough time passed """
        self.learn_stack.add(exp[0:3])
        t_now = exp[0]
        winning = False
        if len(exp) > 3:  # if "real" reward is supplied: check if within winning limits
            winning = exp[3] > self.wolf  # hyper-parameter

        if self.do_rl and round(
                t_now - self.last_rl_time,
                5) >= self.rl_time_delta:  # enough time passed since last RL
            if winning:
                K.set_value(self.rl_agent.actor_optimizer.lr,
                            self.rl_lr / self.win_lr_reduction)
                K.set_value(self.rl_agent.critic.optimizer.optimizer.lr,
                            self.rl_lr / self.win_lr_reduction)
            else:
                K.set_value(self.rl_agent.actor_optimizer.lr, self.rl_lr)
                K.set_value(self.rl_agent.critic.optimizer.optimizer.lr,
                            self.rl_lr)
            if not (self.wolf_stop_rl and winning):
                self.improve_policy()
                self.last_rl_time = t_now

        if round(t_now - self.last_ident_time,
                 5) >= self.ident_time_delta and t_now < self.stop_ident_time:
            if winning:
                K.set_value(self.ident_ctrl.neural_net.optimizer.lr,
                            self.loosing_lr / self.win_lr_reduction)
            else:
                K.set_value(self.ident_ctrl.neural_net.optimizer.lr,
                            self.loosing_lr)
            self.ident_other(
            )  # train my model of the other controller on data from my LearningStack
            self.last_ident_time = t_now

    def improve_policy(self):
        """ Does an episode of RL to improve critic and actor of the rl_agent """
        self.rl_agent.fit(self.rl_env,
                          nb_steps=200,
                          visualize=False,
                          verbose=0,
                          nb_max_episode_steps=200)
Example #18
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0.1, sigma=0.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(lr=0.001), metrics=['mae'])
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=473780)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Example #19
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    # print(actor.summary())

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print(critic.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)

    if REWARD == "normal":
        ddpg_normal = DDPGAgent(nb_actions=nb_actions,
                                actor=actor,
                                critic=critic,
                                critic_action_input=action_input,
                                memory=memory,
                                nb_steps_warmup_critic=100,
                                nb_steps_warmup_actor=100,
                                random_process=random_process,
                                gamma=.99,
                                target_model_update=1e-3)
        ddpg_normal.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])

        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        history_normal = ddpg_normal.fit(env,
                                         nb_steps=150000,
                                         visualize=False,
                                         verbose=2,
                                         nb_max_episode_steps=200)

        # After training is done, we save the final weights.
        ddpg_normal.save_weights(os.path.join(
            LOG_DIR, 'ddpg_normal_{}_weights.h5f'.format(ENV_NAME)),
                                 overwrite=True)
        # Finally, evaluate our algorithm for 5 episodes.
        ddpg_normal.test(env,
                         nb_episodes=5,
                         visualize=False,
                         verbose=2,
                         nb_max_episode_steps=200)

        pandas.DataFrame(history_normal.history).to_csv(
            os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        processor_noisy = PendulumSurrogateProcessor(weight=WEIGHT,
                                                     surrogate=False,
                                                     noise_type=NOISE_TYPE)
        ddpg_noisy = DDPGAgent(nb_actions=nb_actions,
                               actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3,
                               processor=processor_noisy)
        ddpg_noisy.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_noisy = ddpg_noisy.fit(env,
                                       nb_steps=150000,
                                       visualize=False,
                                       verbose=2,
                                       nb_max_episode_steps=200)
        ddpg_noisy.save_weights(os.path.join(
            LOG_DIR, 'ddpg_noisy_{}_weights.h5f'.format(ENV_NAME)),
                                overwrite=True)
        ddpg_noisy.test(env,
                        nb_episodes=5,
                        visualize=False,
                        verbose=2,
                        nb_max_episode_steps=200)

        pandas.DataFrame(history_noisy.history).to_csv(
            os.path.join(LOG_DIR, "noisy.csv"))

    elif REWARD == "surrogate":
        processor_surrogate = PendulumSurrogateProcessor(weight=WEIGHT,
                                                         surrogate=True,
                                                         noise_type=NOISE_TYPE)
        ddpg_surrogate = DDPGAgent(nb_actions=nb_actions,
                                   actor=actor,
                                   critic=critic,
                                   critic_action_input=action_input,
                                   memory=memory,
                                   nb_steps_warmup_critic=100,
                                   nb_steps_warmup_actor=100,
                                   random_process=random_process,
                                   gamma=.99,
                                   target_model_update=1e-3,
                                   processor=processor_surrogate)
        ddpg_surrogate.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_surrogate = ddpg_surrogate.fit(env,
                                               nb_steps=150000,
                                               visualize=False,
                                               verbose=2,
                                               nb_max_episode_steps=200)

        ddpg_surrogate.save_weights(os.path.join(
            LOG_DIR, 'ddpg_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                                    overwrite=True)
        ddpg_surrogate.test(env,
                            nb_episodes=5,
                            visualize=False,
                            verbose=2,
                            nb_max_episode_steps=200)

        pandas.DataFrame(history_surrogate.history).to_csv(
            os.path.join(LOG_DIR, "surrogate.csv"))

    else:
        raise NotImplementedError
Beispiel #20
0
    def __init__(self,
                 env: gym.Env,
                 logger=Logger(),
                 n_layers_actor=3,
                 n_units_actor=16,
                 n_layers_critic=3,
                 n_units_critic=32,
                 sigma_decay=1,
                 sigma=0.3):
        nb_actions = env.action_space.shape[0]

        ###
        #        obs_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        #        x = Flatten()(obs_input)
        #        x = Dense(units=256, activation='relu')(x)
        #
        #        action_input = Input(shape=(nb_actions,), name='action_input')
        #        x_c = Concatenate()([x, action_input])
        #
        #        x_critic = Dense(units=128, activation='relu')(x_c)
        #        q_value = Dense(units=1)(x_critic)
        #
        #        x_actor = Dense(units=128, activation='relu')(x)
        #        action = Dense(units=nb_actions, activation='tanh')(x_actor)
        #
        #        actor = Model(inputs=obs_input, outputs = action)
        #        critic = Model(inputs=[action_input, obs_input], outputs = q_value)

        obs_input_actor = Input(shape=(1, ) + env.observation_space.shape,
                                name='observation_input')
        x_ac = Flatten()(obs_input_actor)
        x_ac = Dense(units=256, activation='relu')(x_ac)

        obs_input_critic = Input(shape=(1, ) + env.observation_space.shape,
                                 name='observation_input')
        x_cr = Flatten()(obs_input_critic)
        x_cr = Dense(units=256, activation='relu')(x_cr)
        action_input = Input(shape=(nb_actions, ), name='action_input')
        x_cr = Concatenate()([x_cr, action_input])

        x_critic = Dense(units=128, activation='relu')(x_cr)
        q_value = Dense(units=1)(x_critic)

        x_actor = Dense(units=128, activation='relu')(x_ac)
        action = Dense(units=nb_actions, activation='tanh')(x_actor)

        actor = Model(inputs=obs_input_actor, outputs=action)
        critic = Model(inputs=[action_input, obs_input_critic],
                       outputs=q_value)

        #        actor = Sequential()
        #        actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
        #        for i in range(n_layers_actor):
        #            actor.add(Dense(n_units_actor))
        #            #actor.add(BatchNormalization())
        #            actor.add(Activation('relu'))
        #            #actor.add(LeakyReLU())
        #        actor.add(Dense(nb_actions))
        #        actor.add(Activation('tanh'))
        #
        #        action_input = Input(shape=(nb_actions,), name='action_input')
        #        observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        #        flattened_observation = Flatten()(observation_input)
        #        x = Concatenate()([action_input, flattened_observation])
        #        for i in range(n_layers_critic):
        #            x = Dense(n_units_critic)(x)
        #            #x = BatchNormalization()(x)
        #            x = Activation('relu')(x)
        #            #x = LeakyReLU()(x)
        #        x = Dense(1)(x)
        #        x = Activation('linear')(x)
        #        critic = Model(inputs=[action_input, observation_input], outputs=x)
        #
        #        action_input = Input(shape=(nb_actions,), name='action_input')
        #        observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
        #        flattened_observation = Flatten()(observation_input)
        #        xo = Dense(n_units_critic, activation='relu')(flattened_observation)
        #        #xo = Dense(n_units_critic, activation='relu')(xo)
        #        x = Concatenate()([xo, action_input])
        #        for i in range(n_layers_critic-1):
        #            x = Dense(n_units_critic, activation='relu')(x)
        #        x = Dense(1)(x)
        #        x = Activation('linear')(x)
        #        critic = Model(inputs=[action_input, observation_input], outputs=x)

        memory = SequentialMemory(limit=1000000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=sigma)
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=1000,
                          nb_steps_warmup_actor=1000,
                          random_process=random_process,
                          gamma=.99,
                          target_model_update=1e-3,
                          batch_size=64,
                          train_interval=4)
        agent.compile([Adam(lr=.0001, clipnorm=1.),
                       Adam(lr=.0001)],
                      metrics=['mae'])
        self.agent = agent
        self.env = env
        self.sigma_decay = sigma_decay
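        # NOTE: sigma_decay is only stored here; the exploration-noise decay it controls is
        # presumably applied elsewhere in this class, outside the excerpt shown.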
        super().__init__(env, logger)
Beispiel #21
0
class KerasDDPGAgent(KerasAgent):
    """
    A DDPG agent built with Keras and keras-rl.

    For details about the Deep Deterministic Policy Gradient algorithm, see
    "Continuous control with deep reinforcement learning" by Lillicrap et al.:
    https://arxiv.org/abs/1509.02971
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 filename='KerasDDPGAgent.h5f'):
        nb_actions = action_space.shape[0]

        # Actor network
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + observation_space.shape))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(32))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('sigmoid'))
        print(actor.summary())

        # Critic network
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = concatenate([action_input, flattened_observation])
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(64)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Setup Keras RL's DDPGAgent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(theta=.15,
                                                  mu=0.,
                                                  sigma=.2,
                                                  size=nb_actions)
        self.agent = DDPGAgent(nb_actions=nb_actions,
                               actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3,
                               delta_clip=1.)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

        self.filename = filename
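
    # A minimal usage sketch (hypothetical: the KerasAgent base class is not shown in this
    # excerpt, so only the keras-rl calls on self.agent are known API; the environment choice
    # is an assumption):
    #
    #   env = gym.make('Pendulum-v0')
    #   wrapper = KerasDDPGAgent(env.observation_space, env.action_space)
    #   wrapper.agent.fit(env, nb_steps=50000, visualize=False, verbose=1)
    #   wrapper.agent.save_weights(wrapper.filename, overwrite=True)
    #   wrapper.agent.test(env, nb_episodes=5, visualize=False)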
Beispiel #22
0
x = Dense(400)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(learning_rate=1e-4), Adam(learning_rate=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1)

# After training is done, we save the final weights.
agent.save_weights(f'ddpg_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Beispiel #23
0
random_process = GaussianWhiteNoiseProcess(mu=0.0,
                                           sigma=0.8,
                                           sigma_min=0.05,
                                           n_steps_annealing=650000)
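
# The Gaussian exploration noise is annealed linearly from sigma=0.8 down to sigma_min=0.05
# over the first 650000 samples drawn from the process.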

# Create the agent
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  random_process=random_process,
                  nb_steps_warmup_actor=32,
                  nb_steps_warmup_critic=32,
                  target_model_update=1e-4,
                  gamma=0.9,
                  batch_size=32)
agent.compile(Adam(lr=1e-4), metrics=['mae'])

# Start training for 7.5M simulation steps (1.5M training steps with actions repeated 5 times)
agent.fit(env,
          nb_steps=1500000,
          visualize=False,
          action_repetition=5,
          verbose=2,
          nb_max_start_steps=0,
          log_interval=10000,
          callbacks=[])

# Test the agent
hist = agent.test(env, nb_episodes=10, action_repetition=1, visualize=True)
Beispiel #24
0
                                          size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3,
                  delta_clip=1.)  # warmup? delta_clip?
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile(Adam(lr=.001, clipnorm=1.),
              metrics=['mae'])  #critic learning rate?

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env,
              nb_steps=nallsteps,
              visualize=False,
              verbose=1,
              nb_max_episode_steps=env.timestep_limit,
              log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.model, overwrite=True)

# If TEST and TOKEN, submit to crowdAI
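# Hypothetical sketch of that submission branch (the osim-rl HTTP client calls and the
# remote_base value are assumptions based on the crowdAI challenge examples, not taken
# from this file):
#
#   from osim.http.client import Client
#   if not args.train and args.token:
#       client = Client(remote_base)
#       observation = client.env_create(args.token)
#       done = False
#       while not done:
#           action = agent.forward(np.array(observation))
#           observation, reward, done, info = client.env_step(action.tolist())
#       client.submit()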
Beispiel #25
0
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_range=(-100., 100.))
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
#agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
agent.compile([RMSprop(lr=.001), RMSprop(lr=.001)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=True, verbose=1, nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.output, overwrite=True)

if not args.train:
    agent.load_weights(args.output)
    # Finally, evaluate our algorithm for 5 episodes.

    if args.env != "Arm":
        agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)
Beispiel #26
0
# Set up the agent, using the Keras models defined above along with the policy and actions

#Discrete actions:
policy = EpsGreedyQPolicy()
testPolicy = GreedyQPolicy()
#agent = DQNAgent(model=actorModel, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, policy=policy, test_policy=testPolicy)

#continuous actions:
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(actor=actorModel,
                  critic=criticModel,
                  nb_actions=nb_actions,
                  memory=memory,
                  nb_steps_warmup_actor=100,
                  nb_steps_warmup_critic=100,
                  critic_action_input=action_input,
                  random_process=random_process)

#compile model
agent.compile(Nadam(lr=1e-3, clipnorm=0.1), metrics=['mae'])

# Okay, now it's time to learn something!
# We visualize the training here for show, but this slows down training quite a lot.
agent.fit(env, nb_steps=50000, visualize=True, verbose=2)

#TEST!
#blockingVar = input('Press a key!: ')
agent.test(env, nb_episodes=5, visualize=True)
Beispiel #27
0
if(args.PER==False):
	memory = NonSequentialMemory(limit=args.memory_size, window_length=1)
elif(args.PER==True):
	memory = PrioritisedNonSequentialMemory(limit=args.memory_size, alpha=args.alpha, beta=args.beta, window_length=1) ## 'proportional' priority replay implementation
else:
	print("\nRun vanilla_keras_rl/keras-rl/examples/ddpg_mujoco.py for no PER or HER!")
	sys.exit(1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)

## WARNING: make sure memory_interval is 1 for HER to work
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, pretanh_model=pretanh_model, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, batch_size=args.batch_size,
                  delta_clip=args.delta_clip, random_process=random_process, gamma=args.gamma,
                  target_model_update=args.soft_update, do_HER=args.HER, K=args.K, HER_strategy=args.her_strategy,
                  do_PER=args.PER, epsilon=1e-4, processor=MujocoProcessor(), pretanh_weight=args.pretanh_weight)
agent.compile([Adam(lr=args.actor_lr, clipnorm=args.actor_gradient_clip), Adam(lr=args.critic_lr, clipnorm=args.critic_gradient_clip)], metrics=['mae'])

if(args.HER==True and args.PER==False):
	print("\nTraining with Hindsight Experience Replay\n")
	save_data_path_local = 'HER/'+args.ENV_NAME+'.json'
elif(args.HER==False and args.PER==True):
	print("\nTraining with Prioritised Experience Replay\n")
	save_data_path_local = 'PER/'+args.ENV_NAME+'.json'
elif(args.HER==True and args.PER==True):
	print("\nTraining with Prioritised Hindsight Experience Replay\n")
	save_data_path_local = 'PHER/'+args.ENV_NAME+'.json'
else:
	# Neither HER nor PER: fall back to a plain save path so save_data_path_local is always defined
	print("\nTraining without HER or PER\n")
	save_data_path_local = args.ENV_NAME+'.json'

if(args.train):
	""" Start Training (You can always safely abort the training prematurely using Ctrl + C, *once* ) """
	agent.fit(env, nb_steps=args.nb_train_steps, visualize=False, verbose=1, save_data_path=save_data_path_local, file_interval=args.file_interval, nb_max_episode_steps=args.max_step_episode)
def main():
    set_gpu_option()
    # OPTIONS
    ENV_NAME = 'DDPGEnv-v0'
    TIME_STEP = 30

    # Get the environment and extract the number of actions.

    PATH_TRAIN = '/home/data/training_x_150.h5'
    PATH_TEST = '/home/data/test_x_150.h5'
    """
    env = OhlcvEnv(TIME_STEP, path=PATH_TRAIN)
    env_test = OhlcvEnv(TIME_STEP, path=PATH_TEST)
    """
    store = pd.HDFStore(PATH_TRAIN, mode='r')
    varieties_list = store.keys()
    print('varieties_list: ', varieties_list)
    print('num varieties: ', len(varieties_list))
    
    variety = 'RB'
    print('variety: ', variety)
    
    # get selected features (CSV of features selected by mutual information, sorted by importance)
    SELECTED_FACTOR_PATH = '~/feature_selection/根据互信息选出的特征,根据重要性排序.csv'
    selected_factor_df = pd.read_csv(SELECTED_FACTOR_PATH, index_col=0)
    selected_factor_list = selected_factor_df[variety].to_list()
    
    env = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TRAIN, selected_factor_list=selected_factor_list)
    #env_test = DDPGEnv(TIME_STEP, variety=variety, path=PATH_TEST,  selected_factor_list=selected_factor_list)

    # random seed
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.shape[0]
    print('nb_actions: ', nb_actions)

    print('env.observation_space.shape: ', env.observation_space.shape)
    print('env.observation_space: ', env.observation_space)
    
    # create actor
    actor = create_actor(input_shape=env.shape, nb_actions=nb_actions)
    
    # create critic
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=env.shape, name='observation_input')
    critic = create_critic(action_input, observation_input)
    


    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=TIME_STEP)

    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
    ddpg = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3, processor=DDPGProcessor())
    ddpg.compile(optimizer=Adam(lr=1e-3), metrics=['mae'])

    log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1, write_grads=True)
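    # NOTE: tensorboard_callback is created here but is not passed to ddpg.fit below, so no
    # TensorBoard logs are written in this excerpt.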
    for _ in range(3):
        ddpg.fit(env, nb_steps=140000, nb_max_episode_steps=140000, visualize=False, verbose=2)

    """
                                              theta=0.15,
                                              mu=0.0,
                                              sigma=0.3)
    agent = DDPGAgent(
        nb_actions=nb_actions,
        actor=actor,
        critic=critic,
        critic_action_input=action_input,
        memory=memory,
        nb_steps_warmup_critic=1000,
        nb_steps_warmup_actor=1000,
        random_process=random_process,
        gamma=0.99,
        target_model_update=1e-3,
    )
    agent.compile(Adam(lr=0.001, clipnorm=1.0), metrics=["mae"])

    # # Okay, now it's time to learn something! We visualize the training here for show, but this
    # # slows down training quite a lot. You can always safely abort the training prematurely using
    # # Ctrl + C.
    agent.fit(env,
              nb_steps=100000,
              visualize=False,
              verbose=1,
              nb_max_episode_steps=288)

    # # After training is done, we save the final weights.
    agent.save_weights("ddpg_{}_weights.h5f".format(ENV_NAME), overwrite=True)

    # # Finally, evaluate our algorithm for 5 episodes.
    agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=288)
Beispiel #30
0
                                          sigma_min=0.01,
                                          n_steps_annealing=2900000)
# define the DDPG agent
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  batch_size=64,
                  memory=memory,
                  nb_steps_warmup_critic=3000,
                  nb_steps_warmup_actor=3000,
                  random_process=random_process,
                  gamma=GAMMA,
                  target_model_update=1e-4)
# compile the model
agent.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mse'])

callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre,
                                        filename_exp)

# ----------------------------------------------------------------------------------------------------------------------------------------
# Training phase

# fitting the agent
# agent.fit(env, nb_steps=3000000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=STEPS_PER_EPISODE,process_noise_std=process_noise_std)

# After training is done, we save the final weights.
# agent.save_weights(log_filename_pre+filename_exp+'/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
# common_func.save_process_noise(ENV_NAME, log_filename_pre, filename_exp, process_noise_std, theta)

#---------------------------------------------------------------------------------------------------------------------------------------
Beispiel #31
0
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                          theta=.15,
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.fit(env,
          nb_steps=50000,
          visualize=True,
          verbose=1,
          nb_max_episode_steps=200)

agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Beispiel #32
0
def get_agent(env, agent_id, model=1):
    global observation_size
    # Count number of actions
    if not ingy:
        nb_actions = env.action_space['action_movement'][0].shape[0] + 2
        # Count number of observations for input
        if observation_size == 0:
            observation_size += env.observation_space[
                'observation_self'].shape[0]
            observation_size += env.observation_space['agent_qpos_qvel'].shape[0] * \
                                env.observation_space['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space['box_obs'].shape[
                0] * env.observation_space['box_obs'].shape[1]
            observation_size += env.observation_space['ramp_obs'].shape[
                0] * env.observation_space['ramp_obs'].shape[1]
            # TODO: Not sure whether to include mask_a*_obs and mask_ab_obs_spoof in this observation input -AH
    else:
        nb_actions = env.action_space.spaces['action_movement'].spaces[
            0].shape[0][0] + 2
        # Count number of observations for input
        if observation_size == 0:
            observation_size += env.observation_space.spaces[
                'observation_self'].shape[0]
            if 'lidar' in env.observation_space.spaces:
                observation_size += env.observation_space.spaces[
                    'lidar'].shape[0]
            observation_size += env.observation_space.spaces['agent_qpos_qvel'].shape[0] * \
                                env.observation_space.spaces['agent_qpos_qvel'].shape[1]
            observation_size += env.observation_space.spaces['box_obs'].shape[0] * \
                                env.observation_space.spaces['box_obs'].shape[1]
            observation_size += env.observation_space.spaces['ramp_obs'].shape[0] * \
                                env.observation_space.spaces['ramp_obs'].shape[1]

    if model == 1:
        # Build the actor model
        actor = Sequential()
        actor.add(Flatten(input_shape=(
            1,
            observation_size,
        )))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('sigmoid'))  # Return values from 0 to 1
        # print(actor.summary())

        # Build the critic model
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(
            1,
            observation_size,
        ),
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=2.15,
                                                  mu=0,
                                                  sigma=3)
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=4000,
                          nb_steps_warmup_actor=4000,
                          random_process=random_process,
                          gamma=.9,
                          target_model_update=1e-3,
                          processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

    elif model == 2:
        # Build the actor model
        actor = Sequential()
        actor.add(Flatten(input_shape=(
            1,
            observation_size,
        )))
        actor.add(Dense(400))
        actor.add(Activation('relu'))
        actor.add(Dense(300))
        actor.add(Dropout(0.3))
        actor.add(Activation('relu'))
        actor.add(Dense(100))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(50))
        actor.add(Dropout(0.2))
        actor.add(Activation('elu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('softmax'))  # Softmax outputs lie in [0, 1] and sum to 1 across actions
        # print(actor.summary())

        # Build the critic model
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(
            1,
            observation_size,
        ),
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(400)(flattened_observation)
        x = Activation('relu')(x)
        x = Concatenate()([x, action_input])
        x = Dense(300)(x)
        x = Activation('relu')(x)
        x = Dropout(0.3)(x)
        x = Dense(100)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(50)(x)
        x = Activation('elu')(x)
        x = Dropout(0.2)(x)
        x = Dense(1)(x)
        x = Activation('tanh')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        # Build the agent
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=2.8,
                                                  mu=0,
                                                  sigma=3.5)
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=500,
                          nb_steps_warmup_actor=500,
                          random_process=random_process,
                          gamma=.9,
                          target_model_update=5e-2,
                          processor=MujocoProcessor(agent_id))
        agent.compile([Adam(lr=5e-1, decay=0.9),
                       Adam(lr=5e-1, decay=0.9)],
                      metrics=['mae'])

    return agent
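
# Hypothetical usage sketch (the multi-agent environment construction and the number of
# agents are not shown in this excerpt and are assumptions):
#
#   agents = [get_agent(env, agent_id=i, model=1) for i in range(n_agents)]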
Beispiel #33
0
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=2*NUM_STEPS, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
# random_process = OrnsteinUhlenbeckProcess(size=nb_actions, dt = env.tau, theta=1.0, mu=0.0, sigma=0.5, sigma_min=0.3, n_steps_annealing=NUM_STEPS)

agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.999, target_model_update=1e-3,
                  delta_clip=1.0)

agent.compile(Adam(lr=.001, clipnorm=1.0), metrics=['mae'])




# Optionally, we can reload a previous model's weights and continue training from there
# Remove the _actor or _critic from the filename. The load method automatically
# appends these.        
WEIGHTS_FILENAME = 'weights/ddpg_planar_crane_continuous-v0_weights.h5f'
# agent.load_weights(WEIGHTS_FILENAME)


callbacks = []
checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
#callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
Beispiel #34
0
x = Dense(400)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=1000000, visualize=False, verbose=1)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)