Example #1
def visualize(session_name):
    kwargs = {'viewer': True}

    ENV_NAME = 'singlePendulum-v0'
    env = gym.make(ENV_NAME, **kwargs)
    np.random.seed(7)
    env.seed(7)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    actor, critic, action_input = create_networks(env)

    memory = SequentialMemory(limit=400, window_length=1)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory)
    agent.compile(Adam(lr=.0005,
                       clipnorm=1.,
                       epsilon=1.e-7,
                       beta_1=0.9,
                       beta_2=0.999),
                  metrics=['mae'])

    checkpoint_filepath = 'checkpoint/ddpg_{}_{}_weights.h5f'.format(
        ENV_NAME, session_name)
    filepath = 'ddpg_{}_{}_weights.h5f'.format(ENV_NAME, session_name)
    agent.load_weights(filepath=filepath)

    env.viewer = True
    agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=400)
    env.close()
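Example #1 calls a create_networks(env) helper that is not included in the snippet. The sketch below is only a guess at its shape, modeled on the actor/critic construction used by the other keras-rl examples on this page; the layer sizes and import paths are assumptions, not the original implementation.

from keras.layers import Activation, Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential


def create_networks(env):
    # Hypothetical actor/critic builder in the style of the other examples.
    nb_actions = env.action_space.shape[0]

    # Actor: maps the (window, observation) input to an action vector.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(32, activation='relu'))
    actor.add(Dense(nb_actions, activation='linear'))

    # Critic: maps (action, observation) to a scalar Q-value.
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape,
                              name='observation_input')
    x = Concatenate()([action_input, Flatten()(observation_input)])
    x = Dense(32, activation='relu')(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(1, activation='linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    return actor, critic, action_input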
Example #2
def main(args):
    CUDA = torch.cuda.is_available()
    OUTPUT_RESULTS_DIR = './saver'
    ENVIRONMENT = 'SemisuperPendulumRandom-v0'
    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "DDPG", ENVIRONMENT,
                               TIMESTAMP)

    env = gym.make(ENVIRONMENT)
    env = wrappers.Monitor(env, SUMMARY_DIR, force=True)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound = env.action_space.high

    actor = ActorNetwork(state_dim, action_dim, action_bound, args.actor_lr,
                         args.tau, args.seed)
    target_actor = ActorNetwork(state_dim, action_dim, action_bound,
                                args.actor_lr, args.tau, args.seed)
    critic = CriticNetwork(state_dim, action_dim, action_bound, args.critic_lr,
                           args.tau, args.l2_decay, args.seed)
    target_critic = CriticNetwork(state_dim, action_dim, action_bound,
                                  args.critic_lr, args.tau, args.l2_decay,
                                  args.seed)

    if CUDA:
        actor = actor.cuda()
        target_actor = target_actor.cuda()
        critic = critic.cuda()
        target_critic = target_critic.cuda()

    replay_buffer = ReplayBuffer(args.bufferlength, args.seed)

    agent = DDPGAgent(actor,
                      target_actor,
                      critic,
                      target_critic,
                      replay_buffer,
                      batch_size=args.batch_size,
                      gamma=args.gamma,
                      seed=args.seed,
                      episode_len=args.episode_len,
                      episode_steps=args.episode_steps,
                      noise_mean=args.noise_mean,
                      noise_th=args.noise_th,
                      noise_std=args.noise_std,
                      noise_decay=args.noise_decay)

    if args.is_train:
        agent.train(env)
        agent.save_actor_weights(save_dir=OUTPUT_RESULTS_DIR,
                                 filename=args.actor_weights)
    else:
        agent.load_actor_weights(save_dir=OUTPUT_RESULTS_DIR,
                                 filename=args.actor_weights)
        agent.test(env)
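Example #2 reads a number of hyperparameters off an args namespace (learning rates, tau, noise settings, buffer length, and so on). The argparse wiring below is a hedged sketch of how such a namespace might be produced; every flag name and default value is an assumption inferred from the attribute accesses above, not the original script.

import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='DDPG on SemisuperPendulumRandom-v0')
    parser.add_argument('--actor-lr', dest='actor_lr', type=float, default=1e-4)
    parser.add_argument('--critic-lr', dest='critic_lr', type=float, default=1e-3)
    parser.add_argument('--tau', type=float, default=1e-3)  # soft target-update rate
    parser.add_argument('--l2-decay', dest='l2_decay', type=float, default=1e-2)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--bufferlength', type=int, default=1000000)
    parser.add_argument('--batch-size', dest='batch_size', type=int, default=64)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--episode-len', dest='episode_len', type=int, default=1000)
    parser.add_argument('--episode-steps', dest='episode_steps', type=int, default=200)
    parser.add_argument('--noise-mean', dest='noise_mean', type=float, default=0.0)
    parser.add_argument('--noise-th', dest='noise_th', type=float, default=0.15)
    parser.add_argument('--noise-std', dest='noise_std', type=float, default=0.2)
    parser.add_argument('--noise-decay', dest='noise_decay', type=float, default=1.0)
    parser.add_argument('--is-train', dest='is_train', action='store_true')
    parser.add_argument('--actor-weights', dest='actor_weights', default='ddpg_actor.pth')
    main(parser.parse_args())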
Example #3
def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    
    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0, nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False, nb_max_episode_steps=100)
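The test_ddpg snippet above (and most of the Keras examples on this page) assumes the standard keras-rl imports. As a reminder of where each name comes from, a typical import header would look roughly like this; exact module paths may differ slightly between keras-rl versions.

import random

import gym
import numpy as np
from keras.layers import Activation, Concatenate, Dense, Flatten, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess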
Example #4
class DDPG():
    def __init__(self, Env):
        self.env = Env
        nb_actions = self.env.action_space.shape[0]
        actor = Sequential()
        actor.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        actor.add(Dense(5))
        actor.add(Activation('relu'))
        actor.add(Dense(8))
        actor.add(Activation('relu'))
        actor.add(Dense(5))
        actor.add(Activation('relu'))
        # actor.add(Dense(16))
        # actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('softmax'))
        # print(actor.summary())

        action_input = Input(shape=(nb_actions,), name='action_input')
        observation_input = Input(shape=(1,) + Env.observation_space.shape, name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = concatenate([action_input, flattened_observation], name='concatenate')
        x = Dense(5)(x)
        x = Activation('relu')(x)
        x = Dense(8)(x)
        x = Activation('relu')(x)
        x = Dense(5)(x)
        x = Activation('relu')(x)
        # x = Dense(32)(x)
        # x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        # print(critic.summary())

        memory = SequentialMemory(limit=100000, window_length=1)
        # random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
        random_process = None
        self.agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                          memory=memory, nb_steps_warmup_critic=32, nb_steps_warmup_actor=32,
                          random_process=random_process, gamma=0, target_model_update=0.001)
        self.agent.processor = ShowActionProcessor(self.agent, self.env)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def fit(self):
        history = self.agent.fit(self.env, action_repetition=1, nb_steps=20000, visualize=False, verbose=1, nb_max_episode_steps=10)
        return history

    def save_weights(self):
        self.agent.save_weights('./store/ddpg_{}_weights2.h5f'.format("portfolio"), overwrite=True)

    def test(self):
        history = self.agent.test(self.env, nb_episodes=1, visualize=False, nb_max_episode_steps=10)
        return history

    def load_weights(self):
        self.agent.load_weights('./store/ddpg_{}_weights2.h5f'.format("portfolio"))
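Example #4 assigns a ShowActionProcessor to the agent but does not define it. In keras-rl, custom processors subclass rl.core.Processor; the version below is a minimal, hypothetical stand-in that simply prints each action before it is sent to the environment. Only the constructor signature is taken from the call above; the logging behaviour is an assumption.

from rl.core import Processor


class ShowActionProcessor(Processor):
    # Hypothetical processor that prints every action the agent emits.

    def __init__(self, agent, env):
        self.agent = agent
        self.env = env

    def process_action(self, action):
        # keras-rl calls this right before the action is passed to env.step().
        print('action:', action)
        return action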
Example #5
    def _train(self):
        env = CrazyflieEnvironment(self._cf)
        atexit.register(teardown_env, env, self._cf)

        np.random.seed(123)
        assert len(env.action_space.shape) == 1
        nb_actions = env.action_space.shape[0]

        # Next, we build a very simple model.
        actor = self.actor_model(env, nb_actions)
        action_input, critic = self.critic_model(env, nb_actions)

        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=.3)
        model_name = 'ddpg_{}_weights.h5f'.format('crazyflie')
        agent = DDPGAgent(nb_actions=nb_actions,
                          actor=actor,
                          critic=critic,
                          critic_action_input=action_input,
                          memory=memory,
                          nb_steps_warmup_critic=100,
                          nb_steps_warmup_actor=100,
                          random_process=random_process,
                          gamma=.99,
                          target_model_update=1e-3)
        if os.path.exists(model_name):
            agent.load_weights(model_name)
        agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

        try:
            agent.fit(env, nb_steps=50000, verbose=2)
            agent.test(env, nb_episodes=1)
        finally:
            agent.save_weights(model_name, overwrite=True)
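Example #5 registers a teardown_env handler with atexit, but the helper itself is not part of the excerpt. A plausible sketch is shown below: close the environment first, then the Crazyflie radio link. The close_link() call matches cflib's Crazyflie API, but treat the whole function as an assumption rather than the original code.

def teardown_env(env, cf):
    # Hypothetical cleanup: stop the environment, then close the radio link.
    try:
        env.close()
    finally:
        cf.close_link()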
Example #6
def test_ddpg():
    # TODO: replace this with a simpler environment where we can actually test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=50,
                      nb_steps_warmup_actor=50,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env,
              nb_steps=400,
              visualize=False,
              verbose=0,
              nb_max_episode_steps=100)
    h = agent.test(env,
                   nb_episodes=2,
                   visualize=False,
                   nb_max_episode_steps=100)
Example #7
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    plot_model(critic, to_file='critic.png', show_shapes=True)

# # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# # even the metrics!
    memory = SequentialMemory(limit=10000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                   memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                   random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# # Okay, now it's time to learn something! We visualize the training here for show, but this
# # slows down training quite a lot. You can always safely abort the training prematurely using
# # Ctrl + C.
    agent.fit(env, nb_steps=25000, visualize=False, verbose=1, nb_max_episode_steps=200)

# # After training is done, we save the final weights.
    agent.save_weights('ddpg_stokes_weights.h5f', overwrite=True)

# # Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)


Example #8
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      random_process=random_process,
                      nb_steps_warmup_actor=2048,
                      nb_steps_warmup_critic=1024,
                      target_model_update=1000,
                      gamma=0.95,
                      batch_size=128,
                      memory_interval=1)
    agent.compile((Adam(lr=1e-6), Adam(lr=1e-4)), metrics=['mae'])

    # Start training for 1.5M steps (action_repetition=1 here, so 1.5M simulation steps)

    agent.fit(env,
              nb_steps=1500000,
              visualize=True,
              action_repetition=1,
              verbose=1,
              nb_max_start_steps=0,
              nb_max_episode_steps=10000,
              log_interval=10000,
              callbacks=[])

    # Test the agent
    hist = agent.test(env,
                      nb_episodes=10,
                      action_repetition=1,
                      nb_max_episode_steps=5000,
                      visualize=True)
Example #9
def main_function(args, data):
    #### CONSTANT INITIALIZATION #####
    ## Model ##
    SIZE_HIDDEN_LAYER_ACTOR = data['SIZE_HIDDEN_LAYER_ACTOR'][0]
    LR_ACTOR = data['LR_ACTOR'][0]
    SIZE_HIDDEN_LAYER_CRITIC = data['SIZE_HIDDEN_LAYER_CRITIC'][0]
    LR_CRITIC = data['LR_CRITIC'][0]
    DISC_FACT = data['DISC_FACT'][0]
    TARGET_MODEL_UPDATE = data['TARGET_MODEL_UPDATE'][0]
    BATCH_SIZE = data['BATCH_SIZE'][0]
    REPLAY_BUFFER_SIZE = data['REPLAY_BUFFER_SIZE'][0]
    ## Exploration ##
    THETA = data['THETA'][0]
    SIGMA = data['SIGMA'][0]
    SIGMA_MIN = data['SIGMA_MIN'][0]
    N_STEPS_ANNEALING = data['N_STEPS_ANNEALING'][0]

    ## Acceleration ##
    ACTION_REPETITION = data['ACTION_REPETITION'][0]
    INTEGRATOR_ACCURACY = data['INTEGRATOR_ACCURACY'][0]

    # # Simulation ##
    N_STEPS_TRAIN = int(args.step)
    N_EPISODE_TEST = 100
    if args.visualize:
        N_EPISODE_TEST = 3
    VERBOSE = 1
    # 0: no progress output
    # 1: progress report every LOG_INTERVAL steps
    # 2: progress report every episode
    LOG_INTERVAL = 500

    # Save weights ##
    if not os.path.exists('weights'):
        os.mkdir('weights')
        print("Directory ", 'weights', " Created ")
    FILES_WEIGHTS_NETWORKS = './weights/' + args.model + '.h5f'

    # #### LOADING THE ENVIRONMENT #####
    if args.prosthetic:
        env = ProsContinueRewardWrapper(
            ProstheticsEnv(visualize=args.visualize,
                           integrator_accuracy=INTEGRATOR_ACCURACY))
    if not args.prosthetic:
        env = CustomDoneOsimWrapper(
            CustomRewardWrapper(
                RelativeMassCenterObservationWrapper(
                    NoObstacleObservationWrapper(
                        L2RunEnv(visualize=args.visualize,
                                 integrator_accuracy=0.005)))))

    env.reset()
    # Examine the action space ##
    action_size = env.action_space.shape[0]
    # action_size = int(env.action_space.shape[0]/2)  # for symmetry
    print('Size of each action:', action_size)

    # Examine the state space ##
    state_size = env.observation_space.shape[0]
    print('Size of state:', state_size)

    # #### ACTOR / CRITIC #####

    # Actor (mu) ##
    # The input shape is the same for both the prosthetic and the standard environment
    input_shape = (1, env.observation_space.shape[0])

    observation_input = Input(shape=input_shape, name='observation_input')

    x = Flatten()(observation_input)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x)
    x = Activation('relu')(x)
    x = Dense(action_size)(x)
    x = Activation('sigmoid')(x)

    actor = Model(inputs=observation_input, outputs=x)
    opti_actor = Adam(lr=LR_ACTOR)

    # Critic (Q) ##
    action_input = Input(shape=(action_size, ), name='action_input')

    x = Flatten()(observation_input)
    x = concatenate([action_input, x])
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)

    critic = Model(inputs=[action_input, observation_input], outputs=x)

    opti_critic = Adam(lr=LR_CRITIC)

    # #### SET UP THE AGENT #####
    # Initialize Replay Buffer ##
    memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1)

    # Random process (exploration) ##
    random_process = OrnsteinUhlenbeckProcess(
        theta=THETA,
        mu=0,
        sigma=SIGMA,
        sigma_min=SIGMA_MIN,
        size=action_size,
        n_steps_annealing=N_STEPS_ANNEALING)

    # random_process_l = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN,
    #                                           size=action_size, n_steps_annealing=N_STEPS_ANNEALING)
    # random_process_r = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN,
    #                                           size=action_size, n_steps_annealing=N_STEPS_ANNEALING)

    # DDPG agent parameters ##
    # agent = SymmetricDDPGAgent(nb_actions=action_size, actor=actor, critic=critic,
    #                            critic_action_input=action_input,
    #                            memory=memory, random_process_l=random_process_l, random_process_r=random_process_r,
    #                            gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE,
    #                            batch_size=BATCH_SIZE)

    agent = DDPGAgent(nb_actions=action_size,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      random_process=random_process,
                      gamma=DISC_FACT,
                      target_model_update=TARGET_MODEL_UPDATE,
                      batch_size=BATCH_SIZE)

    agent.compile(optimizer=[opti_critic, opti_actor])

    # #### TRAIN #####
    logdir = "keras_logs/" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S")
    robustensorboard = RobustTensorBoard(log_dir=logdir, hyperparams=data)
    saveBest = SaveBestEpisode()
    if args.train:
        if args.resume:
            agent.load_weights(FILES_WEIGHTS_NETWORKS)
        else:
            check_overwrite(args.model)

        agent.fit(env,
                  nb_steps=N_STEPS_TRAIN,
                  visualize=args.visualize,
                  verbose=VERBOSE,
                  log_interval=LOG_INTERVAL,
                  callbacks=[robustensorboard, saveBest],
                  action_repetition=ACTION_REPETITION)

        agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True)

    #### TEST #####
    if not args.train:
        agent.load_weights(FILES_WEIGHTS_NETWORKS)
        agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize)
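Examples #9 and #11 call check_overwrite(args.model) before training, but the helper is not shown. A plausible, hypothetical version that refuses to silently overwrite an existing weight file might look like this; the weight-file path is inferred from FILES_WEIGHTS_NETWORKS above and may not match the original.

import os
import sys


def check_overwrite(model_name):
    # Hypothetical guard: ask before clobbering an existing weight file.
    path = './weights/' + model_name + '.h5f'
    if os.path.exists(path):
        answer = input('{} already exists, overwrite it? [y/N] '.format(path))
        if answer.lower() != 'y':
            sys.exit(0)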
Example #10
# agent.load_weights('fit-weights.h5f')

'''
fit
'''
history = agent.learning(env,
                         policy,
                         policy_list,
                         nb_steps=1e7,
                         visualize=False,
                         log_interval=1000,
                         verbose=1,
                         nb_max_episode_steps=4000,
                         imitation_leaning_time=1e4,
                         reinforcement_learning_time=9e4)
# plt.plot(history.history['metrics'])
# plt.plot(history.history['reward'])
# plt.show()

sio.savemat(ENV_NAME + '-' + nowtime + '/fit.mat', history.history)
# After training is done, we save the final weights.
agent.save_weights(ENV_NAME + '-' + nowtime + '/fit-weights.h5f',
                   overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
history = agent.test(env,
                     nb_episodes=10,
                     visualize=True,
                     nb_max_episode_steps=5000)
sio.savemat(ENV_NAME + '-' + nowtime + '/test-final.mat', history.history)
Example #11
## Initialize Replay Buffer ##
memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1)
# window_length: useful for Atari games, i.e. how many consecutive frames to stack so the agent can infer dynamics (ball velocity, etc.)

## Random process (exploration) ##
random_process = OrnsteinUhlenbeckProcess(theta=THETA, mu=MEAN, sigma=SIGMA, size=action_size)

## DDPG agent parameters ##
agent = DDPGAgent(nb_actions=action_size, actor=actor, critic=critic,
    critic_action_input=action_input,
    memory=memory, random_process=random_process,
    gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE,
    batch_size=BATCH_SIZE)

agent.compile(optimizer=[opti_critic, opti_actor], metrics=['mae'])


##### TRAIN #####
if args.train:
    check_overwrite(args.model)
    history = agent.fit(env, nb_steps=N_STEPS_TRAIN, visualize=args.visualize, verbose=VERBOSE, log_interval=LOG_INTERVAL)
    agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True)
    save_plot_reward(history, args.model, params)


##### TEST #####
if not args.train:
    agent.load_weights(FILES_WEIGHTS_NETWORKS)
    history = agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize)
    save_result(history, args.model, params)
Example #12
                                          mu=0.,
                                          sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=100,
                  nb_steps_warmup_actor=100,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env,
          nb_steps=50000,
          visualize=False,
          verbose=1,
          log_interval=50,
          nb_max_episode_steps=None)

# After training is done, we save the final weights.
agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
#agent.load_weights('ddpg_Reacher-v2_weights_128.h5f')

# Finally, evaluate our algorithm for 30 episodes.
agent.test(env, nb_episodes=30, visualize=True, nb_max_episode_steps=None)
Example #13
                  random_process=random_process,
                  gamma=GAMMA,
                  target_model_update=1e-4)
# compile the model
agent.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mse'])

callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre,
                                        filename_exp)

# ----------------------------------------------------------------------------------------------------------------------------------------
# Training phase

# fitting the agent
# agent.fit(env, nb_steps=3000000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=STEPS_PER_EPISODE,process_noise_std=process_noise_std)

# After training is done, we save the final weights.
# agent.save_weights(log_filename_pre+filename_exp+'/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
# common_func.save_process_noise(ENV_NAME, log_filename_pre, filename_exp, process_noise_std, theta)

#---------------------------------------------------------------------------------------------------------------------------------------
# Testing phase
agent.load_weights(log_filename_pre + filename_exp +
                   '/ddpg_{}_weights.h5f'.format(ENV_NAME))

# # Finally, evaluate our algorithm.
history, state_history_nominal, episode_reward_nominal, action_history = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, \
  nb_max_episode_steps=STEPS_PER_EPISODE,  initial_state=np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]), \
  std_dev_noise=0, gamma=GAMMA)
# print(episode_reward_nominal, state_history_nominal)
# -----------------------------------------------------------------------------------------------------------------------------------------
Example #14
# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.load_weights('aviral_jump_new.h5f')
    print('weights loaded')
    agent.fit(env, nb_steps=nallsteps, visualize=True, verbose=1, nb_max_episode_steps=1000, log_interval=1000)
    print('TRAINED THE MODELS')
    # After training is done, we save the final weights.
    agent.save_weights(args.model, overwrite=True)

if not args.train:
    print(args.model)
    agent.load_weights(args.model)
    # sys.exit(0)
    # Finally, evaluate our algorithm for 10 episodes.
    h = Histories()
    agent.test(env, nb_episodes=10, visualize=False, nb_max_episode_steps=1000, action_repetition=2, callbacks=[h])
    # print h.action_list
    f = open('values_jump_new.txt', 'wb')
    # f.write(str(h.action_list))
    pickle.dump(h.action_dict_list, f)
    f.close()
    print("done pickling")
    # for i in range(600):
    #     ac = agent.forward(obs)
    #     f.write(str(ac))
    #     f.write('\n\n\n')
    #     obs, rew, _, _ = env.step(ac)

    # f.close()
Example #15
x = Activation('relu')(x)

# Output Layer
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=2*NUM_STEPS, window_length=1)
# random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, dt=env.tau, theta=0.6, mu=0.0, sigma=0.5, sigma_min=0.15, n_steps_annealing=NUM_STEPS)

agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.999, target_model_update=1e-3,
                  delta_clip=1.0)

agent.compile(Adam(lr=.001, clipnorm=1.0), metrics=['mae'])



# Load the model weights - this method will automatically load the weights for
# both the actor and critic
agent.load_weights(FILENAME)


# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, action_repetition=5)  # nb_max_episode_steps=500,
Example #16
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Set up the agent for training
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1.)
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=False, verbose=1, nb_max_episode_steps=env.timestep_limit, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.model, overwrite=True)

if not args.train:
    agent.load_weights(args.model)
    # Finally, evaluate our algorithm for 1 episode.
    agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=500)
Example #17
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    # print(actor.summary())

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    # print(critic.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)

    if REWARD == "normal":
        ddpg_normal = DDPGAgent(nb_actions=nb_actions,
                                actor=actor,
                                critic=critic,
                                critic_action_input=action_input,
                                memory=memory,
                                nb_steps_warmup_critic=100,
                                nb_steps_warmup_actor=100,
                                random_process=random_process,
                                gamma=.99,
                                target_model_update=1e-3)
        ddpg_normal.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])

        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        history_normal = ddpg_normal.fit(env,
                                         nb_steps=150000,
                                         visualize=False,
                                         verbose=2,
                                         nb_max_episode_steps=200)

        # After training is done, we save the final weights.
        ddpg_normal.save_weights(os.path.join(
            LOG_DIR, 'ddpg_normal_{}_weights.h5f'.format(ENV_NAME)),
                                 overwrite=True)
        # Finally, evaluate our algorithm for 5 episodes.
        ddpg_normal.test(env,
                         nb_episodes=5,
                         visualize=False,
                         verbose=2,
                         nb_max_episode_steps=200)

        pandas.DataFrame(history_normal.history).to_csv(
            os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        processor_noisy = PendulumSurrogateProcessor(weight=WEIGHT,
                                                     surrogate=False,
                                                     noise_type=NOISE_TYPE)
        ddpg_noisy = DDPGAgent(nb_actions=nb_actions,
                               actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3,
                               processor=processor_noisy)
        ddpg_noisy.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_noisy = ddpg_noisy.fit(env,
                                       nb_steps=150000,
                                       visualize=False,
                                       verbose=2,
                                       nb_max_episode_steps=200)
        ddpg_noisy.save_weights(os.path.join(
            LOG_DIR, 'ddpg_noisy_{}_weights.h5f'.format(ENV_NAME)),
                                overwrite=True)
        ddpg_noisy.test(env,
                        nb_episodes=5,
                        visualize=False,
                        verbose=2,
                        nb_max_episode_steps=200)

        pandas.DataFrame(history_noisy.history).to_csv(
            os.path.join(LOG_DIR, "noisy.csv"))

    elif REWARD == "surrogate":
        processor_surrogate = PendulumSurrogateProcessor(weight=WEIGHT,
                                                         surrogate=True,
                                                         noise_type=NOISE_TYPE)
        ddpg_surrogate = DDPGAgent(nb_actions=nb_actions,
                                   actor=actor,
                                   critic=critic,
                                   critic_action_input=action_input,
                                   memory=memory,
                                   nb_steps_warmup_critic=100,
                                   nb_steps_warmup_actor=100,
                                   random_process=random_process,
                                   gamma=.99,
                                   target_model_update=1e-3,
                                   processor=processor_surrogate)
        ddpg_surrogate.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae'])
        history_surrogate = ddpg_surrogate.fit(env,
                                               nb_steps=150000,
                                               visualize=False,
                                               verbose=2,
                                               nb_max_episode_steps=200)

        ddpg_surrogate.save_weights(os.path.join(
            LOG_DIR, 'ddpg_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                                    overwrite=True)
        ddpg_surrogate.test(env,
                            nb_episodes=5,
                            visualize=False,
                            verbose=2,
                            nb_max_episode_steps=200)

        pandas.DataFrame(history_surrogate.history).to_csv(
            os.path.join(LOG_DIR, "surrogate.csv"))

    else:
        raise NotImplementedError
Example #18
def evaluate_model(model_path=None, interactive=False, seed=12345):
    np.random.seed(seed)

    actor, critic, action_input = define_actor_critic_models(actions=3)
    memory = SequentialMemory(limit=10000, window_length=1)
    random_process = GaussianWhiteNoiseProcess(mu=0,
                                               sigma=0,
                                               sigma_min=0,
                                               n_steps_annealing=1)

    agent = DDPGAgent(nb_actions=3,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=500,
                      nb_steps_warmup_actor=100,
                      random_process=random_process,
                      gamma=.95,
                      target_model_update=0.0001,
                      batch_size=32)
    agent.compile([RMSprop(lr=.0001), RMSprop(lr=.01)], metrics=['mae'])

    if model_path is not None:
        agent.load_weights(model_path)

    # Train Evaluation
    env = CameraControlEnvCont(dataset_pickle_path='data/dataset.pickle',
                               testing=False,
                               interactive=interactive)
    env.seed(seed)
    res = agent.test(env,
                     nb_episodes=500,
                     nb_max_episode_steps=100,
                     verbose=0,
                     visualize=False)
    train_mean_reward = np.mean(res.history['episode_reward'])
    before_train_position_error = np.mean(
        np.abs(env.init_position_error_pixels))
    before_train_zoom_error = np.mean(np.abs(env.init_zoom_error_pixels))
    after_train_position_error = np.mean(
        np.abs(env.final_position_error_pixels))
    after_train_zoom_error = np.mean(np.abs(env.final_zoom_error_pixels))
    print("Training evaluation: ")
    print("Mean reward: ", train_mean_reward)
    print("Position: ", before_train_position_error, " -> ",
          after_train_position_error)
    print("Zoom: ", before_train_zoom_error, " -> ", after_train_zoom_error)

    # Test Evaluation
    env = CameraControlEnvCont(dataset_pickle_path='data/dataset.pickle',
                               testing=True,
                               interactive=interactive)
    env.seed(seed)
    res = agent.test(env,
                     nb_episodes=500,
                     nb_max_episode_steps=100,
                     verbose=0,
                     visualize=False)
    train_mean_reward = np.mean(res.history['episode_reward'])
    before_train_position_error = np.mean(
        np.abs(env.init_position_error_pixels))
    before_train_zoom_error = np.mean(np.abs(env.init_zoom_error_pixels))
    after_train_position_error = np.mean(
        np.abs(env.final_position_error_pixels))
    after_train_zoom_error = np.mean(np.abs(env.final_zoom_error_pixels))
    print("Testing evaluation: ")
    print("Mean reward: ", train_mean_reward)
    print("Position: ", before_train_position_error, " -> ",
          after_train_position_error)
    print("Zoom: ", before_train_zoom_error, " -> ", after_train_zoom_error)
Example #19
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

n = DroneNetwork(nb_actions=nb_actions,
                 observation_shape=env.observation_space.shape)

# Next, we build a very simple model.
actor = n.create_actor()
critic = n.create_critic()
action_input = n.get_action_input()

actor.summary()
critic.summary()
print(action_input)

memory = SequentialMemory(limit=100000, window_length=1)

agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory)

agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

agent.load_weights('ddpg_{}_weights.h5f'.format('drone'))
agent.test(env, nb_episodes=100000, visualize=True)
#agent.test(env, nb_episodes=20, visualize=True, nb_max_episode_steps=50)
env.close()
Example #20
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Set up the agent for training
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=1.)
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
if args.train:
    agent.fit(env, nb_steps=nallsteps, visualize=False, verbose=1, nb_max_episode_steps=200, log_interval=10000)
    # After training is done, we save the final weights.
    agent.save_weights(args.model, overwrite=True)

if not args.train:
    agent.load_weights(args.model)
    # Finally, evaluate our algorithm for 5 episodes.
    agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=1000)
Example #21
                                           sigma=0.8,
                                           sigma_min=0.05,
                                           n_steps_annealing=650000)

# Create the agent
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  random_process=random_process,
                  nb_steps_warmup_actor=32,
                  nb_steps_warmup_critic=32,
                  target_model_update=1e-4,
                  gamma=0.9,
                  batch_size=32)
agent.compile(Adam(lr=1e-4), metrics=['mae'])

# Start training for 7.5M simulation steps (1.5M training steps with actions repeated 5 times)
agent.fit(env,
          nb_steps=1500000,
          visualize=False,
          action_repetition=5,
          verbose=2,
          nb_max_start_steps=0,
          log_interval=10000,
          callbacks=[])

# Test the agent
hist = agent.test(env, nb_episodes=10, action_repetition=1, visualize=True)
Example #22
callbacks = build_callbacks(ENV_NAME)
test_callbacks = build_test_callbacks(ENV_NAME)

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
#agent.fit(env, nb_steps=500000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=30)

# After training is done, we save the final weights.
#agent.save_weights('results/InvertedPendulum/exp_6/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
agent.load_weights(
    'results/InvertedPendulum/exp_6/ddpg_{}_weights.h5f'.format(ENV_NAME))

# Finally, evaluate our algorithm for 1 episode.
history, state_history_nominal, episode_reward_nominal = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, callbacks=test_callbacks, nb_max_episode_steps=30, \
                                                         initial_state=[0, np.pi, 0, 0], std_dev_noise=0, gamma=GAMMA)
u_max = 12
print(episode_reward_nominal, state_history_nominal)
'''
f = open("results/InvertedPendulum/exp_3/data.txt", "a")

for i in frange(0.0, 1.0, 0.05):
    episode_reward_n = 0
    Var_n = 0
    terminal_mse = 0
    Var_terminal_mse = 0
    for j in range(n_samples):

        history, state_history, episode_reward = agent.test(env, nb_episodes=1, visualize=False, action_repetition=1, nb_max_episode_steps=30, initial_state=[0, np.pi, 0, 0], std_dev_noise=i*u_max, gamma=GAMMA)
        episode_reward_n += episode_reward
        Var_n += (episode_reward)**2
Example #23
    logger.info('Iteration #{}'.format(n))

    #train
    train_history = agent.fit(env,
                              nb_steps=nb_stepis,
                              visualize=False,
                              verbose=1,
                              nb_max_episode_steps=nb_stepis)

    # After training is done, we save the final weights.
    agent.save_weights('ddpg_{}_nomad_v3_weights.h5f'.format(ENV_NAME),
                       overwrite=True)

    # Save memory
    pickle.dump(memory, open("memory2.pkl", "wb"))

    # Finally, evaluate our algorithm for nb_episodes episodes.
    test_history = agent.test(env,
                              nb_episodes=nb_episodes,
                              visualize=False,
                              nb_max_episode_steps=nb_stepis)

    #loading weights and model and logging taken from:
    #https://github.com/olavt/gym_co2_ventilation/blob/master/examples/test_keras_rl_continious.py
    train_rewards = train_history.history['episode_reward']
    test_rewards = test_history.history['episode_reward']
    for i in range(0, nb_episodes):
        episode_logger.info('{},{},{}'.format(((n - 1) * nb_episodes + i + 1),
                                              train_rewards[i],
                                              test_rewards[i]))
Example #24
class KerasDDPGAgent(object):
    '''
    classdocs
    '''
    def __init__(self, opts):
        self.metadata = {'discrete_actions': False}

        self.opts = opts

    def configure(self, observation_space_shape, nb_actions):
        # Next, we build a simple model.
        # actor network
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + observation_space_shape))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('linear'))
        print(actor.summary())

        # critic network
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + observation_space_shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = concatenate([action_input, flattened_observation])
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=.3)
        self.agent = DDPGAgent(nb_actions=nb_actions,
                               actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def train(self, env, nb_steps, visualize, verbosity):
        # Okay, now it's time to learn something! We visualize the training here for show, but this
        # slows down training quite a lot. You can always safely abort the training prematurely using
        # Ctrl + C.
        self.agent.fit(env,
                       nb_steps=nb_steps,
                       visualize=visualize,
                       verbose=verbosity,
                       nb_max_episode_steps=200)

    def test(self, env, nb_episodes, visualize):
        # Finally, evaluate the agent for nb_episodes episodes.
        self.agent.test(env,
                        nb_episodes=nb_episodes,
                        visualize=visualize,
                        nb_max_episode_steps=200)

    def load_weights(self, load_file):
        self.agent.load_weights(load_file)

    def save_weights(self, save_file, overwrite):
        self.agent.save_weights(save_file, overwrite=overwrite)
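A short usage sketch for the KerasDDPGAgent wrapper above, assuming a Gym-style continuous-control environment; the environment name, step counts, and weight filename are placeholders.

import gym

env = gym.make('Pendulum-v0')
agent = KerasDDPGAgent(opts={})
agent.configure(env.observation_space.shape, env.action_space.shape[0])
agent.train(env, nb_steps=50000, visualize=False, verbosity=1)
agent.save_weights('keras_ddpg_pendulum.h5f', overwrite=True)
agent.test(env, nb_episodes=5, visualize=False)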
Example #25
callbacks = []
checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
#callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
callbacks += [FileLogger(log_filename, interval=100)]

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env,
          nb_steps=NUM_STEPS,
          callbacks=callbacks,
          visualize=False,
          verbose=1)  #, nb_max_episode_steps=500)

# After training is done, we save the final weights.
filename = 'weights/ddpg_{}_weights_{}_{}_{}_{}.h5f'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
agent.save_weights(filename, overwrite=True)

# We'll also save a simply named copy so that a test run immediately
# after training can find the weights easily.
filename = 'weights/ddpg_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(filename, overwrite=True)

# Finally, evaluate the trained agent.
agent.test(env, visualize=True)  # nb_max_episode_steps=500,
Example #26
        else:
            agent.load_weights('/home/bdb3m/swmm_rl/agent_weights/ddpg_swmm_weights.h5f')
            agent.fit(env, nb_steps=train_steps, verbose=0)
            agent.save_weights('/home/bdb3m/swmm_rl/agent_weights/ddpg_swmm_weights.h5f', overwrite=True)
            env.close()

        if file_num % 10 == 0:
            print("finished training on ", file_num, " files")
        file_num += 1

# loop through testing envs
for file in os.scandir("/home/bdb3m/swmm_rl/syn_inp_test"):
    if file.name.endswith('.inp'):
        print('testing ', file.name)
        env = BasicEnv(inp_file=file.path, depth=depth)
        history = agent.test(env, nb_episodes=1, visualize=False, nb_max_start_steps=0)
        env.close()

        # get rain/tide data from inp file
        rain_str = []
        tide_str = []
        with open(file.path, 'r') as tmp_file:
            lines = tmp_file.readlines()
            for i, l in enumerate(lines):
                if l.startswith("[TIMESERIES]"):  # find time series section
                    start = i + 3
        for i, l in enumerate(lines[start:]):
            if l.startswith('Atlas14'):
                rain_str.append(l)
            if l.startswith('Tide1'):
                tide_str.append(l)
Example #27
# Create Actor and Critic networks
k.clear_session()
actor = get_actor(obs_n, actions_n)
critic, action_input = get_critic(obs_n, actions_n)
print(actor.summary())
print(critic.summary())

memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=actions_n, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=actions_n[0], actor=actor, critic=critic, batch_size=64, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                  random_process=random_process, gamma=.99)

agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mse'])

#agent.load_weights('ddpg_' + ENV_NAME + 'weights.h5f')
agent.fit(env, env_name=ENV_NAME, nb_steps=500000, action_repetition=5, visualize=False, verbose=1)



env = wrappers.Monitor(env,'/home/wolfie/PycharmProjects/pythonProject/ddpg_halfcheetah',
                       video_callable=lambda episode_id: True, force=True)


agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=1000, verbose=1)

p.disconnect()
Example #28
# random process for exploration noise
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=theta, dt=0.01, mu=0., sigma=.25)
# define the DDPG agent
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=GAMMA, target_model_update=1e-3)
# compile the model
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mse'])

callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp)

# ----------------------------------------------------------------------------------------------------------------------------------------
# Training phase

# fitting the agent
#agent.fit(env, nb_steps=800000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=900)

# After training is done, we save the final weights.
#agent.save_weights('../results/Swimmer6/exp_1/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# -----------------------------------------------------------------------------------------------------------------------------------------
# Testing phase
agent.load_weights(log_filename_pre + filename_exp + '/ddpg_{}_weights.h5f'.format(ENV_NAME))
history, state_history_nominal, episode_reward_nominal, action_history = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, nb_max_episode_steps=STEPS_PER_EPISODE, \
                                                         initial_state=np.zeros((16,)), std_dev_noise=20, gamma=GAMMA, process_noise_std=process_noise_std)

# np.savetxt(log_filename_pre+filename_exp+'/s3_nominal_action.txt', action_history)
# np.savetxt(log_filename_pre+filename_exp+'/s3_nominal_state.txt', state_history_nominal)

print(state_history_nominal,action_history)
# -----------------------------------------------------------------------------------------------------------------------------------------
Example #29
def run_ddpg():

    global N_NODE_NETWORK

    env = SnakeGymContinuous()
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # initialize randomness
    np.random.seed(123)
    env.seed(123)

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=500,
                      nb_steps_warmup_actor=500,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)

    agent.compile('adam', metrics=['mae'])

    agent.fit(env,
              nb_steps=50000,
              visualize=True,
              verbose=2,
              nb_max_episode_steps=200)
    agent.save_weights('ddpg_SnakeGymContinuous_weights.h5f', overwrite=True)

    agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
Example #30
0
# Optionally, we can reload a previous model's weights and continue training from there.
# Remove the _actor or _critic suffix from the filename; the load method automatically
# appends these.
WEIGHTS_FILENAME = 'weights/ddpg_planar_crane_continuous-v0_weights.h5f'
# agent.load_weights(WEIGHTS_FILENAME)
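# A small aside (not part of the original script): keras-rl's DDPGAgent stores the actor
# and critic weights as two separate files derived from the single path you pass in,
# roughly as sketched below, which is why the suffix has to be left out of WEIGHTS_FILENAME.
import os

_base, _ext = os.path.splitext(WEIGHTS_FILENAME)
print(_base + '_actor' + _ext)   # the actor weights file keras-rl would look for
print(_base + '_critic' + _ext)  # the critic weights file keras-rl would look for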


callbacks = []
checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
#callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
callbacks += [FileLogger(log_filename, interval=100)]

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=NUM_STEPS, callbacks=callbacks, visualize=False, verbose=1)#, nb_max_episode_steps=500)

# After training is done, we save the final weights.
filename = 'weights/ddpg_{}_weights_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
agent.save_weights(filename, overwrite=True)

# We'll also save a copy under a simpler name so that a test run can be started
# immediately after training.
filename = 'weights/ddpg_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(filename, overwrite=True)
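# An optional check (not part of the original script): the JSON written by FileLogger can
# be inspected once training has finished; this assumes the default layout of per-episode
# lists keyed by metric name, e.g. 'episode_reward'.
import json

with open(log_filename) as f:
    rewards = json.load(f).get('episode_reward', [])
print('{} episodes logged'.format(len(rewards)))
if rewards:
    print('last episode reward: {:.2f}'.format(rewards[-1]))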

# Finally, evaluate our algorithm (nb_episodes defaults to 1 here).
agent.test(env, visualize=True)  # nb_max_episode_steps=500
Example #31
0
def train_with_params(sigma_v=0., sigma_o=0., test=False):

    ENV_NAME = 'PongSolo'
    conf_name = '{}_sv_{}_so_{}'.format(ENV_NAME, sigma_v, sigma_o)  # sv, so = sigma_v and sigma_orientation

    # Get the environment and extract the number of actions.
    env = EnvPongSolo(sigma_v=sigma_v, sigma_o=sigma_o)
    np.random.seed(123)

    #assert len(env.action_space.shape) == 1
    nb_actions = 1
    leaky_alpha = 0.2

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(100))
    actor.add(LeakyReLU(leaky_alpha))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])  # Keras 2 replacement for the removed merge(mode='concat') API
    x = Dense(200)(x)
    x = LeakyReLU(leaky_alpha)(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    n_steps = 5000000
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=1., mu=0., sigma=.3, sigma_min=0.01, n_steps_annealing=n_steps)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                      memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                      random_process=random_process, gamma=.99, target_model_update=1e-3)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.

    directory_weights = "weights/ddpg/{}".format(conf_name)

    if not os.path.exists(directory_weights):
        os.makedirs(directory_weights)

    if not test:
        perfCheckPoint = ModelPerformanceCheckpoint('{}/checkpoint_avg{}_steps{}'.format(directory_weights, '{}', '{}'), 800)
        agent.fit(env, nb_steps=n_steps, visualize=False, verbose=2, nb_max_episode_steps=200, callbacks=[perfCheckPoint])

        # After training is done, we save the final weights.
        agent.save_weights('{}/final.h5f'.format(directory_weights), overwrite=True)

        # Finally, evaluate our algorithm for 100 episodes.
        agent.test(env, nb_episodes=100, visualize=False, nb_max_episode_steps=200)
    else:
        agent.load_weights('{}/final.h5f'.format(directory_weights))
        agent.test(env, nb_episodes=1000, visualize=False, nb_max_episode_steps=200)
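# ModelPerformanceCheckpoint is not a keras-rl built-in, so the following is only a
# hedged sketch of what such a callback might look like: save the agent's weights once
# the average reward over a recent window clears a threshold. The class name, arguments
# and threshold are illustrative assumptions, not the original implementation.
from rl.callbacks import Callback


class PerformanceCheckpointSketch(Callback):
    def __init__(self, filepath_template, min_avg_reward, window=20):
        super().__init__()
        self.filepath_template = filepath_template  # expects two '{}' slots: avg reward, total steps
        self.min_avg_reward = min_avg_reward
        self.window = window
        self.rewards = []

    def on_episode_end(self, episode, logs={}):
        self.rewards.append(logs.get('episode_reward', 0.0))
        recent = self.rewards[-self.window:]
        avg = sum(recent) / len(recent)
        if len(recent) == self.window and avg >= self.min_avg_reward:
            # keras-rl assigns the running agent to self.model, so it can be saved here.
            self.model.save_weights(
                self.filepath_template.format(int(avg), logs.get('nb_steps', 0)),
                overwrite=True)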
Example #32
0
class DDPG:
    """Deep Deterministic Policy Gradient Class

        This is an implementation of DDPG for continuous control tasks made using the high level keras-rl library.

        Args:
            env_name (str): Name of the gym environment
            weights_dir (str): Dir for storing model weights (for both actors and critic as separate files)
            actor_layers (list(int)): Number of neurons in each successive hidden layer of the actor
            critic_layers (list(int)): Number of neurons in each successive hidden layer of the critic
            n_episodes (int): Maximum number of steps per training episode (passed to keras-rl as nb_max_episode_steps)
            visualize (bool): Whether to render the environment in a pop-up window
    """
    def __init__(self,
                 env_name='MountainCarContinuous-v0',
                 weights_dir="model_weights",
                 actor_layers=[64, 64, 32],
                 critic_layers=[128, 128, 64],
                 n_episodes=200,
                 visualize=True):
        self.env_name = env_name
        self.env = gym.make(env_name)
        np.random.seed(123)
        self.env.seed(123)
        self.actor_layers = actor_layers
        self.critic_layers = critic_layers
        self.n_episodes = n_episodes
        self.visualize = visualize
        self.n_actions = self.env.action_space.shape[0]
        self.n_states = self.env.observation_space.shape
        self.weights_file = os.path.join(
            weights_dir, 'ddpg_{}_weights.h5f'.format(self.env_name))
        self.actor = None
        self.critic = None
        self.agent = None
        self.action_input = None

    def _make_actor(self):
        """Internal helper function to create an actor custom model
        """
        self.actor = Sequential()
        self.actor.add(Flatten(input_shape=(1, ) + self.n_states))
        for size in self.actor_layers:
            self.actor.add(Dense(size, activation='relu'))
        self.actor.add(Dense(self.n_actions, activation='linear'))
        self.actor.summary()

    def _make_critic(self):
        """Internal helper function to create an actor custom model
        """
        action_input = Input(shape=(self.n_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + self.n_states,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        input_layer = Concatenate()([action_input, flattened_observation])
        hidden_layers = Dense(self.critic_layers[0],
                              activation='relu')(input_layer)
        for size in self.critic_layers[1:]:
            hidden_layers = Dense(size, activation='relu')(hidden_layers)
        output_layer = Dense(1, activation='linear')(hidden_layers)
        self.critic = Model(inputs=[action_input, observation_input],
                            outputs=output_layer)
        self.critic.summary()
        self.action_input = action_input

    def _make_agent(self):
        """Internal helper function to create an actor-critic custom agent model
        """
        if self.actor is None:
            self._make_actor()
        if self.critic is None:
            self._make_critic()
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=self.n_actions,
                                                  theta=.15,
                                                  mu=0.,
                                                  sigma=.3)
        self.agent = DDPGAgent(nb_actions=self.n_actions,
                               actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def _load_or_make_agent(self):
        """Internal helper function to load an agent model, creates a new if no model weights exists
        """
        if self.agent is None:
            self._make_agent()
        if os.path.exists(self.weights_file):
            logger.info(
                "Found existing weights for the model for this environment. Loading..."
            )
            self.agent.load_weights(self.weights_file)

    def train(self):
        """Train the DDPG agent
        """
        self._load_or_make_agent()
        self.agent.fit(self.env,
                       nb_steps=50000,
                       visualize=self.visualize,
                       verbose=1,
                       nb_max_episode_steps=self.n_episodes)
        self.agent.save_weights(self.weights_file, overwrite=True)

    def test(self, nb_episodes=5):
        """Test the DDPG agent
        """
        logger.info(
            "Testing the agents with {} episodes...".format(nb_episodes))
        self.agent.test(self.env,
                        nb_episodes=nb_episodes,
                        visualize=self.visualize,
                        nb_max_episode_steps=200)
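# A minimal usage sketch of the wrapper class above; the constructor defaults already
# target MountainCarContinuous-v0, so only the rendering flag is changed here.
if __name__ == '__main__':
    ddpg = DDPG(visualize=False)  # headless run
    ddpg.train()                  # fit for 50000 steps, then persist the weights
    ddpg.test(nb_episodes=3)      # short evaluation with the saved/loaded agent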
Example #33
0
class Agent:
    def __init__(self, env):
        self.nb_actions = env.action_space.shape[0]
        self.nb_states = env.observation_space.shape[0]
        self.env = env

        self.actor = self.build_actor(env)
        self.actor.compile('Adam', 'mse')
        self.critic, action_input = self.build_critic(env)
        self.loss = self.build_loss()
        self.processor = WhiteningNormalizerProcessor()

        self.memory = SequentialMemory(limit=5000000, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=0.75,
                                                       mu=0.5,
                                                       sigma=0.25)
        self.agent = DDPGAgent(nb_actions=self.nb_actions,
                               actor=self.actor,
                               critic=self.critic,
                               critic_action_input=action_input,
                               memory=self.memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=self.random_process,
                               gamma=.99,
                               target_model_update=1e-3,
                               processor=self.processor)
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=self.loss)
        self.sym_actor = self.build_sym_actor()
        self.sym_actor.compile(optimizer='Adam', loss='mse')

    def build_loss(self):
        return ['mse']

    def build_actor(self, env):
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
        actor.add(Dense(64, activation='tanh'))
        actor.add(GaussianNoise(0.05))
        actor.add(Dense(64, activation='tanh'))
        actor.add(GaussianNoise(0.05))
        actor.add(Dense(self.nb_actions, activation='hard_sigmoid'))
        actor.summary()

        inD = Input(shape=(1, ) + env.observation_space.shape)
        out = actor(inD)

        return Model(inD, out)

    def build_critic(self, env):
        action_input = Input(shape=(self.nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + env.observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(64, activation='relu')(flattened_observation)
        x = Concatenate()([x, action_input])
        x = Dense(32, activation='relu')(x)
        x = Dense(1)(x)

        critic = Model(inputs=[action_input, observation_input], outputs=x)
        critic.summary()

        return critic, action_input

    def build_sym_actor(self):
        stateSwap = []
        actionSwap = []
        state_desc = self.env.get_state_desc()
        for x in state_desc.keys():
            keys = list(state_desc[x].keys())
            for (k, key) in enumerate(keys):
                if '_r' in key:
                    # list.index raises ValueError when the mirrored '_l' key is missing,
                    # so test membership instead of comparing against -1.
                    mirrored = key.replace('_r', '_l')
                    if mirrored in keys:
                        i = keys.index(mirrored)
                        stateSwap += [(k, i), (i, k)]
        muscle_list = []
        for i in range(self.env.osim_model.muscleSet.getSize()):
            muscle_list.append(self.env.osim_model.muscleSet.get(i).getName())
        for (k, key) in enumerate(muscle_list):
            if '_r' in key:
                mirrored = key.replace('_r', '_l')
                if mirrored in muscle_list:
                    i = muscle_list.index(mirrored)
                    actionSwap += [(k, i), (i, k)]

        stateSwapMat = np.zeros((self.nb_states, self.nb_states))
        actionSwapMat = np.zeros((self.nb_actions, self.nb_actions))
        for (i, j) in stateSwap:
            stateSwapMat[i, j] = 1
        for (i, j) in actionSwap:
            actionSwapMat[i, j] = 1

        def ssT(shape, dtype=None):
            if shape != stateSwapMat.shape:
                raise Exception("State Swap Tensor Shape Error")
            return K.variable(stateSwapMat, dtype=dtype)

        def asT(shape, dtype=None):
            if shape != actionSwapMat.shape:
                raise Exception("Action Swap Tensor Shape Error")
            return K.variable(actionSwapMat, dtype=dtype)

        model1 = Sequential()
        model1.add(
            Dense(self.nb_states,
                  input_shape=(1, ) + self.env.observation_space.shape,
                  trainable=False,
                  kernel_initializer=ssT,
                  bias_initializer='zeros'))
        inD = Input(shape=(1, ) + self.env.observation_space.shape)
        symState = model1(inD)
        symPol = self.actor(symState)
        model2 = Sequential()
        model2.add(
            Dense(self.nb_actions,
                  input_shape=(1, self.nb_actions),
                  trainable=False,
                  kernel_initializer=asT,
                  bias_initializer='zeros'))
        out = model2(symPol)

        return Model(inD, out)

    def fit(self, **kwargs):
        if 'nb_max_episode_steps' in kwargs.keys():
            self.env.spec.timestep_limit = kwargs['nb_max_episode_steps']
        else:
            self.env.spec.timestep_limit = self.env.time_limit
        out = self.agent.fit(self.env, **kwargs)
        print("\n\ndo symetric loss back propigation\n\n")
        states = np.random.normal(
            0, 10, (kwargs['nb_steps'] // 200, 1, self.nb_states))
        actions = self.actor.predict_on_batch(states)
        self.sym_actor.train_on_batch(states, actions)
        return out

    def test(self, **kwargs):
        print("testing")
        print("VA:", self.env.get_VA())
        if 'nb_max_episode_steps' in kwargs.keys():
            self.env.spec.timestep_limit = kwargs['nb_max_episode_steps']
        else:
            self.env.spec.timestep_limit = self.env.time_limit
        return self.agent.test(self.env, **kwargs)

    def test_get_steps(self, **kwargs):
        return self.test(**kwargs).history['nb_steps'][-1]

    def save_weights(self, filename='osim-rl/ddpg_{}_weights.h5f'):
        self.agent.save_weights(filename.format("opensim"), overwrite=True)
        self.save_processor()

    def load_weights(self, filename='osim-rl/ddpg_{}_weights.h5f'):
        self.agent.load_weights(filename.format("opensim"))
        self.load_processor()

    def search_VA(self):
        # 1-D line search
        state = self.env.get_VA()
        goal = 0.0
        if abs(state - goal) < 0.01:
            self.env.upd_VA(goal)
            return
        steps = self.test_get_steps(nb_episodes=1,
                                    visualize=False,
                                    nb_max_episode_steps=1000)
        dv = 0.0
        dsteps = steps
        while (state - dv > goal and dsteps > 0.8 * steps):
            dv += 0.02
            self.env.upd_VA(state - dv)
            dsteps = self.test_get_steps(nb_episodes=1,
                                         visualize=False,
                                         nb_max_episode_steps=1000)
        if abs((state - dv) - goal) < 0.01:
            self.env.upd_VA(goal)
        else:
            dv -= 0.02
            self.env.upd_VA(state - dv)

    def save_processor(self):
        np.savez('osim-rl/processor.npz',
                 _sum=self.processor.normalizer._sum,
                 _count=np.array([self.processor.normalizer._count]),
                 _sumsq=self.processor.normalizer._sumsq,
                 mean=self.processor.normalizer.mean,
                 std=self.processor.normalizer.std)

    def load_processor(self):
        f = np.load('osim-rl/processor.npz')
        dtype = f['_sum'].dtype
        if self.processor.normalizer is None:
            self.processor.normalizer = WhiteningNormalizer(
                shape=(1, ) + self.env.observation_space.shape, dtype=dtype)
        self.processor.normalizer._sum = f['_sum']
        self.processor.normalizer._count = int(f['_count'][0])
        self.processor.normalizer._sumsq = f['_sumsq']
        self.processor.normalizer.mean = f['mean']
        self.processor.normalizer.std = f['std']
Example #34
0
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3)
agent.compile(Adam(learning_rate=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights(f'ddpg_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
#     pos = env.arraystate2pos(state)
#     print(pos)
#     optimal_action = np.zeros(2)
#     action, optimal_action, a, b, theta = agent.test(env, nb_episodes=500000, visualize=False, nb_max_episode_steps=200, modif = True, pos = pos)
#     env.non_random_reset(pos[0], pos[1], pos[2])
#     env.render = True
#     env.step(action, rand = optimal_action, a = a, b = b, theta = theta)

# state = env.reset()
# pos = env.arraystate2pos(state)
# optimal_action = np.zeros(2)
# optimal_action[0], optimal_action[1], a, b, theta = agent.test(env, nb_episodes=500000, visualize=False, nb_max_episode_steps=200, modif = True, pos = pos)

# env.non_random_reset(pos[0], pos[1], pos[2])
# env.render = True
# env.step(optimal_action, a = a, b = b, theta = theta)
nb_test = 50
for i in range(nb_test):
    env.render = False
    state = env.reset()
    pos = env.arraystate2pos(state)
    optimal_action = np.zeros(2)
    action, optimal_action, a, b, theta = agent.test(env,
                                                     nb_episodes=500000,
                                                     visualize=False,
                                                     nb_max_episode_steps=200,
                                                     modif=True,
                                                     pos=pos)
    env.non_random_reset(pos[0], pos[1], pos[2])
    env.render = True
    env.step(action, rand=optimal_action, a=a, b=b, theta=theta)