def visualize(session_name):
    """Render one test episode of a trained DDPG agent on singlePendulum-v0.

    Loads 'ddpg_singlePendulum-v0_<session_name>_weights.h5f' and runs a
    single 400-step episode with the environment's own viewer enabled.

    Parameters
    ----------
    session_name : str
        Training-session identifier used to locate the weight file.
    """
    kwargs = {'viewer': True}  # ask the env to open its viewer window
    ENV_NAME = 'singlePendulum-v0'
    env = gym.make(ENV_NAME, **kwargs)
    np.random.seed(7)
    env.seed(7)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]
    actor, critic, action_input = create_networks(env)
    # A memory is required by DDPGAgent's constructor even though testing
    # never samples from it.
    memory = SequentialMemory(limit=400, window_length=1)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory)
    agent.compile(Adam(lr=.0005, clipnorm=1., epsilon=1.e-7,
                       beta_1=0.9, beta_2=0.999), metrics=['mae'])
    # Fix: dropped the unused 'checkpoint_filepath' local (it was computed
    # but never read).
    filepath = 'ddpg_{}_{}_weights.h5f'.format(ENV_NAME, session_name)
    agent.load_weights(filepath=filepath)
    env.viewer = True
    # visualize=False: rendering is driven by the env's own viewer flag,
    # not by keras-rl's render loop.
    agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=400)
    env.close()
def main(args):
    """Train or evaluate a PyTorch DDPG agent on SemisuperPendulumRandom-v0.

    Records episodes with a gym Monitor into a timestamped directory under
    ./saver/DDPG/<env>/<timestamp>. When args.is_train is set the agent is
    trained and its actor weights saved; otherwise weights are loaded and
    the agent is evaluated.
    """
    cuda_available = torch.cuda.is_available()
    out_dir = './saver'
    env_id = 'SemisuperPendulumRandom-v0'
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    summary_dir = os.path.join(out_dir, "DDPG", env_id, stamp)

    env = gym.make(env_id)
    env = wrappers.Monitor(env, summary_dir, force=True)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    a_bound = env.action_space.high

    # Online and target copies of both networks.
    actor = ActorNetwork(n_states, n_actions, a_bound,
                         args.actor_lr, args.tau, args.seed)
    target_actor = ActorNetwork(n_states, n_actions, a_bound,
                                args.actor_lr, args.tau, args.seed)
    critic = CriticNetwork(n_states, n_actions, a_bound, args.critic_lr,
                           args.tau, args.l2_decay, args.seed)
    target_critic = CriticNetwork(n_states, n_actions, a_bound, args.critic_lr,
                                  args.tau, args.l2_decay, args.seed)

    if cuda_available:
        actor = actor.cuda()
        target_actor = target_actor.cuda()
        critic = critic.cuda()
        target_critic = target_critic.cuda()

    replay_buffer = ReplayBuffer(args.bufferlength, args.seed)
    agent = DDPGAgent(actor, target_actor, critic, target_critic,
                      replay_buffer, batch_size=args.batch_size,
                      gamma=args.gamma, seed=args.seed,
                      episode_len=args.episode_len,
                      episode_steps=args.episode_steps,
                      noise_mean=args.noise_mean, noise_th=args.noise_th,
                      noise_std=args.noise_std, noise_decay=args.noise_decay)

    if args.is_train:
        agent.train(env)
        agent.save_actor_weights(save_dir=out_dir,
                                 filename=args.actor_weights)
    else:
        agent.load_actor_weights(save_dir=out_dir,
                                 filename=args.actor_weights)
        agent.test(env)
def test_ddpg():
    """Smoke-test DDPGAgent end-to-end on Pendulum-v0.

    Runs a short fit/test cycle; does not check that a solution is found.
    """
    # TODO: replace this with a simpler environment where we can actually
    # test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    # Tiny actor: flatten -> 16 relu -> linear action head.
    actor = Sequential([
        Flatten(input_shape=(1,) + env.observation_space.shape),
        Dense(16),
        Activation('relu'),
        Dense(nb_actions),
        Activation('linear'),
    ])

    # Critic scores the (action, observation) pair.
    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape,
                              name='observation_input')
    obs_flat = Flatten()(observation_input)
    net = Concatenate()([action_input, obs_flat])
    net = Activation('relu')(Dense(16)(net))
    net = Activation('linear')(Dense(1)(net))
    critic = Model(inputs=[action_input, observation_input], outputs=net)

    memory = SequentialMemory(limit=1000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory,
                      nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=random_process, gamma=.99,
                      target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0,
              nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False,
                   nb_max_episode_steps=100)
class DDPG():
    """keras-rl DDPG agent wired to a portfolio environment.

    Builds small actor/critic networks for the given env and exposes thin
    fit / test / save_weights / load_weights wrappers around the agent.
    """

    def __init__(self, Env):
        self.env = Env
        n_actions = self.env.action_space.shape[0]

        # Actor: 5-8-5 relu stack with a softmax head over the actions.
        actor = Sequential()
        actor.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        for width in (5, 8, 5):
            actor.add(Dense(width))
            actor.add(Activation('relu'))
        actor.add(Dense(n_actions))
        actor.add(Activation('softmax'))

        # Critic: score the (action, observation) pair with a 5-8-5 stack.
        action_input = Input(shape=(n_actions,), name='action_input')
        observation_input = Input(shape=(1,) + Env.observation_space.shape,
                                  name='observation_input')
        net = concatenate([action_input, Flatten()(observation_input)],
                          name='concatenate')
        for width in (5, 8, 5):
            net = Dense(width)(net)
            net = Activation('relu')(net)
        net = Dense(1)(net)
        net = Activation('linear')(net)
        critic = Model(inputs=[action_input, observation_input], outputs=net)

        memory = SequentialMemory(limit=100000, window_length=1)
        # No exploration noise process is used.
        random_process = None
        self.agent = DDPGAgent(nb_actions=n_actions, actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory, nb_steps_warmup_critic=32,
                               nb_steps_warmup_actor=32,
                               random_process=random_process, gamma=0,
                               target_model_update=0.001)
        self.agent.processor = ShowActionProcessor(self.agent, self.env)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def fit(self):
        """Train for 20000 steps (10-step episodes); return the history."""
        return self.agent.fit(self.env, action_repetition=1, nb_steps=20000,
                              visualize=False, verbose=1,
                              nb_max_episode_steps=10)

    def save_weights(self):
        """Persist actor+critic weights under ./store."""
        self.agent.save_weights(
            './store/ddpg_{}_weights2.h5f'.format("porfolio"), overwrite=True)

    def test(self):
        """Run one 10-step evaluation episode; return the history."""
        return self.agent.test(self.env, nb_episodes=1, visualize=False,
                               nb_max_episode_steps=10)

    def load_weights(self):
        """Restore weights previously written by save_weights()."""
        self.agent.load_weights(
            './store/ddpg_{}_weights2.h5f'.format("porfolio"))
def _train(self):
    """Train DDPG on the Crazyflie, always saving weights on exit.

    Resumes from 'ddpg_crazyflie_weights.h5f' when the file exists; the
    finally-clause guarantees weights are written even if training or
    testing is interrupted.
    """
    env = CrazyflieEnvironment(self._cf)
    # Make sure the drone is shut down however the process exits.
    atexit.register(teardown_env, env, self._cf)

    np.random.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # Next, we build a very simple model.
    actor = self.actor_model(env, nb_actions)
    action_input, critic = self.critic_model(env, nb_actions)

    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15,
                                              mu=0., sigma=.3)
    weights_file = 'ddpg_{}_weights.h5f'.format('crazyflie')
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory,
                      nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                      random_process=random_process, gamma=.99,
                      target_model_update=1e-3)
    if os.path.exists(weights_file):
        agent.load_weights(weights_file)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
    try:
        agent.fit(env, nb_steps=50000, verbose=2)
        agent.test(env, nb_episodes=1)
    finally:
        agent.save_weights(weights_file, overwrite=True)
def test_ddpg():
    """Minimal end-to-end run of DDPGAgent on Pendulum-v0 (smoke test)."""
    # TODO: replace this with a simpler environment where we can actually
    # test if it finds a solution
    env = gym.make('Pendulum-v0')
    np.random.seed(123)
    env.seed(123)
    random.seed(123)
    nb_actions = env.action_space.shape[0]

    # Actor: one 16-unit hidden relu layer, linear action output.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))

    # Critic: concat(action, flattened obs) -> 16 relu -> scalar Q value.
    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    joined = Concatenate()([action_input, Flatten()(observation_input)])
    hidden = Activation('relu')(Dense(16)(joined))
    q_value = Activation('linear')(Dense(1)(hidden))
    critic = Model(inputs=[action_input, observation_input], outputs=q_value)

    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input,
                      memory=SequentialMemory(limit=1000, window_length=1),
                      nb_steps_warmup_critic=50, nb_steps_warmup_actor=50,
                      random_process=OrnsteinUhlenbeckProcess(theta=.15,
                                                              mu=0.,
                                                              sigma=.3),
                      gamma=.99, target_model_update=1e-3)
    agent.compile([Adam(lr=1e-3), Adam(lr=1e-3)])

    agent.fit(env, nb_steps=400, visualize=False, verbose=0,
              nb_max_episode_steps=100)
    h = agent.test(env, nb_episodes=2, visualize=False,
                   nb_max_episode_steps=100)
# NOTE(review): the two chunks below are fragments of larger training
# scripts; their enclosing functions / earlier setup are not in this file,
# and each chunk is collapsed onto one physical line (a mid-line '#'
# comments out everything after it). Kept byte-identical.
# First fragment: finishes a critic model, wires a DDPGAgent, fits 25000
# steps and saves 'ddpg_stokes_weights.h5f'.
x = Dense(32)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) plot_model(critic, to_file='critic.png', show_shapes=True) # # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # # even the metrics! memory = SequentialMemory(limit=10000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=0.15, mu=0., sigma=.3) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) # # Okay, now it's time to learn something! We visualize the training here for show, but this # # slows down training quite a lot. You can always safely abort the training prematurely using # # Ctrl + C. agent.fit(env, nb_steps=25000, visualize=False, verbose=1, nb_max_episode_steps=200) # # After training is done, we save the final weights. agent.save_weights('ddpg_stokes_weights.h5f', overwrite=True) # # Finally, evaluate our algorithm for 5 episodes. agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
# NOTE(review): second fragment continues a DDPGAgent(...) call whose
# opening is not visible here, then compiles, fits 1.5M steps and tests.
critic=critic, critic_action_input=action_input, memory=memory, random_process=random_process, nb_steps_warmup_actor=2048, nb_steps_warmup_critic=1024, target_model_update=1000, gamma=0.95, batch_size=128, memory_interval=1) agent.compile((Adam(lr=1e-6), Adam(lr=1e-4)), metrics=['mae']) # Start training for 7.5M simulation steps (1.5M training steps with actions repeated 5 times) agent.fit(env, nb_steps=1500000, visualize=True, action_repetition=1, verbose=1, nb_max_start_steps=0, nb_max_episode_steps=10000, log_interval=10000, callbacks=[]) # Test the agent hist = agent.test(env, nb_episodes=10, action_repetition=1, nb_max_episode_steps=5000, visualize=True)
# NOTE(review): collapsed osim-rl DDPG training script (keras-rl). Reads
# all hyper-parameters from `data`, builds a sigmoid-output actor and a
# linear-output critic, then trains (optionally resuming) or tests
# depending on args.train. Kept byte-identical: statements are split
# mid-expression across the next three physical lines, and mid-line '#'
# markers comment out trailing code, so this chunk is not runnable as-is.
# NOTE(review): INTEGRATOR_ACCURACY is read from `data` but the
# non-prosthetic branch hard-codes integrator_accuracy=0.005 —
# confirm this is intentional.
def main_function(args, data): #### INITIALISATION DES CONSTANTES ##### ## Model ## SIZE_HIDDEN_LAYER_ACTOR = data['SIZE_HIDDEN_LAYER_ACTOR'][0] LR_ACTOR = data['LR_ACTOR'][0] SIZE_HIDDEN_LAYER_CRITIC = data['SIZE_HIDDEN_LAYER_CRITIC'][0] LR_CRITIC = data['LR_CRITIC'][0] DISC_FACT = data['DISC_FACT'][0] TARGET_MODEL_UPDATE = data['TARGET_MODEL_UPDATE'][0] BATCH_SIZE = data['BATCH_SIZE'][0] REPLAY_BUFFER_SIZE = data['REPLAY_BUFFER_SIZE'][0] ## Exploration ## THETA = data['THETA'][0] SIGMA = data['SIGMA'][0] SIGMA_MIN = data['SIGMA_MIN'][0] N_STEPS_ANNEALING = data['N_STEPS_ANNEALING'][0] ## Acceleration ## ACTION_REPETITION = data['ACTION_REPETITION'][0] INTEGRATOR_ACCURACY = data['INTEGRATOR_ACCURACY'][0] # # Simulation ## N_STEPS_TRAIN = int(args.step) N_EPISODE_TEST = 100 if args.visualize: N_EPISODE_TEST = 3 VERBOSE = 1 # 0: pas de descriptif # 1: descriptif toutes les LOG_INTERVAL steps # 2: descriptif à chaque épisode LOG_INTERVAL = 500 # Save weights ## if not os.path.exists('weights'): os.mkdir('weights') print("Directory ", 'weights', " Created ") FILES_WEIGHTS_NETWORKS = './weights/' + args.model + '.h5f' # #### CHARGEMENT DE L'ENVIRONNEMENT ##### if args.prosthetic: env = ProsContinueRewardWrapper( ProstheticsEnv(visualize=args.visualize, integrator_accuracy=INTEGRATOR_ACCURACY)) if not args.prosthetic: env = CustomDoneOsimWrapper( CustomRewardWrapper( RelativeMassCenterObservationWrapper( NoObstacleObservationWrapper( L2RunEnv(visualize=args.visualize, integrator_accuracy=0.005))))) env.reset() # Examine the action space ## action_size = env.action_space.shape[0] #action_size = int(env.action_space.shape[0]/2) pour la symmétrie print('Size of each action:', action_size) # Examine the state space ## state_size = env.observation_space.shape[0] print('Size of state:', state_size) # #### ACTOR / CRITIC ##### # Actor (mu) ## if args.prosthetic: input_shape = (1, env.observation_space.shape[0]) if not args.prosthetic: input_shape = (1, 
env.observation_space.shape[0]) observation_input = Input(shape=input_shape, name='observation_input') x = Flatten()(observation_input) x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x) x = Activation('relu')(x) x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x) x = Activation('relu')(x) x = Dense(SIZE_HIDDEN_LAYER_ACTOR)(x) x = Activation('relu')(x) x = Dense(action_size)(x) x = Activation('sigmoid')(x) actor = Model(inputs=observation_input, outputs=x) opti_actor = Adam(lr=LR_ACTOR) # Critic (Q) ## action_input = Input(shape=(action_size, ), name='action_input') x = Flatten()(observation_input) x = concatenate([action_input, x]) x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x) x = Activation('relu')(x) x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x) x = Activation('relu')(x) x = Dense(SIZE_HIDDEN_LAYER_CRITIC)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) opti_critic = Adam(lr=LR_CRITIC) # #### SET UP THE AGENT ##### # Initialize Replay Buffer ## memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1) # Random process (exploration) ## random_process = OrnsteinUhlenbeckProcess( theta=THETA, mu=0, sigma=SIGMA, sigma_min=SIGMA_MIN, size=action_size, n_steps_annealing=N_STEPS_ANNEALING) # random_process_l = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN, # size=action_size, n_steps_annealing=N_STEPS_ANNEALING) # random_process_r = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA,sigma_min= SIGMA_MIN, # size=action_size, n_steps_annealing=N_STEPS_ANNEALING) # Paramètres agent DDPG ## # agent = SymmetricDDPGAgent(nb_actions=action_size, actor=actor, critic=critic, # critic_action_input=action_input, # memory=memory, random_process_l=random_process_l, random_process_r=random_process_r, # gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE, # batch_size=BATCH_SIZE) agent = DDPGAgent(nb_actions=action_size, actor=actor, critic=critic, critic_action_input=action_input, 
memory=memory, random_process=random_process, gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE, batch_size=BATCH_SIZE) agent.compile(optimizer=[opti_critic, opti_actor]) # #### TRAIN ##### logdir = "keras_logs/" + datetime.now().strftime("%Y-%m-%d_%H.%M.%S") robustensorboard = RobustTensorBoard(log_dir=logdir, hyperparams=data) saveBest = SaveBestEpisode() if args.train: if args.resume: agent.load_weights(FILES_WEIGHTS_NETWORKS) else: check_overwrite(args.model) agent.fit(env, nb_steps=N_STEPS_TRAIN, visualize=args.visualize, verbose=VERBOSE, log_interval=LOG_INTERVAL, callbacks=[robustensorboard, saveBest], action_repetition=ACTION_REPETITION) agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True) #### TEST ##### if not args.train: agent.load_weights(FILES_WEIGHTS_NETWORKS) agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize)
# NOTE(review): the chunks below are unrelated fragments of several DDPG
# scripts (fit/save/test tails, agent wiring, and a Python-2-style
# jump-task fragment using `print` statements). Their enclosing
# definitions are not visible in this file, and stray triple-quote
# markers mean string state may leak across physical lines, so they are
# kept byte-identical with no interleaved commentary. French comments in
# the fused comment/code text are left untranslated because a mid-line
# '#' makes comment and code boundaries ambiguous here.
''' # agent.load_weights('fit-weights.h5f') ''' fit ''' history = agent.learning(env, policy, policy_list, nb_steps=1e7, visualize=False, log_interval=1000, verbose=1, nb_max_episode_steps=4000, imitation_leaning_time=1e4, reinforcement_learning_time=9e4) # plt.plot(history.history['metrics']) # plt.plot(history.history['reward']) # plt.show() sio.savemat(ENV_NAME + '-' + nowtime + '/fit.mat', history.history) # After training is done, we save the final weights. agent.save_weights(ENV_NAME + '-' + nowtime + '/fit-weights.h5f', overwrite=True) # Finally, evaluate our algorithm for 5 episodes. history = agent.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=5000) sio.savemat(ENV_NAME + '-' + nowtime + '/test-final.mat', history.history)
## Initialize Replay Buffer ## memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1) # window_length : usefull for Atari game (cb d'images d'affilé on veut analysé (vitesse de la balle, etc..)) ## Random process (exploration) ## random_process = OrnsteinUhlenbeckProcess(theta=THETA, mu=MEAN, sigma=SIGMA, size=action_size) ## Paramètres agent DDPG ## agent = DDPGAgent(nb_actions=action_size, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, random_process=random_process, gamma=DISC_FACT, target_model_update=TARGET_MODEL_UPDATE, batch_size= BATCH_SIZE) agent.compile(optimizer = [opti_critic, opti_actor], metrics= ['mae']) ##### TRAIN ##### if args.train: check_overwrite(args.model) history = agent.fit(env, nb_steps=N_STEPS_TRAIN, visualize=args.visualize, verbose=VERBOSE, log_interval = LOG_INTERVAL) agent.save_weights(FILES_WEIGHTS_NETWORKS, overwrite=True) save_plot_reward(history, args.model, params) ##### TEST ##### if not args.train : agent.load_weights(FILES_WEIGHTS_NETWORKS) history = agent.test(env, nb_episodes=N_EPISODE_TEST, visualize=args.visualize) save_result(history, args.model, params)
mu=0., sigma=.3) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. agent.fit(env, nb_steps=50000, visualize=False, verbose=1, log_interval=50, nb_max_episode_steps=None) # After training is done, we save the final weights. agent.save_weights('ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) #agent.load_weights('ddpg_Reacher-v2_weights_128.h5f') # Finally, evaluate our algorithm for 5 episodes. agent.test(env, nb_episodes=30, visualize=True, nb_max_episode_steps=None)
random_process=random_process, gamma=GAMMA, target_model_update=1e-4) # compile the model agent.compile(Adam(lr=1e-3, clipnorm=1.), metrics=['mse']) callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp) # ---------------------------------------------------------------------------------------------------------------------------------------- # Training phase # fitting the agent # agent.fit(env, nb_steps=3000000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=STEPS_PER_EPISODE,process_noise_std=process_noise_std) # After training is done, we save the final weights. # agent.save_weights(log_filename_pre+filename_exp+'/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # common_func.save_process_noise(ENV_NAME, log_filename_pre, filename_exp, process_noise_std, theta) #--------------------------------------------------------------------------------------------------------------------------------------- # Testing phase agent.load_weights(log_filename_pre + filename_exp + '/ddpg_{}_weights.h5f'.format(ENV_NAME)) # # Finally, evaluate our algorithm. history, state_history_nominal, episode_reward_nominal, action_history = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, \ nb_max_episode_steps=STEPS_PER_EPISODE, initial_state=np.array([0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]), \ std_dev_noise=0, gamma=GAMMA) # print(episode_reward_nominal, state_history_nominal) # -----------------------------------------------------------------------------------------------------------------------------------------
# slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. if args.train: agent.load_weights('aviral_jump_new.h5f') print 'weights loaded' agent.fit(env, nb_steps=nallsteps, visualize=True, verbose=1, nb_max_episode_steps=1000, log_interval=1000) print 'TRAINED THE MODELS' # After training is done, we save the final weights. agent.save_weights(args.model, overwrite=True) if not args.train: print args.model agent.load_weights(args.model) # sys.exit(0) # Finally, evaluate our algorithm for 1 episode. h = Histories() agent.test(env, nb_episodes=10, visualize=False, nb_max_episode_steps=1000, action_repetition=2, callbacks=[h]) # print h.action_list f = open('values_jump_new.txt', 'w') # f.write(str(h.action_list) pickle.dump(h.action_dict_list, f) f.close() print("done pickling") # for i in range(600): # ac = agent.forward(obs) # f.write(str(ac)) # f.write('\n\n\n') # obs, rew, _, _ = env.step(ac) # f.close()
x = Activation('relu')(x) # Output Layer x = Dense(1)(x) x = Activation('linear')(x) critic = Model(input=[action_input, observation_input], output=x) print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=2*NUM_STEPS, window_length=1) # random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, dt = env.tau, theta=0.6, mu=0.0, sigma=0.5, sigma_min=0.15, n_steps_annealing=NUM_STEPS) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.999, target_model_update=1e-3, delta_clip=1.0) agent.compile(Adam(lr=.001, clipnorm=1.0), metrics=['mae']) # Load the model weights - this method will automatically load the weights for # both the actor and critic agent.load_weights(FILENAME) # Finally, evaluate our algorithm for 5 episodes. agent.test(env, nb_episodes=5, visualize=True,action_repetition=5) #nb_max_episode_steps=500,
x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Set up the agent for training memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1.) # agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model, # memory=memory, nb_steps_warmup=1000, random_process=random_process, # gamma=.99, target_model_update=0.1) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. if args.train: agent.fit(env, nb_steps=nallsteps, visualize=False, verbose=1, nb_max_episode_steps=env.timestep_limit, log_interval=10000) # After training is done, we save the final weights. agent.save_weights(args.model, overwrite=True) if not args.train: agent.load_weights(args.model) # Finally, evaluate our algorithm for 1 episode. agent.test(env, nb_episodes=1, visualize=False, nb_max_episode_steps=500)
# NOTE(review): collapsed noisy-reward Pendulum experiment. Builds one
# actor/critic pair, then trains a DDPG agent whose reward processor is
# selected by the module-level REWARD constant ("normal" | "noisy" |
# "surrogate"), saving weights and a per-variant CSV of the reward
# history under LOG_DIR; any other REWARD value raises
# NotImplementedError. Kept byte-identical: a '# slows down training'
# comment is split across the next physical lines and mid-line '#'
# markers swallow trailing code, so this chunk is not runnable as-is.
def train(): # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) np.random.seed(123) env.seed(123) assert len(env.action_space.shape) == 1 nb_actions = env.action_space.shape[0] config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) from keras import backend as K K.set_session(sess) # Next, we build a very simple model. actor = Sequential() actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) actor.add(Dense(16)) actor.add(Activation('relu')) actor.add(Dense(16)) actor.add(Activation('relu')) actor.add(Dense(16)) actor.add(Activation('relu')) actor.add(Dense(nb_actions)) actor.add(Activation('linear')) # print(actor.summary()) action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=(1, ) + env.observation_space.shape, name='observation_input') flattened_observation = Flatten()(observation_input) x = Concatenate()([action_input, flattened_observation]) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) # print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) if REWARD == "normal": ddpg_normal = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) ddpg_normal.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. 
You can always safely abort the training prematurely using # Ctrl + C. history_normal = ddpg_normal.fit(env, nb_steps=150000, visualize=False, verbose=2, nb_max_episode_steps=200) # After training is done, we save the final weights. ddpg_normal.save_weights(os.path.join( LOG_DIR, 'ddpg_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. ddpg_normal.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200) pandas.DataFrame(history_normal.history).to_csv( os.path.join(LOG_DIR, "normal.csv")) elif REWARD == "noisy": processor_noisy = PendulumSurrogateProcessor(weight=WEIGHT, surrogate=False, noise_type=NOISE_TYPE) ddpg_noisy = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, processor=processor_noisy) ddpg_noisy.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae']) history_noisy = ddpg_noisy.fit(env, nb_steps=150000, visualize=False, verbose=2, nb_max_episode_steps=200) ddpg_noisy.save_weights(os.path.join( LOG_DIR, 'ddpg_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) ddpg_noisy.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200) pandas.DataFrame(history_noisy.history).to_csv( os.path.join(LOG_DIR, "noisy.csv")) elif REWARD == "surrogate": processor_surrogate = PendulumSurrogateProcessor(weight=WEIGHT, surrogate=True, noise_type=NOISE_TYPE) ddpg_surrogate = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, processor=processor_surrogate) ddpg_surrogate.compile(Adam(lr=.0005, clipnorm=1.), metrics=['mae']) history_surrogate = ddpg_surrogate.fit(env, nb_steps=150000, visualize=False, verbose=2, 
nb_max_episode_steps=200) ddpg_surrogate.save_weights(os.path.join( LOG_DIR, 'ddpg_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) ddpg_surrogate.test(env, nb_episodes=5, visualize=False, verbose=2, nb_max_episode_steps=200) pandas.DataFrame(history_surrogate.history).to_csv( os.path.join(LOG_DIR, "surrogate.csv")) else: raise NotImplementedError
def evaluate_model(model_path=None, interactive=False, seed=12345):
    """Evaluate a DDPG camera-control agent on the train and test splits.

    Builds the agent with zero exploration noise, optionally loads weights
    from ``model_path``, then runs 500 greedy episodes against the training
    split and 500 against the testing split, printing mean reward and mean
    absolute position/zoom errors before vs. after control.

    Parameters
    ----------
    model_path : str or None
        Weight file to load; when None the freshly built networks are used.
    interactive : bool
        Forwarded to CameraControlEnvCont.
    seed : int
        Seed for numpy and for both environments.
    """
    np.random.seed(seed)
    actor, critic, action_input = define_actor_critic_models(actions=3)
    memory = SequentialMemory(limit=10000, window_length=1)
    # sigma == 0: the policy is evaluated deterministically.
    random_process = GaussianWhiteNoiseProcess(mu=0, sigma=0, sigma_min=0,
                                               n_steps_annealing=1)
    agent = DDPGAgent(nb_actions=3, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory,
                      nb_steps_warmup_critic=500, nb_steps_warmup_actor=100,
                      random_process=random_process, gamma=.95,
                      target_model_update=0.0001, batch_size=32)
    agent.compile([RMSprop(lr=.0001), RMSprop(lr=.01)], metrics=['mae'])
    if model_path is not None:
        agent.load_weights(model_path)

    # Fix: the two evaluation passes were duplicated inline; factored into
    # one helper parameterised by the dataset split.
    _evaluate_split(agent, testing=False, interactive=interactive,
                    seed=seed, label="Training")
    _evaluate_split(agent, testing=True, interactive=interactive,
                    seed=seed, label="Testing")


def _evaluate_split(agent, testing, interactive, seed, label):
    """Run 500 evaluation episodes on one dataset split and print a report."""
    env = CameraControlEnvCont(dataset_pickle_path='data/dataset.pickle',
                               testing=testing, interactive=interactive)
    env.seed(seed)
    res = agent.test(env, nb_episodes=500, nb_max_episode_steps=100,
                     verbose=0, visualize=False)
    mean_reward = np.mean(res.history['episode_reward'])
    # Error statistics accumulated by the env during the episodes.
    before_position = np.mean(np.abs(env.init_position_error_pixels))
    before_zoom = np.mean(np.abs(env.init_zoom_error_pixels))
    after_position = np.mean(np.abs(env.final_position_error_pixels))
    after_zoom = np.mean(np.abs(env.final_zoom_error_pixels))
    print(label + " evaluation: ")
    print("Mean reward: ", mean_reward)
    print("Position: ", before_position, " -> ", after_position)
    print("Zoom: ", before_zoom, " -> ", after_zoom)
# NOTE(review): the chunks below are unrelated fragments of several DDPG
# scripts (a drone evaluation tail, two OpenSim train/test tails, an
# agent-wiring tail, an inverted-pendulum test harness that opens an
# unterminated ''' block, and a train/test logging loop). Their enclosing
# definitions are not visible in this file; each chunk is collapsed onto
# one physical line with mid-line '#' markers swallowing trailing code.
# Kept byte-identical with no interleaved commentary.
env.seed(123) assert len(env.action_space.shape) == 1 nb_actions = env.action_space.shape[0] n = DroneNetwork(nb_actions=nb_actions, observation_shape=env.observation_space.shape) # Next, we build a very simple model. actor = n.create_actor() critic = n.create_critic() action_input = n.get_action_input() actor.summary() critic.summary() print(action_input) memory = SequentialMemory(limit=100000, window_length=1) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) agent.load_weights('ddpg_{}_weights.h5f'.format('drone')) agent.test(env, nb_episodes=100000, visualize=True) #agent.test(env, nb_episodes=20, visualize=True, nb_max_episode_steps=50) env.close()
x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Set up the agent for training memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1.) # agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model, # memory=memory, nb_steps_warmup=1000, random_process=random_process, # gamma=.99, target_model_update=0.1) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. if args.train: agent.fit(env, nb_steps=nallsteps, visualize=False, verbose=1, nb_max_episode_steps=200, log_interval=10000) # After training is done, we save the final weights. agent.save_weights(args.model, overwrite=True) if not args.train: agent.load_weights(args.model) # Finally, evaluate our algorithm for 1 episode. agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=1000)
sigma=0.8, sigma_min=0.05, n_steps_annealing=650000) # Create the agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, random_process=random_process, nb_steps_warmup_actor=32, nb_steps_warmup_critic=32, target_model_update=1e-4, gamma=0.9, batch_size=32) agent.compile(Adam(lr=1e-4), metrics=['mae']) # Start training for 7.5M simulation steps (1.5M training steps with actions repeated 5 times) agent.fit(env, nb_steps=1500000, visualize=False, action_repetition=5, verbose=2, nb_max_start_steps=0, log_interval=10000, callbacks=[]) # Test the agent hist = agent.test(env, nb_episodes=10, action_repetition=1, visualize=True)
callbacks = build_callbacks(ENV_NAME) test_callbacks = build_test_callbacks(ENV_NAME) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. #agent.fit(env, nb_steps=500000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=30) # After training is done, we save the final weights. #agent.save_weights('results/InvertedPendulum/exp_6/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) agent.load_weights( 'results/InvertedPendulum/exp_6/ddpg_{}_weights.h5f'.format(ENV_NAME)) # Finally, evaluate our algorithm for 5 episodes. history, state_history_nominal, episode_reward_nominal = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, callbacks=test_callbacks, nb_max_episode_steps=30, \ initial_state=[0, np.pi, 0, 0], std_dev_noise=0, gamma=GAMMA) u_max = 12 print(episode_reward_nominal, state_history_nominal) ''' f = open("results/InvertedPendulum/exp_3/data.txt", "a") for i in frange(0.0, 1.0, 0.05): episode_reward_n = 0 Var_n = 0 terminal_mse = 0 Var_terminal_mse = 0 for j in range(n_samples): history, state_history, episode_reward = agent.test(env, nb_episodes=1, visualize=False, action_repetition=1, nb_max_episode_steps=30, initial_state=[0, np.pi, 0, 0], std_dev_noise=i*u_max, gamma=GAMMA) episode_reward_n += episode_reward Var_n += (episode_reward)**2
logger.info('Iteration #{}'.format(n)) #train train_history = agent.fit(env, nb_steps=nb_stepis, visualize=False, verbose=1, nb_max_episode_steps=nb_stepis) # After training is done, we save the final weights. agent.save_weights('ddpg_{}_nomad_v3_weights.h5f'.format(ENV_NAME), overwrite=True) # Save memory pickle.dump(memory, open("memory2.pkl", "wb")) # Finally, evaluate our algorithm for nb_episodes episodes. test_history = agent.test(env, nb_episodes=nb_episodes, visualize=False, nb_max_episode_steps=nb_stepis) #loading weights and model and logging taken from: #https://github.com/olavt/gym_co2_ventilation/blob/master/examples/test_keras_rl_continious.py train_rewards = train_history.history['episode_reward'] test_rewards = test_history.history['episode_reward'] for i in range(0, nb_episodes): episode_logger.info('{},{},{}'.format(((n - 1) * nb_episodes + i + 1), train_rewards[i], test_rewards[i]))
class KerasDDPGAgent(object):
    """keras-rl DDPG agent wrapper with fixed 3x16 actor / 3x32 critic MLPs.

    The underlying DDPGAgent is created by `configure()`; `train`, `test`,
    `load_weights` and `save_weights` delegate to it.
    """

    def __init__(self, opts):
        # Continuous-control agent; `opts` is stored but not read here.
        self.metadata = {'discrete_actions': False}
        self.opts = opts

    def configure(self, observation_space_shape, nb_actions):
        """Build actor/critic networks and compile the DDPG agent.

        Args:
            observation_space_shape (tuple): shape of a single observation.
            nb_actions (int): dimensionality of the continuous action space.
        """
        # Next, we build a simple model.
        # actor network — window_length=1, hence the (1,) + obs-shape input.
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + observation_space_shape))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(16))
        actor.add(Activation('relu'))
        actor.add(Dense(nb_actions))
        actor.add(Activation('linear'))
        print(actor.summary())

        # critic network — Q(s, a), taking both action and observation inputs.
        action_input = Input(shape=(nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + observation_space_shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = concatenate([action_input, flattened_observation])
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(32)(x)
        x = Activation('relu')(x)
        x = Dense(1)(x)
        x = Activation('linear')(x)
        # Use the plural `inputs`/`outputs` keywords — the singular forms are
        # deprecated and removed in modern Keras.
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        print(critic.summary())

        # Finally, we configure and compile our agent. You can use every
        # built-in Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                  theta=.15, mu=0., sigma=.3)
        self.agent = DDPGAgent(nb_actions=nb_actions,
                               actor=actor,
                               critic=critic,
                               critic_action_input=action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def train(self, env, nb_steps, visualize, verbosity):
        """Fit the agent; episodes are capped at 200 steps."""
        # Visualizing slows training down a lot; Ctrl+C aborts safely.
        self.agent.fit(env, nb_steps=nb_steps, visualize=visualize,
                       verbose=verbosity, nb_max_episode_steps=200)

    def test(self, env, nb_episodes, visualize):
        """Evaluate the trained agent for `nb_episodes` episodes."""
        self.agent.test(env, nb_episodes=nb_episodes, visualize=visualize,
                        nb_max_episode_steps=200)

    def load_weights(self, load_file):
        """Load actor/critic weights saved by `save_weights`."""
        self.agent.load_weights(load_file)

    def save_weights(self, save_file, overwrite):
        # Bug fix: honour the caller's `overwrite` flag — the original always
        # passed overwrite=True regardless of the argument.
        self.agent.save_weights(save_file, overwrite=overwrite)
# NOTE(review): script fragment — `agent`, `env`, `ENV_NAME`, `LAYER_SIZE`,
# `NUM_HIDDEN_LAYERS`, `NUM_STEPS` and `TRIAL_ID` are defined earlier in the
# original script.
callbacks = []
# `{{step}}` stays a literal `{step}` placeholder for ModelIntervalCheckpoint.
checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
#callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)]
callbacks += [FileLogger(log_filename, interval=100)]

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=NUM_STEPS, callbacks=callbacks, visualize=False, verbose=1)  #, nb_max_episode_steps=500)

# After training is done, we save the final weights.
filename = 'weights/ddpg_{}_weights_{}_{}_{}_{}.h5f'.format(
    ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
agent.save_weights(filename, overwrite=True)

# We'll also save a simply named version to make running test immediately
# following training easier.
filename = 'weights/ddpg_{}_weights.h5f'.format(ENV_NAME)
agent.save_weights(filename, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, visualize=True)  #nb_max_episode_steps=500,
# NOTE(review): script fragment — starts mid `if/else`; `agent`, `env`,
# `train_steps`, `depth`, `file_num` and `BasicEnv` come from earlier in the
# original script. Original line breaks were lost; the nesting below is
# reconstructed and should be verified against the original file.
    else:
        # Resume from previously saved weights, train on this input file,
        # then persist the updated weights.
        agent.load_weights('/home/bdb3m/swmm_rl/agent_weights/ddpg_swmm_weights.h5f')
        agent.fit(env, nb_steps=train_steps, verbose=0)
        agent.save_weights('/home/bdb3m/swmm_rl/agent_weights/ddpg_swmm_weights.h5f', overwrite=True)
    env.close()
    if file_num % 10 == 0:
        print("finished training on ", file_num, " files")
    file_num += 1

# loop through testing envs
for file in os.scandir("/home/bdb3m/swmm_rl/syn_inp_test"):
    if file.name.endswith('.inp'):
        print('testing ', file.name)
        env = BasicEnv(inp_file=file.path, depth=depth)
        history = agent.test(env, nb_episodes=1, visualize=False, nb_max_start_steps=0)
        env.close()

        # get rain/tide data from inp file
        rain_str = []
        tide_str = []
        with open(file.path, 'r') as tmp_file:
            lines = tmp_file.readlines()
            for i, l in enumerate(lines):
                if l.startswith("[TIMESERIES]"):  # find time series section
                    start = i + 3
                    # NOTE(review): the inner loop reuses `i`/`l`, shadowing
                    # the outer loop variables — presumably intentional since
                    # only the lines after the section header are scanned.
                    for i, l in enumerate(lines[start:]):
                        if l.startswith('Atlas14'):
                            rain_str.append(l)
                        if l.startswith('Tide1'):
                            tide_str.append(l)
# Create Actor and Critic networks k.clear_session() actor = get_actor(obs_n, actions_n) critic, action_input = get_critic(obs_n, actions_n) print(actor.summary()) print(critic.summary()) memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=actions_n, theta=.15, mu=0., sigma=.1) agent = DDPGAgent(nb_actions=actions_n[0], actor=actor, critic=critic, batch_size=64, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000, random_process=random_process, gamma=.99) agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mse']) #agent.load_weights('ddpg_' + ENV_NAME + 'weights.h5f') agent.fit(env, env_name=ENV_NAME, nb_steps=500000, action_repetition=5, visualize=False, verbose=1) env = wrappers.Monitor(env,'/home/wolfie/PycharmProjects/pythonProject/ddpg_halfcheetah', video_callable=lambda episode_id: True, force=True) agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=1000, verbose=1) p.disconnect()
# random process for exploration noise random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=theta, dt=0.01, mu=0., sigma=.25) # define the DDPG agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=GAMMA, target_model_update=1e-3) # compile the model agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mse']) callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp) # ---------------------------------------------------------------------------------------------------------------------------------------- # Training phase # fitting the agent #agent.fit(env, nb_steps=800000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=900) # After training is done, we save the final weights. #agent.save_weights('../results/Swimmer6/exp_1/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True) # ----------------------------------------------------------------------------------------------------------------------------------------- # Testing phase agent.load_weights(log_filename_pre+filename_exp +'/ddpg_{}_weights.h5f'.format(ENV_NAME)) history, state_history_nominal, episode_reward_nominal, action_history = agent.test(env, nb_episodes=1, visualize=True, action_repetition=1, nb_max_episode_steps=STEPS_PER_EPISODE, \ initial_state=np.zeros((16,)), std_dev_noise=20, gamma=GAMMA, process_noise_std=process_noise_std) # np.savetxt(log_filename_pre+filename_exp+'/s3_nominal_action.txt', action_history) # np.savetxt(log_filename_pre+filename_exp+'/s3_nominal_state.txt', state_history_nominal) print(state_history_nominal,action_history) # -----------------------------------------------------------------------------------------------------------------------------------------
def run_ddpg():
    """Train and evaluate a DDPG agent on the SnakeGymContinuous environment.

    Builds a 3-hidden-layer actor and critic, trains for 50k steps with
    episodes capped at 200 steps, saves the weights, then runs 5
    visualized evaluation episodes.
    """
    # N_NODE_NETWORK is only read here, so no `global` declaration is needed
    # (the original declared `global N_NODE_NETWORK` without ever assigning).
    env = SnakeGymContinuous()
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # initialize randomness
    np.random.seed(123)
    env.seed(123)

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(N_NODE_NETWORK))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    print(actor.summary())

    # Critic Q(s, a): twice as wide as the actor's hidden layers.
    action_input = Input(shape=(nb_actions, ), name='action_input')
    observation_input = Input(shape=(1, ) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = Concatenate()([action_input, flattened_observation])
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(N_NODE_NETWORK * 2)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    memory = SequentialMemory(limit=100000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15, mu=0., sigma=.3)
    agent = DDPGAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=action_input,
                      memory=memory,
                      nb_steps_warmup_critic=500,
                      nb_steps_warmup_actor=500,
                      random_process=random_process,
                      gamma=.99,
                      target_model_update=1e-3)
    agent.compile('adam', metrics=['mae'])

    agent.fit(env, nb_steps=50000, visualize=True, verbose=2,
              nb_max_episode_steps=200)
    agent.save_weights('ddpg_SnakeGymContinuous_weights.h5f', overwrite=True)
    agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
# Optionally, we can reload a previous model's weights and continue training from there # Remove the _actor or _critic from the filename. The load method automatically # appends these. WEIGHTS_FILENAME = 'weights/ddpg_planar_crane_continuous-v0_weights.h5f' # agent.load_weights(WEIGHTS_FILENAME) callbacks = [] checkpoint_weights_filename = 'weights/ddpg_{}_checkpointWeights_{{step}}_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID) log_filename = 'logs/ddpg_{}_log_{}_{}_{}_{}.json'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID) #callbacks += [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=100000)] callbacks += [FileLogger(log_filename, interval=100)] # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. agent.fit(env, nb_steps=NUM_STEPS, callbacks=callbacks, visualize=False, verbose=1)#, nb_max_episode_steps=500) # After training is done, we save the final weights. filename = 'weights/ddpg_{}_weights_{}_{}_{}_{}.h5f'.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID) agent.save_weights(filename, overwrite=True) # We'll also save a simply named version to make running test immediately # following training easier. filename = 'weights/ddpg_{}_weights.h5f'.format(ENV_NAME) agent.save_weights(filename, overwrite=True) # Finally, evaluate our algorithm for 5 episodes. agent.test(env, visualize=True) #nb_max_episode_steps=500,
def train_with_params(sigma_v = 0., sigma_o = 0.,test=False):
    """Train (or evaluate) a DDPG agent on the solo-Pong environment.

    Args:
        sigma_v: std-dev of the observation noise on velocity.
        sigma_o: std-dev of the observation noise on orientation.
        test: when True, skip training and evaluate saved weights instead.
    """
    ENV_NAME = 'PongSolo'
    conf_name = '{}_sv_{}_so_{}'.format(ENV_NAME,sigma_v,sigma_o)  # sv, so = sigma_v et sigma_orientation

    # Get the environment and extract the number of actions.
    # Bug fix: the original passed sigma_o=sigma_v, so the orientation-noise
    # parameter was silently ignored.
    env = EnvPongSolo(sigma_v=sigma_v, sigma_o=sigma_o)
    np.random.seed(123)
    #assert len(env.action_space.shape) == 1
    nb_actions = 1
    leaky_alpha = 0.2

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    actor.add(Dense(100))
    actor.add(LeakyReLU(leaky_alpha))
    actor.add(Dense(nb_actions))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space.shape,
                              name='observation_input')
    flattened_observation = Flatten()(observation_input)
    # NOTE: `merge(..., mode='concat')` and Model(input=..., output=...) are
    # Keras 1.x APIs; kept because this script targets that Keras version.
    x = merge([action_input, flattened_observation], mode='concat')
    x = Dense(200)(x)
    x = LeakyReLU(leaky_alpha)(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(input=[action_input, observation_input], output=x)
    print(critic.summary())

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=100000, window_length=1)
    n_steps = 5000000
    # Exploration noise annealed from sigma=0.3 down to 0.01 over the run.
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=1., mu=0.,
                                              sigma=.3, sigma_min=0.01,
                                              n_steps_annealing=n_steps)
    agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                      critic_action_input=action_input, memory=memory,
                      nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                      random_process=random_process, gamma=.99,
                      target_model_update=1e-3)
    agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    # Okay, now it's time to learn something! You can always safely abort the
    # training prematurely using Ctrl + C.
    directory_weights = "weights/ddpg/{}".format(conf_name)
    if not os.path.exists(directory_weights):
        os.makedirs(directory_weights)

    if not test:
        # Checkpoint whenever the 800-episode average reward improves.
        perfCheckPoint = ModelPerformanceCheckpoint(
            '{}/checkpoint_avg{}_steps{}'.format(directory_weights, '{}', '{}'), 800)
        agent.fit(env, nb_steps=n_steps, visualize=False, verbose=2,
                  nb_max_episode_steps=200, callbacks=[perfCheckPoint])
        # After training is done, we save the final weights.
        agent.save_weights('{}/final.h5f'.format(directory_weights), overwrite=True)
        # Finally, evaluate our algorithm for 100 episodes.
        agent.test(env, nb_episodes=100, visualize=False, nb_max_episode_steps=200)
    else:
        agent.load_weights('{}/final.h5f'.format(directory_weights))
        agent.test(env, nb_episodes=1000, visualize=False, nb_max_episode_steps=200)
class DDPG:
    """Deep Deterministic Policy Gradient Class

    This is an implementation of DDPG for continuous control tasks made using
    the high level keras-rl library.

    Args:
        env_name (str): Name of the gym environment
        weights_dir (str): Dir for storing model weights (for both actor and
            critic as separate files)
        actor_layers (list(int)): Neurons in each subsequent hidden layer of
            the actor; defaults to [64, 64, 32]
        critic_layers (list(int)): Neurons in each subsequent hidden layer of
            the critic; defaults to [128, 128, 64]
        n_episodes (int): Maximum steps per training episode
        visualize (bool): Whether a popup window with the environment view is
            required
    """

    def __init__(self,
                 env_name='MountainCarContinuous-v0',
                 weights_dir="model_weights",
                 actor_layers=None,
                 critic_layers=None,
                 n_episodes=200,
                 visualize=True):
        self.env_name = env_name
        self.env = gym.make(env_name)
        # Fixed seeds for reproducibility.
        np.random.seed(123)
        self.env.seed(123)
        # Avoid the mutable-default-argument pitfall: fall back to the
        # documented layer sizes when the caller passes nothing.
        self.actor_layers = [64, 64, 32] if actor_layers is None else actor_layers
        self.critic_layers = [128, 128, 64] if critic_layers is None else critic_layers
        self.n_episodes = n_episodes
        self.visualize = visualize
        self.n_actions = self.env.action_space.shape[0]
        self.n_states = self.env.observation_space.shape
        self.weights_file = os.path.join(
            weights_dir, 'ddpg_{}_weights.h5f'.format(self.env_name))
        self.actor = None
        self.critic = None
        self.agent = None
        self.action_input = None

    def _make_actor(self):
        """Internal helper function to create the actor model
        """
        self.actor = Sequential()
        self.actor.add(Flatten(input_shape=(1, ) + self.n_states))
        for size in self.actor_layers:
            self.actor.add(Dense(size, activation='relu'))
        self.actor.add(Dense(self.n_actions, activation='linear'))
        self.actor.summary()

    def _make_critic(self):
        """Internal helper function to create the critic model
        """
        action_input = Input(shape=(self.n_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + self.n_states,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        input_layer = Concatenate()([action_input, flattened_observation])
        hidden_layers = Dense(self.critic_layers[0],
                              activation='relu')(input_layer)
        for size in self.critic_layers[1:]:
            hidden_layers = Dense(size, activation='relu')(hidden_layers)
        output_layer = Dense(1, activation='linear')(hidden_layers)
        self.critic = Model(inputs=[action_input, observation_input],
                            outputs=output_layer)
        self.critic.summary()
        self.action_input = action_input

    def _make_agent(self):
        """Internal helper function to create the actor-critic agent model
        """
        if self.actor is None:
            self._make_actor()
        if self.critic is None:
            self._make_critic()
        memory = SequentialMemory(limit=100000, window_length=1)
        random_process = OrnsteinUhlenbeckProcess(size=self.n_actions,
                                                  theta=.15, mu=0., sigma=.3)
        self.agent = DDPGAgent(nb_actions=self.n_actions,
                               actor=self.actor,
                               critic=self.critic,
                               critic_action_input=self.action_input,
                               memory=memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=random_process,
                               gamma=.99,
                               target_model_update=1e-3)
        self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

    def _load_or_make_agent(self):
        """Internal helper function to load an agent model, creates a new
        one if no model weights exist
        """
        if self.agent is None:
            self._make_agent()
        if os.path.exists(self.weights_file):
            logger.info(
                "Found existing weights for the model for this environment. Loading..."
            )
            self.agent.load_weights(self.weights_file)

    def train(self):
        """Train the DDPG agent
        """
        self._load_or_make_agent()
        # NOTE(review): n_episodes is passed as nb_max_episode_steps, i.e. it
        # caps steps per episode, not the number of episodes — kept as-is to
        # preserve behaviour; the docstring above documents it accordingly.
        self.agent.fit(self.env,
                       nb_steps=50000,
                       visualize=self.visualize,
                       verbose=1,
                       nb_max_episode_steps=self.n_episodes)
        self.agent.save_weights(self.weights_file, overwrite=True)

    def test(self, nb_episodes=5):
        """Test the DDPG agent
        """
        logger.info(
            "Testing the agents with {} episodes...".format(nb_episodes))
        self.agent.test(self.env,
                        nb_episodes=nb_episodes,
                        visualize=self.visualize,
                        nb_max_episode_steps=200)
class Agent:
    """DDPG agent for an OpenSim (osim-rl) environment.

    Besides the standard keras-rl DDPG setup, it maintains a "symmetric
    actor": a frozen state-swap layer -> actor -> frozen action-swap layer
    pipeline that is periodically trained towards the actor's own outputs to
    encourage a left/right-symmetric policy.
    """

    def __init__(self, env):
        self.nb_actions = env.action_space.shape[0]
        self.nb_states = env.observation_space.shape[0]
        self.env = env
        self.actor = self.build_actor(env)
        self.actor.compile('Adam', 'mse')
        self.critic, action_input = self.build_critic(env)
        self.loss = self.build_loss()
        # Running observation whitening (mean/std normalisation).
        self.processor = WhiteningNormalizerProcessor()
        self.memory = SequentialMemory(limit=5000000, window_length=1)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.nb_actions,
                                                       theta=0.75,
                                                       mu=0.5,
                                                       sigma=0.25)
        self.agent = DDPGAgent(nb_actions=self.nb_actions,
                               actor=self.actor,
                               critic=self.critic,
                               critic_action_input=action_input,
                               memory=self.memory,
                               nb_steps_warmup_critic=100,
                               nb_steps_warmup_actor=100,
                               random_process=self.random_process,
                               gamma=.99,
                               target_model_update=1e-3,
                               processor=self.processor)
        # Separate optimizers: actor lr=1e-4, critic lr=1e-3.
        self.agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=self.loss)
        self.sym_actor = self.build_sym_actor()
        self.sym_actor.compile(optimizer='Adam', loss='mse')

    def build_loss(self):
        """Metric list handed to DDPGAgent.compile()."""
        return ['mse']

    def build_actor(self, env):
        """Two 64-unit tanh layers with Gaussian noise; hard-sigmoid output
        keeps actions in [0, 1]."""
        actor = Sequential()
        actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
        actor.add(Dense(64, activation='tanh'))
        actor.add(GaussianNoise(0.05))
        actor.add(Dense(64, activation='tanh'))
        actor.add(GaussianNoise(0.05))
        actor.add(Dense(self.nb_actions, activation='hard_sigmoid'))
        actor.summary()
        # Wrap the Sequential model so it can also be applied as a layer
        # (build_sym_actor calls self.actor on a tensor).
        inD = Input(shape=(1, ) + env.observation_space.shape)
        out = actor(inD)
        return Model(inD, out)

    def build_critic(self, env):
        """Q(s, a): observation trunk first, action concatenated in after."""
        action_input = Input(shape=(self.nb_actions, ), name='action_input')
        observation_input = Input(shape=(1, ) + env.observation_space.shape,
                                  name='observation_input')
        flattened_observation = Flatten()(observation_input)
        x = Dense(64, activation='relu')(flattened_observation)
        x = Concatenate()([x, action_input])
        x = Dense(32, activation='relu')(x)
        x = Dense(1)(x)
        critic = Model(inputs=[action_input, observation_input], outputs=x)
        critic.summary()
        return critic, action_input

    def build_sym_actor(self):
        """Build the swap-state -> actor -> swap-action model used for the
        symmetry loss."""
        stateSwap = []
        actionSwap = []
        # Pair each right-side state entry with its left-side counterpart.
        state_desc = self.env.get_state_desc()
        for x in state_desc:
            keys = list(state_desc[x].keys())
            for (k, key) in enumerate(keys):
                if '_r' in key:
                    mirror = key.replace('_r', '_l')
                    # list.index never returns -1 (it raises ValueError), so
                    # the original `if i != -1` guard was ineffective; check
                    # membership explicitly to survive missing counterparts.
                    if mirror in keys:
                        i = keys.index(mirror)
                        stateSwap += [(k, i), (i, k)]
        # Same left/right pairing for muscles (actions).
        muscle_list = [
            self.env.osim_model.muscleSet.get(i).getName()
            for i in range(self.env.osim_model.muscleSet.getSize())
        ]
        for (k, key) in enumerate(muscle_list):
            if '_r' in key:
                mirror = key.replace('_r', '_l')
                if mirror in muscle_list:
                    i = muscle_list.index(mirror)
                    actionSwap += [(k, i), (i, k)]
        # Permutation matrices implementing the swaps.
        stateSwapMat = np.zeros((self.nb_states, self.nb_states))
        actionSwapMat = np.zeros((self.nb_actions, self.nb_actions))
        for (i, j) in stateSwap:
            stateSwapMat[i, j] = 1
        for (i, j) in actionSwap:
            actionSwapMat[i, j] = 1

        def ssT(shape, dtype=None):
            # Kernel initializer returning the fixed state-swap matrix.
            if shape != stateSwapMat.shape:
                raise Exception("State Swap Tensor Shape Error")
            return K.variable(stateSwapMat, dtype=dtype)

        def asT(shape, dtype=None):
            # Kernel initializer returning the fixed action-swap matrix.
            if shape != actionSwapMat.shape:
                raise Exception("Action Swap Tensor Shape Error")
            return K.variable(actionSwapMat, dtype=dtype)

        # Frozen linear layer that permutes the state...
        model1 = Sequential()
        model1.add(
            Dense(self.nb_states,
                  input_shape=(1, ) + self.env.observation_space.shape,
                  trainable=False,
                  kernel_initializer=ssT,
                  bias_initializer='zeros'))
        inD = Input(shape=(1, ) + self.env.observation_space.shape)
        symState = model1(inD)
        symPol = self.actor(symState)
        # ...and a frozen layer that permutes the resulting action.
        model2 = Sequential()
        model2.add(
            Dense(self.nb_actions,
                  input_shape=(1, self.nb_actions),
                  trainable=False,
                  kernel_initializer=asT,
                  bias_initializer='zeros'))
        out = model2(symPol)
        return Model(inD, out)

    def fit(self, **kwargs):
        """Run DDPG training, then one symmetric-loss update on the actor."""
        # Propagate the per-episode step cap into the env spec so the env
        # agrees with keras-rl about episode length.
        if 'nb_max_episode_steps' in kwargs:
            self.env.spec.timestep_limit = kwargs['nb_max_episode_steps']
        else:
            self.env.spec.timestep_limit = self.env.time_limit
        out = self.agent.fit(self.env, **kwargs)
        print("\n\ndo symetric loss back propigation\n\n")
        # Train the symmetric actor towards the actor's own outputs on random
        # synthetic states (one sample per 200 training steps).
        states = np.random.normal(
            0, 10, (kwargs['nb_steps'] // 200, 1, self.nb_states))
        actions = self.actor.predict_on_batch(states)
        self.sym_actor.train_on_batch(states, actions)
        return out

    def test(self, **kwargs):
        """Evaluate the agent; mirrors fit()'s timestep-limit handling."""
        print("testing")
        print("VA:", self.env.get_VA())
        if 'nb_max_episode_steps' in kwargs:
            self.env.spec.timestep_limit = kwargs['nb_max_episode_steps']
        else:
            self.env.spec.timestep_limit = self.env.time_limit
        return self.agent.test(self.env, **kwargs)

    def test_get_steps(self, **kwargs):
        """Return the step count of the last test episode."""
        return self.test(**kwargs).history['nb_steps'][-1]

    def save_weights(self, filename='osim-rl/ddpg_{}_weights.h5f'):
        """Save agent weights plus the whitening statistics."""
        self.agent.save_weights(filename.format("opensim"), overwrite=True)
        self.save_processor()

    def load_weights(self, filename='osim-rl/ddpg_{}_weights.h5f'):
        """Load agent weights plus the whitening statistics."""
        self.agent.load_weights(filename.format("opensim"))
        self.load_processor()

    def search_VA(self):  # 1-D line search
        """Lower the env's VA parameter towards 0 while the policy still
        survives at least 80% of its baseline step count."""
        state = self.env.get_VA()
        goal = 0.0
        if abs(state - goal) < 0.01:
            self.env.upd_VA(goal)
            return
        steps = self.test_get_steps(nb_episodes=1,
                                    visualize=False,
                                    nb_max_episode_steps=1000)
        dv = 0.0
        dsteps = steps
        while (state - dv > goal and dsteps > 0.8 * steps):
            dv += 0.02
            self.env.upd_VA(state - dv)
            dsteps = self.test_get_steps(nb_episodes=1,
                                         visualize=False,
                                         nb_max_episode_steps=1000)
        if abs((state - dv) - goal) < 0.01:
            self.env.upd_VA(goal)
        else:
            # Back off the last (failing) decrement.
            dv -= 0.02
            self.env.upd_VA(state - dv)

    def save_processor(self):
        """Persist the whitening normalizer's running statistics."""
        np.savez('osim-rl/processor.npz',
                 _sum=self.processor.normalizer._sum,
                 _count=np.array([self.processor.normalizer._count]),
                 _sumsq=self.processor.normalizer._sumsq,
                 mean=self.processor.normalizer.mean,
                 std=self.processor.normalizer.std)

    def load_processor(self):
        """Restore the whitening normalizer's running statistics."""
        f = np.load('osim-rl/processor.npz')
        dtype = f['_sum'].dtype
        if self.processor.normalizer is None:
            self.processor.normalizer = WhiteningNormalizer(
                shape=(1, ) + self.env.observation_space.shape, dtype=dtype)
        self.processor.normalizer._sum = f['_sum']
        self.processor.normalizer._count = int(f['_count'][0])
        self.processor.normalizer._sumsq = f['_sumsq']
        self.processor.normalizer.mean = f['mean']
        self.processor.normalizer.std = f['std']
# NOTE(review): script fragment — continues a critic definition started
# earlier: `x`, `action_input`, `observation_input`, `actor`, `env`,
# `nb_actions` and `ENV_NAME` are defined above in the original script.
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in tensorflow.keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                  critic_action_input=action_input, memory=memory,
                  nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99,
                  target_model_update=1e-3)
agent.compile(Adam(learning_rate=.001, clipnorm=1.), metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
agent.fit(env, nb_steps=50000, visualize=True, verbose=1, nb_max_episode_steps=200)

# After training is done, we save the final weights.
agent.save_weights(f'ddpg_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)
# pos = env.arraystate2pos(state) # print(pos) # optimal_action = np.zeros(2) # action, optimal_action, a, b, theta = agent.test(env, nb_episodes=500000, visualize=False, nb_max_episode_steps=200, modif = True, pos = pos) # env.non_random_reset(pos[0], pos[1], pos[2]) # env.render = True # env.step(action, rand = optimal_action, a = a, b = b, theta = theta) # state = env.reset() # pos = env.arraystate2pos(state) # optimal_action = np.zeros(2) # optimal_action[0], optimal_action[1], a, b, theta = agent.test(env, nb_episodes=500000, visualize=False, nb_max_episode_steps=200, modif = True, pos = pos) # env.non_random_reset(pos[0], pos[1], pos[2]) # env.render = True # env.step(optimal_action, a = a, b = b, theta = theta) nb_test = 50 for i in range(nb_test): env.render = False state = env.reset() pos = env.arraystate2pos(state) optimal_action = np.zeros(2) action, optimal_action, a, b, theta = agent.test(env, nb_episodes=500000, visualize=False, nb_max_episode_steps=200, modif=True, pos=pos) env.non_random_reset(pos[0], pos[1], pos[2]) env.render = True env.step(action, rand=optimal_action, a=a, b=b, theta=theta)