x = Dense(32, activation='relu')(x) x = Dense(1, activation='linear')(x) critic = Model(inputs=(action_input, observation_input), outputs=x) print(critic.summary()) # Define a memory buffer for the agent, allows to learn from past experiences memory = SequentialMemory( limit=10000, window_length=window_length ) # Create a random process for exploration during training # this is essential for the DDPG algorithm random_process = OrnsteinUhlenbeckProcess( theta=0.5, mu=0.0, sigma=0.2 ) # Create the agent for DDPG learning agent = DDPGAgent( # Pass the previously defined characteristics nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, random_process=random_process, # Define the overall training parameters nb_steps_warmup_actor=2048,
def main(train_test_flag='train'): get_custom_objects().update( {'SmoothLogistic': Activation(smooth_logistic)}) model_name = '2,2,3x32Net_r4_lr{}_th{}_[t{}s{}]_nAnn[{},{}]_{}'. \ format(LR, SUCCESS_THRESHOLD, THETA, SIGMA, SIGMA_MIN, NUM_STEPS_ANNEALING, NUM_MUSCLES) muscle_labels = ["m" + str(i) for i in np.array(range(NUM_MUSCLES))] training = False weight_filename = os.path.join(c.trained_directory, '{}_weights.h5f'.format(model_name)) log_file_name = begin_time + '_' + model_name while True: try: env = PointModel2dEnv(verbose=0, success_thres=SUCCESS_THRESHOLD, dof_observation=DOF_OBSERVATIONS, include_follow=False, port=PORT, muscle_labels=muscle_labels, log_file=log_file_name) break except ConnectionRefusedError as e: print("Server not started: ", e) time.sleep(10) try: env.seed(123) nb_actions = env.action_space.shape[0] memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1) mu_model = get_mu_model(env) v_model = get_v_model(env) l_model = get_l_model(env) random_process = OrnsteinUhlenbeckProcess( size=nb_actions, theta=THETA, mu=MU, sigma=SIGMA, dt=DT, sigma_min=SIGMA_MIN, n_steps_annealing=NUM_STEPS_ANNEALING) # random_process = None processor = PointModel2dProcessor() agent = MuscleNAFAgent(nb_actions=nb_actions, V_model=v_model, L_model=l_model, mu_model=mu_model, memory=memory, nb_steps_warmup=WARMUP_STEPS, random_process=random_process, gamma=GAMMA, target_model_update=UPDATE_TARGET_MODEL_STEPS, processor=processor, target_episode_update=True) agent.compile(Adam(lr=LR), metrics=['mse']) env.agent = agent pprint.pprint(agent.get_config(False)) load_weights(agent, weight_filename) tensorboard = RlTensorBoard(log_dir=os.path.join( c.tensorboard_log_directory, log_file_name), histogram_freq=HISTOGRAM_FREQ, batch_size=BATCH_SIZE, write_graph=True, write_grads=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None, agent=agent) csv_logger = keras.callbacks.CSVLogger(os.path.join( c.agent_log_directory, log_file_name), append=False, separator=',') if train_test_flag == 'train': # train code training = True agent.fit(env, nb_steps=NUM_TRAINING_STEPS, visualize=False, verbose=VERBOSITY, nb_max_episode_steps=NUM_MAX_EPISODE_STEPS, callbacks=[tensorboard, csv_logger]) print('Training complete') save_weights(agent, weight_filename) elif train_test_flag == 'test': # test code training = False env.log_to_file = False history = agent.test(env, nb_episodes=NUM_EPISODES, nb_max_episode_steps=NUM_MAX_EPISODE_STEPS) print(history.history) print('Average last distance: ', np.mean(history.history['last_distance'])) print('Mean Reward: ', np.mean(history.history['episode_reward'])) except Exception as e: if training: save_weights(agent, weight_filename) print("Error in main code:", str(e)) env.net.sock.close() raise e
flattened_observation = Flatten()(observation_input) x = concatenate([action_input, flattened_observation]) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(1)(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Set up the agent for training memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.noutput) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1.) # agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model, # memory=memory, nb_steps_warmup=1000, random_process=random_process, # gamma=.99, target_model_update=0.1)
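# --- Not in the original source: a minimal sketch of the three heads that the commented-out
# ContinuousDQNAgent (NAF) alternative above would need. Layer sizes are illustrative
# assumptions; env is assumed to expose a gym-style observation_space, and nb_actions is
# the value already defined above.
from keras.models import Model, Sequential
from keras.layers import Activation, Concatenate, Dense, Flatten, Input

# V(s): scalar state value.
V_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(32, activation='relu'),
    Dense(1, activation='linear'),
])

# mu(s): the greedy action.
mu_model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(32, activation='relu'),
    Dense(nb_actions, activation='linear'),
])

# L(s, a): entries of the lower-triangular matrix that parameterises the advantage term.
naf_action_input = Input(shape=(nb_actions,), name='naf_action_input')
naf_observation_input = Input(shape=(1,) + env.observation_space.shape, name='naf_observation_input')
l = Concatenate()([naf_action_input, Flatten()(naf_observation_input)])
l = Dense(32, activation='relu')(l)
l = Dense(nb_actions * (nb_actions + 1) // 2, activation='linear')(l)
L_model = Model(inputs=[naf_action_input, naf_observation_input], outputs=l)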
def main(args): sigma, learning_rate, file_prefix = args env = ModifiedArmEnv(visualize=False) input_shape = (1, ) + env.observation_space.shape nb_actions = env.action_space.shape[0] # Create actor and critic networks actor = Sequential() actor.add(Flatten(input_shape=input_shape)) actor.add(Dense(32)) actor.add(Activation('relu')) actor.add(Dense(32)) actor.add(Activation('relu')) actor.add(Dense(32)) actor.add(Activation('relu')) actor.add(Dense(nb_actions)) actor.add(Activation('sigmoid')) action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=input_shape, name='observation_input') flattened_observation = Flatten()(observation_input) x = concatenate([action_input, flattened_observation]) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) # Set up the agent for training memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=sigma, dt=env.stepsize, size=env.noutput) agent = DDPGAgent( nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1., ) agent.compile(Adam(lr=learning_rate, clipnorm=1.), metrics=['mae']) # Train the model training_history = RewardsLogger() env.reset() agent.fit( env, nb_steps=100000, visualize=False, verbose=1, nb_max_episode_steps=200, log_interval=10000, callbacks=[training_history], ) # Save weights and training history agent.save_weights(file_prefix + '_weights.h5f', overwrite=True) pickledump(training_history, file_prefix + '_training_history.pkl') # Set test parameters test_nb_episodes = 10 test_nb_max_episode_steps = 1000 # Run test test_history = ObservationsLogger() env.reset() agent.test( env, nb_episodes=test_nb_episodes, visualize=False, nb_max_episode_steps=test_nb_max_episode_steps, callbacks=[test_history], ) # Save test history pickledump(test_history, file_prefix + '_test_history.pkl')
h3 = Activation('relu')(h3) h4 = Dense(HYP.HIDDEN_UNITS_2, name='Q_h4')(h3) h4 = Dropout(HYP.DROPOUT)(h4) h4 = Activation('relu')(h4) Qvalues = Dense(1, activation='linear', name='Q_last')(h4) critic = Model(inputs=[action_input, observation_input], outputs=Qvalues) print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=HYP.MEMORY, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=HYP.THETA, mu=HYP.MU, sigma=HYP.SIGMA) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, batch_size=HYP.BATCH_SIZE, nb_steps_warmup_actor=HYP.WARMUP_ACTOR, nb_steps_warmup_critic=HYP.WARMUP_CRITIC, random_process=random_process, gamma=HYP.GAMMA, target_model_update=HYP.TAU) agent.compile(Adam(lr=HYP.LEARN_R, clipnorm=HYP.CLIPNORM), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for
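# --- Not in the original source: a hypothetical HYP container matching the hyper-parameter
# fields referenced in this excerpt; every value below is an illustrative assumption.
from types import SimpleNamespace

HYP = SimpleNamespace(
    HIDDEN_UNITS_2=64, DROPOUT=0.2,                       # critic architecture
    MEMORY=100000,                                        # replay-buffer size
    THETA=0.15, MU=0.0, SIGMA=0.3,                        # Ornstein-Uhlenbeck exploration noise
    BATCH_SIZE=64, WARMUP_ACTOR=100, WARMUP_CRITIC=100,
    GAMMA=0.99, TAU=1e-3,                                 # discount and target-model update rate
    LEARN_R=1e-3, CLIPNORM=1.0,                           # optimizer settings
)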
def __init__(self): ENV_NAME = 'drone' # Get the environment and extract the number of actions. #env = gym.make(ENV_NAME) env = drone_sim() np.random.seed(123) env.seed(123) assert len(env.action_space.shape) == 1 nb_actions = env.action_space.shape[0] # Next, we build a very simple model. self.actor = Sequential() self.actor.add(Flatten(input_shape=(1, ) + env.observation_space.shape)) self.actor.add(Dense(16)) self.actor.add(Activation('relu')) self.actor.add(Dense(16)) self.actor.add(Activation('relu')) self.actor.add(Dense(16)) self.actor.add(Activation('relu')) self.actor.add( Dense(nb_actions, activation='tanh', kernel_initializer=RandomUniform())) self.actor.add(Lambda(lambda x: x * 60.0)) print(self.actor.summary()) action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=(1, ) + env.observation_space.shape, name='observation_input') flattened_observation = Flatten()(observation_input) x = Concatenate()([action_input, flattened_observation]) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) self.agent = DDPGAgent(nb_actions=nb_actions, actor=self.actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
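# --- Not in the original source: once the constructor above has run, the wrapped agent can be
# trained and evaluated like any keras-rl agent. The class name, step counts and weights
# filename below are illustrative assumptions.
controller = DroneController()      # hypothetical name for the class defined above
sim = drone_sim()                   # same simulator type used inside __init__
controller.agent.fit(sim, nb_steps=50000, visualize=False, verbose=1)
controller.agent.save_weights('ddpg_drone_weights.h5f', overwrite=True)
controller.agent.test(sim, nb_episodes=5, visualize=False)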
print(critic.summary()) filename_exp = 'exp_0' log_filename_pre = '../results/T2D1/' process_noise_std = 0*20 theta=0.15 GAMMA = 1 # GAMMA of our cumulative reward function STEPS_PER_EPISODE = 400 # No. of time-steps per episode # configure and compile our agent by using built-in Keras optimizers and the metrics! # allocate the memory by specifying the maximum no. of samples to store memory = SequentialMemory(limit=800000, window_length=1) # random process for exploration noise #random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, dt=0.01, mu=0., sigma=.2) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=theta, dt=0.01, mu=0., sigma=.35, sigma_min=0.01) # define the DDPG agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=GAMMA, target_model_update=5e-4) # compile the model agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mse']) callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp) # ---------------------------------------------------------------------------------------------------------------------------------------- # Training phase # fitting the agent, after training is done, we save the final weights. # agent.fit(env, nb_steps=600000, visualize=False, callbacks=callbacks, verbose=1, gamma=GAMMA, nb_max_episode_steps=STEPS_PER_EPISODE, process_noise_std=process_noise_std) # agent.save_weights(log_filename_pre+filename_exp+'/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
NB_STEPS = 50000 PRE_WARM_STEP = 0 SAVE_INTERVAL = 200 naive_env = env.unwrapped step_length = naive_env.simulation.step_length / 1000 plan_horizon = naive_env.horizon goal_length = naive_env.goal_length NB_MAX_EPISODE_STEPS = goal_length / ( (step_length * plan_horizon) * 5 ) # episode length / (times per action * min v) # turn left agent left_processor = WhiteningNormalizerProcessor() left_memory = SequentialMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGTH) left_random_process = OrnsteinUhlenbeckProcess(size=lower_nb_actions, theta=RANDOM_PROCESS_THETA, mu=RANDOM_PROCESS_MU, sigma=RANDOM_PROCESS_SIGMA) left_agent = DDPGAgent(processor=left_processor, nb_actions=lower_nb_actions, actor=left_actor_model, critic=left_critic_model, critic_action_input=critic_action_input, memory=left_memory, nb_steps_warmup_critic=NB_STEPS_WARMUP_CRITIC, nb_steps_warmup_actor=NB_STEPS_WARMUP_ACTOR, random_process=left_random_process, gamma=GAMMA, target_model_update=TARGET_MODEL_UPDATE, batch_size=BATCH_SIZE_LOWER) left_agent.compile(Adam(lr=OPTIMIZER_LR, clipnorm=OPTIMIZER_CLIPNORM), metrics=['mae'])
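# --- Not in the original source: an illustrative training call for the left-turn agent.
# keras-rl expects an integer episode length, so the float computed above is cast here;
# the checkpoint filename is an assumption.
left_agent.fit(env, nb_steps=NB_STEPS, visualize=False, verbose=1,
               nb_max_episode_steps=int(NB_MAX_EPISODE_STEPS))
left_agent.save_weights('left_ddpg_weights.h5f', overwrite=True)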
flattened_observation = Flatten()(observation_input) x = Concatenate()([action_input, flattened_observation]) x = Dense(32, activation='relu')(x) x = Dense(32, activation='relu')(x) x = Dense(32, activation='relu')(x) x = Dense(1, activation='linear')(x) critic = Model(inputs=(action_input, observation_input), outputs=x) print(critic.summary()) # Create a replay memory memory = SequentialMemory(limit=50000, window_length=window_length) # Create a random process for exploration during training random_process = OrnsteinUhlenbeckProcess(theta=0.5, mu=0.0, sigma=0.2, sigma_min=0.02, n_steps_annealing=150000) # Create the agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, random_process=random_process, nb_steps_warmup_actor=1024, nb_steps_warmup_critic=1024, target_model_update=1000, gamma=0.9, batch_size=128,
filename_exp='exp_s/exp_0' log_filename_pre = '../results/Pendulum/' process_noise_std = 0.5*5.8 # no ref theta=0.15 sigma=6 GAMMA=1 # GAMMA of our cumulative reward function STEPS_PER_EPISODE = 30 # No. of time-steps per episode # configure and compile our agent by using built-in Keras optimizers and the metrics! # allocate the memory by specifying the maximum no. of samples to store memory = SequentialMemory(limit=300000, window_length=1) # random process for exploration noise random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=theta, mu=0., dt=0.01, sigma=sigma) # define the DDPG agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=GAMMA, target_model_update=1e-3) # compile the model as follows agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mse']) callbacks = common_func.build_callbacks(ENV_NAME, log_filename_pre, filename_exp) # ---------------------------------------------------------------------------------------------------------------------------------------- # Training phase # fitting the agent. After training is done, save the final weights. # 240000 # agent.fit(env, nb_steps=300000, visualize=False, callbacks=callbacks, verbose=1, nb_max_episode_steps=STEPS_PER_EPISODE, process_noise_std=process_noise_std) # agent.save_weights(log_filename_pre+filename_exp+'/ddpg_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
x = concatenate([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.3)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=10.)  # clip the critic's TD error to [-10, 10]
# A list of two optimizers compiles the actor and the critic with separate optimizers.
agent.compile([RMSprop(lr=.001), RMSprop(lr=.001)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
def __init__(self, env, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, batch_size=32, lr=.01, clipnorm=1., gamma=.99, target_model_update=1e-2, theta=.15, mu=0., sigma=.3, name=None): memory_len = 1 self.env = env self.name = name action_input = Input(shape=(self.env.action_space.shape[0],), name='action_input') observation_input = Input(shape=(memory_len, self.env.obs_steps, len(self.env.observation_space)), name='observation_input') processed_observation = Lambda(lambda x: K.squeeze(x, axis=1), name='processed_observation')(observation_input) # # def process_obs(obs): # obs = K.squeeze(obs, axis=1) # obs = K.(obs[:,:,1], obs[:,:,2]) ## Shared layers # self.shared_convs = [Conv1D(32, kernel_size=1, padding='same', activation='relu')] * 2 + \ # [Conv1D(32, kernel_size=6, padding='same', activation='relu')] * 3 # # self.shared_grus = [GRU(32, activation='relu', return_sequences=True)] * 3 +\ # [GRU(32, activation='relu', return_sequences=False)] ## Shared vision and sequence models # Vision model c = BatchNormalization()(processed_observation) c1 = Conv1D(8, kernel_size=1, padding='same', activation='selu')(c) # c1 = BatchNormalization()(c1) c1 = Conv1D(16, kernel_size=3, padding='same', activation='selu')(c1) c1 = MaxPool1D(strides=1, padding='same')(c1) c2 = Conv1D(8, kernel_size=1, padding='same', activation='selu')(c) # c2 = BatchNormalization()(c2) c2 = Conv1D(16, kernel_size=6, padding='same', activation='selu')(c2) c3 = MaxPool1D(strides=1, padding='same')(c2) c3 = Conv1D(8, kernel_size=6, padding='same', activation='selu')(c3) c4 = Concatenate(axis=-1)([c1, c2, c3]) # c5 = BatchNormalization()(c4) c5 = Conv1D(8, kernel_size=12, padding='same', activation='selu')(c4) # c5 = BatchNormalization()(c5) c5 = MaxPool1D(strides=1, padding='same')(c5) c5 = Conv1D(16, kernel_size=24, padding='same', activation='selu')(c5) c5 = MaxPool1D(strides=1, padding='same')(c5) c6 = Concatenate(axis=-1)([c1, c2, c3, c4, c5]) # Sequence model # b = BatchNormalization()(c6) r = GRU(16, activation='selu', return_sequences=True)(c6) # b2 = BatchNormalization()(b1) r = GRU(16, activation='selu', return_sequences=True)(r) # b3 = BatchNormalization()(b2) r = GRU(16, activation='selu', return_sequences=True)(r) # b4 = BatchNormalization()(b3) r = GRU(16, activation='selu', return_sequences=False)(r) # Shape conforming f = Flatten()(c6) k = Concatenate(axis=-1)([r, f]) # Actor voting system # a = BatchNormalization()(k) a = Dense(512, activation='selu')(k) # a = Dropout(0.2)(a) a = Concatenate(axis=-1)([a, k]) # a = BatchNormalization()(a) a = Dense(256, activation='selu')(a) # a = Dropout(0.2)(a) a = Concatenate(axis=-1)([a, k]) # a = BatchNormalization()(a) a = Dense(128, activation='selu')(a) # a = Dropout(0.2)(a) a = Concatenate(axis=-1)([a, k]) # a = BatchNormalization()(a) a = Dense(64, activation='selu')(a) # a = Dropout(0.2)(a) # a = BatchNormalization()(a) actor_out = Dense(self.env.action_space.shape[0], activation='sigmoid')(a) # Critic value estimator d = Concatenate(axis=-1)([action_input, k]) # d = BatchNormalization()(d) d = Dense(512, activation='selu')(d) # d = Dropout(0.2)(d) d = Concatenate(axis=-1)([d, k]) # d = BatchNormalization()(d) d = Dense(256, activation='selu')(d) # d = Dropout(0.2)(d) # d = BatchNormalization()(d) d = Concatenate(axis=-1)([d, k]) d = Dense(128, activation='selu')(d) # d = Dropout(0.2)(d) # d = BatchNormalization()(d) d = Concatenate(axis=-1)([d, k]) d = Dense(64, activation='selu')(d) # d = Dropout(0.2)(d) # d = BatchNormalization()(d) critic_out = 
Dense(self.env.action_space.shape[0], activation='sigmoid')(d) # Define and compile models self.actor = Model(inputs=observation_input, outputs=actor_out) self.critic = Model(inputs=[action_input, observation_input], outputs=critic_out) self.memory = SequentialMemory(limit=10000, window_length=memory_len) random_process = OrnsteinUhlenbeckProcess(size=self.env.action_space.shape[0], theta=theta, mu=mu, sigma=sigma) super().__init__(nb_actions=self.env.action_space.shape[0], actor=self.actor, batch_size=batch_size, critic=self.critic, critic_action_input=action_input, memory=self.memory, nb_steps_warmup_critic=nb_steps_warmup_critic, nb_steps_warmup_actor=nb_steps_warmup_actor, random_process=random_process, gamma=gamma, target_model_update=target_model_update) self.compile(Nadam(lr=lr, clipnorm=clipnorm), metrics=['mae'])
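# --- Not in the original source: hypothetical usage of the DDPG subclass defined above.
# "TradingDDPGAgent" is an assumed class name (the snippet does not show it), and the
# step count and filename are illustrative.
agent = TradingDDPGAgent(env, lr=1e-3, sigma=0.3, name='run-0')
agent.fit(env, nb_steps=100000, visualize=False, verbose=1)
agent.save_weights('ddpg_trading_weights.h5f', overwrite=True)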
x = concatenate([action_input, flattened_observation]) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Set up the agent for training memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=env.get_action_space_size()) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3, delta_clip=1.) # agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model, # memory=memory, nb_steps_warmup=1000, random_process=random_process, # gamma=.99, target_model_update=0.1)
def __init__(self, env, *args, **kwargs):
    super(KerasDDPGAgent, self).__init__(*args, **kwargs)
    self.env = env

    # assert len(env.action_space.shape) == 1
    # TODO: is there a way to output a tuple (6,1)
    nb_actions = sum(sum(1 for i in row if i) for row in self.env.action_space.sample())

    # TODO: terminology? feature or observation?
    observation = env.reset()
    print(">>>>>>>>>>>>>>>>>>>", observation.shape)

    # TODO: find a way to customize network
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + observation.shape))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(16))
    actor.add(Activation('relu'))
    actor.add(Dense(nb_actions))
    actor.add(Activation('tanh'))
    actor.add(Lambda(lambda x: x * 3.14159))  # scale the tanh output to roughly [-pi, pi]
    print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + observation.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(32)(x)
    x = Activation('relu')(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    memory = SequentialMemory(limit=500000, window_length=1)
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
    self.agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic,
                           critic_action_input=action_input, memory=memory,
                           nb_steps_warmup_critic=1000, nb_steps_warmup_actor=1000,
                           random_process=random_process, gamma=.99, target_model_update=1e-3)
    self.agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
x = Activation('relu')(x)

# Output layer
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=2 * NUM_STEPS, window_length=1)
# random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, dt=env.tau, theta=0.6, mu=0.0, sigma=0.5,
                                          sigma_min=0.15, n_steps_annealing=NUM_STEPS)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.999, target_model_update=1e-3, delta_clip=1.0)
def main(): history, stock_list, marketdate, item_list = read_stock_history() history = history[:, :, :4] target_stocks = stock_list num_training_time = 2000 window_length = 50 nb_actions = len(target_stocks) + 1 # action_dim = [nb_actions] # state_dim = [window_length, nb_actions] # batch_size = 64 # action_bound = 1. # tau = 1e-3 # learning_rate = 1e-4 # # predictor_type = 'cnn' # use_batch_norm = True # # CONFIG = {'seed': 1234, # 'episode': 1, # 'batch_size': 256, # 'gamma': 0.99, # 'buffer_size': 100, # 'max_step': 500, # 'tau': 0.001 # } # get target history target_history = np.empty(shape=(num_training_time, len(target_stocks), history.shape[2])) target_marketdate = marketdate[:num_training_time] for i, stock in enumerate(target_stocks): target_history[:, i, :] = history[:num_training_time, stock_list.index(stock), :] env = PortfolioEnv(target_history, target_stocks, target_marketdate, steps=252, window_len=window_length) np.random.seed(123) env.seed(123) action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=(1, window_length, nb_actions, 1), name='observation_input') reshaped_obs_input = Reshape( (window_length, nb_actions, 1))(observation_input) x = Conv2D(32, kernel_size=(3, 1))(reshaped_obs_input) x = BatchNormalization()(x) x = Activation('relu')(x) x = Conv2D(32, kernel_size=(1, 1))(x) x = BatchNormalization()(x) x = Activation('relu')(x) x = Flatten()(x) x = Dense(64, activation='relu')(x) x = Dense(64, activation='relu')(x) w_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003, seed=None) x = Dense(nb_actions, activation='softmax', kernel_initializer=w_init)(x) # x = NALU(nb_actions)(x) actor = Model(inputs=observation_input, outputs=x) # actor = Sequential() # actor.add(Conv2D(32, kernel_size=(1, 3), input_shape=state_dim + [1])) # actor.add(BatchNormalization()) # actor.add(Activation('relu')) # actor.add(Conv2D(32, kernel_size=(1, 1))) # actor.add(BatchNormalization()) # actor.add(Activation('relu')) # actor.add(Flatten()) # actor.add(Dense(64, activation='relu')) # actor.add(Dense(64, activation='relu')) # w_init = keras.initializers.RandomUniform(minval=-0.05, maxval=0.05, seed=None) # actor.add(Dense(nb_actions, activation='softmax', kernel_initializer=w_init)) print(actor.summary()) x = Conv2D(32, kernel_size=(3, 1))(reshaped_obs_input) x = BatchNormalization()(x) l1 = Activation('relu')(x) x = Conv2D(32, kernel_size=(1, 1))(l1) x = BatchNormalization()(x) l2 = Activation('relu')(x) flattened_observation = Flatten()(l2) x = Concatenate()([action_input, flattened_observation]) x = Dense(64)(x) l3 = Activation('relu')(x) x = Dense(64)(l3) l4 = Activation('relu')(x) w_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003, seed=None) x = Dense(1, activation='linear', kernel_initializer=w_init)(l4) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) # Okay, now it's time to learn something! 
We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. agent_history = agent.fit(env, nb_steps=1000000, visualize=False, verbose=1, nb_max_episode_steps=252) # After training is done, we save the final weights. agent.save_weights('ddpg_{}_weights.h5f'.format('RL_ENV2_TEST'), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. agent.test(env, nb_episodes=5, visualize=False, nb_max_episode_steps=252)
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(64)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(theta=float(args.theta), mu=0., sigma=float(args.sigma),
                                          size=env.noutput)
agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                  memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100,
                  random_process=random_process, gamma=.99, target_model_update=1e-3,
                  delta_clip=100.)  # clip the critic's TD error to [-100, 100]
# agent = ContinuousDQNAgent(nb_actions=env.noutput, V_model=V_model, L_model=L_model, mu_model=mu_model,
#                            memory=memory, nb_steps_warmup=1000, random_process=random_process,
#                            gamma=.99, target_model_update=0.1)
def main2(mode='train', tc='plus', load_model=True, train_visualize=False, train_steps=50, n_episodes=10): history, factor_id, marketdate = factor_history() target_assets = factor_id window_length = 50 mem_size = 100 steps = 252 nb_actions = len(target_assets) # get target history import copy target_history = copy.deepcopy(history) target_marketdate = copy.deepcopy(marketdate) if tc == 'plus': trading_cost = 0.001 elif tc == 'zero': trading_cost = 0.00 else: trading_cost = -0.01 env = PortfolioEnv(target_history, target_assets, target_marketdate, steps=steps, window_length=window_length, trading_cost=trading_cost) action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=(1, window_length, nb_actions, 1), name='observation_input') reshaped_obs_input = Reshape( (window_length, nb_actions, 1))(observation_input) x = Conv2D(10, kernel_size=(30, 1))(reshaped_obs_input) x = BatchNormalization()(x) x = Activation('linear', name='actor_layer_1')(x) x = LeakyReLU()(x) x = Conv2D(10, kernel_size=(1, 1))(x) x = BatchNormalization()(x) x = Activation('linear', name='actor_layer_2')(x) x = LeakyReLU()(x) flattened_observation = Flatten()(x) x = Dense(64, activation='linear')(flattened_observation) x = LeakyReLU()(x) x = Dense(32, activation='linear')(x) x = LeakyReLU(name='actor_layer_3')(x) w_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003, seed=None) x = Dense(nb_actions, activation='softmax', kernel_initializer=w_init)(x) actor = Model(inputs=observation_input, outputs=x) actor_intermediate_1 = Model( inputs=actor.inputs, outputs=actor.get_layer('actor_layer_1').output) actor_intermediate_2 = Model( inputs=actor.inputs, outputs=actor.get_layer('actor_layer_2').output) actor_intermediate_3 = Model( inputs=actor.inputs, outputs=actor.get_layer('actor_layer_3').output) print(actor.summary()) # x = Conv2D(32, kernel_size=(50, 1))(reshaped_obs_input) # x = BatchNormalization()(x) # x = Activation('relu')(x) # x = Conv2D(32, kernel_size=(1, 1))(x) # x = BatchNormalization()(x) # x = Activation('relu')(x) # flattened_observation = Flatten()(x) x = Concatenate()([action_input, flattened_observation]) x = Dense(64, activation='linear', name='critic_layer_1')(x) x = LeakyReLU()(x) x = Dense(32, activation='linear', name='critic_layer_2')(x) x = LeakyReLU()(x) w_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003, seed=None) x = Dense(1, activation='linear', kernel_initializer=w_init, name='critic_layer_3')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) critic_intermediate_1 = Model( inputs=critic.inputs, outputs=critic.get_layer('critic_layer_1').output) critic_intermediate_2 = Model( inputs=critic.inputs, outputs=critic.get_layer('critic_layer_2').output) critic_intermediate_3 = Model( inputs=critic.inputs, outputs=critic.get_layer('critic_layer_3').output) print(critic.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! 
memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.3) agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=mem_size, nb_steps_warmup_actor=mem_size, random_process=random_process, gamma=.90, target_model_update=1e-3) agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae']) if load_model: weights_filename = 'model_tc_{}.h5f'.format(tc) agent.load_weights(weights_filename) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. n_loop = n_episodes output_actor_1 = np.zeros([ n_loop, actor_intermediate_1.output_shape[1], actor_intermediate_1.output_shape[2] ]) output_actor_2 = np.zeros([ n_loop, actor_intermediate_2.output_shape[1], actor_intermediate_2.output_shape[2] ]) output_actor_3 = np.zeros([n_loop, actor_intermediate_3.output_shape[1]]) output_critic_1 = np.zeros([n_loop, critic_intermediate_1.output_shape[1]]) output_critic_2 = np.zeros([n_loop, critic_intermediate_2.output_shape[1]]) output_critic_3 = np.zeros([n_loop, critic_intermediate_3.output_shape[1]]) if mode == 'train': for l in range(n_loop): agent_history = agent.fit(env, nb_steps=steps * train_steps, visualize=train_visualize, verbose=1, nb_max_episode_steps=steps) agent.save_weights('model_tc_{}.h5f'.format(tc), overwrite=True) # obs, _, _ = env.src._step() recent_obs = agent.recent_observation.reshape( 1, 1, window_length, nb_actions, 1) recent_action = agent.recent_action.reshape(1, nb_actions) output_actor_1[l] = actor_intermediate_1.predict(recent_obs)[ 0, :, :, 0] output_actor_2[l] = actor_intermediate_2.predict(recent_obs)[ 0, :, :, 0] output_actor_3[l] = actor_intermediate_3.predict(recent_obs) # print("1: {}, 2:{}, 3:{}, 0's:{}, 1's:{}".format(np.mean(output_actor_1.squeeze(), axis=2), # np.mean(output_actor_2.squeeze(), axis=2), # output_actor_3, # np.sum(output_actor_3 == 0), # np.sum(output_actor_3 > 0))) output_critic_1[l] = critic_intermediate_1.predict( [recent_action, recent_obs]) output_critic_2[l] = critic_intermediate_2.predict( [recent_action, recent_obs]) output_critic_3[l] = critic_intermediate_3.predict( [recent_action, recent_obs]) print("1: {}, 2:{}, 3:{}".format(output_critic_1[l], output_critic_2[l], output_critic_3[l])) agent.test(env, nb_episodes=1, visualize=True, nb_max_episode_steps=steps) plot_layer_3d(output_actor_1) plot_layer_3d(output_actor_2) plot_layer_2d(output_actor_3) plot_layer_2d(output_critic_1) plot_layer_2d(output_critic_2) plot_layer_2d(output_critic_3) else: weights_filename = 'model_tc_{}.h5f'.format(tc) agent.load_weights(weights_filename) agent.test(env, nb_episodes=1, visualize=True, nb_max_episode_steps=steps)
def main(layers1=[200], layers2=[200], leaky_alpha=0.10, ENV_NAME='EnvPong', show=False,
         wall_reward=-0.1, touch_reward=0.3, n_steps=80000, n_alternances=10, L_R=0.0001,
         only_test=False, opp_aware=[1, 1], myopie=[0.00, 0.00], ball_speed=1.0,
         weights1_name='', weights2_name=''):
    ENV_NAME = ENV_NAME
    conf_name = "{}_layers1={}__layers2={}__leaky={}__lr={}__opp={}__myopia={}__speed={}".format(
        ENV_NAME, layers1, layers2, leaky_alpha, L_R, opp_aware, myopie, ball_speed)
    # gym.undo_logger_setup()

    # Get the environment and extract the number of actions.
    if ENV_NAME == 'Env2D':
        env = Game2D(2.)
    elif ENV_NAME == 'Env2DSoloSpin':
        env = Game2DSolo(2., spinRacket=True)
    elif ENV_NAME == 'Env3DSolo':
        env = Game3DSolo(2., 9.8, 0.5, 7., 3.)
    elif ENV_NAME == 'EnvPong':
        env = Pong(PongPlayer(None, opp_aware=(opp_aware[0] == 1)),
                   PongPlayer(None, opp_aware=(opp_aware[1] == 1)))
    np.random.seed(123)
    # env.seed(123)
    assert len(env.action_space.shape) == 1
    nb_actions = env.action_space.shape[0]

    # Next, we build a very simple model.
    actor = Sequential()
    actor.add(Flatten(input_shape=(1,) + env.observation_space_1.shape))
    # actor.add(keras.layers.normalization.BatchNormalization())
    for size in layers1:
        actor.add(Dense(size, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None)))
        # actor.add(keras.layers.core.Dropout(0.2))
        actor.add(LeakyReLU(leaky_alpha))
        # actor.add(keras.layers.normalization.BatchNormalization())
    actor.add(Dense(nb_actions, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None),
                    bias_regularizer=regularizers.l2(0.01)))
    # actor.add(keras.layers.core.Dropout(0.2))
    actor.add(Activation('linear'))
    print(actor.summary())

    action_input = Input(shape=(nb_actions,), name='action_input')
    observation_input = Input(shape=(1,) + env.observation_space_1.shape, name='observation_input')
    flattened_observation = Flatten()(observation_input)
    x = concatenate([action_input, flattened_observation])
    # x = keras.layers.normalization.BatchNormalization()(x)
    for size in layers1:
        x = Dense(size)(x)
        # x = keras.layers.core.Dropout(0.2)(x)
        x = LeakyReLU(alpha=leaky_alpha)(x)
        # x = keras.layers.normalization.BatchNormalization()(x)
    x = Dense(1)(x)
    x = Activation('linear')(x)
    critic = Model(inputs=[action_input, observation_input], outputs=x)
    print(critic.summary())

    actor2 = Sequential()
    actor2.add(Flatten(input_shape=(1,) + env.observation_space_2.shape))
    # actor2.add(keras.layers.normalization.BatchNormalization())
    for size in layers2:
        actor2.add(Dense(size, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None)))
        # actor2.add(keras.layers.core.Dropout(0.2))
        actor2.add(LeakyReLU(alpha=leaky_alpha))
    actor2.add(Dense(nb_actions, kernel_initializer=RandomUniform(minval=-0.005, maxval=0.005, seed=None),
                     bias_regularizer=regularizers.l2(0.01)))
    # actor2.add(keras.layers.core.Dropout(0.2))
    actor2.add(Activation('linear'))
    print(actor2.summary())

    action_input2 = Input(shape=(nb_actions,), name='action_input')
    observation_input2 = Input(shape=(1,) + env.observation_space_2.shape, name='observation_input')
    flattened_observation2 = Flatten()(observation_input2)
    x2 = concatenate([action_input2, flattened_observation2])
    # x2 = keras.layers.normalization.BatchNormalization()(x2)
    for size in layers2:
        x2 = Dense(size)(x2)
        # x2 = keras.layers.core.Dropout(0.2)(x2)
        x2 = LeakyReLU(alpha=leaky_alpha)(x2)
    x2 = Dense(1)(x2)
    x2 = Activation('linear')(x2)
    critic2 = Model(inputs=[action_input2, observation_input2], outputs=x2)
    print(critic2.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    memory1 = SequentialMemory(limit=50000, window_length=1)
    if opp_aware[0] != opp_aware[1]:
        memory2 = SequentialMemory(limit=50000, window_length=1)
    else:
        memory2 = memory1
    random_process1 = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.1, mu=0., sigma=.15, sigma_min=0.,
                                               n_steps_annealing=n_steps / 4)  # Explores less at the end?
    random_process2 = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.1, mu=0., sigma=.15, sigma_min=0.,
                                               n_steps_annealing=4 * n_steps)
    agent1 = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input,
                       memory=memory1, nb_steps_warmup_critic=5000, nb_steps_warmup_actor=5000,
                       random_process=random_process1, gamma=.99, target_model_update=1e-3, batch_size=100)
    agent2 = DDPGAgent(nb_actions=nb_actions, actor=actor2, critic=critic2, critic_action_input=action_input2,
                       memory=memory2, nb_steps_warmup_critic=5000, nb_steps_warmup_actor=5000,
                       random_process=random_process2, gamma=.99, target_model_update=1e-3, batch_size=100)
    # agent.compile(Adam(lr=L_R, clipnorm=1., clipvalue=0.5), metrics=['mae'])
    agent1.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])
    agent2.compile(Adam(lr=L_R, clipnorm=1.), metrics=['mae'])

    player1 = PongPlayer(agent1, myopie=myopie[0], opp_aware=(opp_aware[0] == 1))
    player2 = PongPlayer(agent2, myopie=myopie[1], opp_aware=(opp_aware[1] == 1))

    # Grid -4
    # Add -1 when lost
    # CEM method
    directory_log = "logs/ddpg/{}".format(conf_name)
    directory_weights = "weights/ddpg/{}".format(conf_name)
    if not os.path.exists(directory_log):
        os.makedirs(directory_log)
    if not os.path.exists(directory_weights):
        os.makedirs(directory_weights)

    if only_test:
        '''if weights1_name =='':
            weights1_name = "{}/player1_final".format(directory_weights)
        if weights2_name == '':
            weights2_name = "{}/player2_final".format(directory_weights)
        #if os.path.isfile(weights1_name) and os.path.isfile(weights2_name):
        agent1.load_weights(weights1_name)
        agent2.load_weights(weights2_name)'''
        agent1.load_weights("{}/player1_{}".format(directory_weights, "final"))
        agent2.load_weights("{}/player1_{}".format(directory_weights, "final"))
        env = makeEnv(player1, player2, ENV_NAME, ball_speed=ball_speed)
        for i in range(10):
            playPong(env)
        confrontPlayers(env)
        plotStrategy(env)
    else:
        for i in range(n_alternances):
            print("Alternation no. {} \n".format(i))

            def learning_rate_schedule(epoch):
                return L_R

            if ENV_NAME == 'Env2D':
                env = Game2D(agent2, wall_reward=wall_reward, touch_reward=touch_reward)
            elif ENV_NAME == 'EnvPong':
                env = Pong(player1, player2, wall_reward=wall_reward, touch_reward=touch_reward,
                           ball_speed=ball_speed)
            agent1.fit(env, nb_steps=n_steps, visualize=False, verbose=1, until_score=True,
                       score_to_reach=0.5, last_episodes=500, nb_max_episode_steps=None,
                       callbacks=[FileLogger("{}/player1_{}.h5f".format(directory_log, i)),
                                  keras.callbacks.LearningRateScheduler(learning_rate_schedule)])
            agent1.test(env, nb_episodes=100, visualize=False, nb_max_episode_steps=500, verbose=1)
            agent1.save_weights("{}/player1_{}".format(directory_weights, i), overwrite=True)
            agent1.memory = SequentialMemory(limit=500000, window_length=1)
            wall_reward = wall_reward * 0.8
            touch_reward = touch_reward * 0.8
            agent2.load_weights("{}/player1_{}".format(directory_weights, i))

    print("End of {}".format(conf_name))
    env = Pong(player1, player2, wall_reward=wall_reward, touch_reward=touch_reward, ball_speed=ball_speed)
    # agent1.fit(env, nb_steps=150000, visualize=False, verbose=2, nb_max_episode_steps=None,
    #            callbacks=[FileLogger("logs/ddpg/{}_weights_steps_leaky_reg_bias_drop_lr{}.h5f".format(ENV_NAME, L_R), interval=100)])
    agent1.save_weights("{}/player1_final".format(directory_weights), overwrite=True)
    agent2.save_weights("{}/player2_final".format(directory_weights), overwrite=True)
    agent1.test(env, nb_episodes=15, visualize=False, nb_max_episode_steps=500, verbose=2)
    if show:
        if ENV_NAME == 'Env2D':
            for i in range(10):
                play2D(player1=agent1, player2=agent1)
        elif ENV_NAME == 'EnvPong':
            for i in range(10):
                playPong(left=agent1, right=agent2)
cfg = tf.ConfigProto(allow_soft_placement=True)
cfg.gpu_options.allow_growth = True

env = gym.make('fooEnv_ID')
# This depends on the space you use: for a Box action_space it would instead be
# env.action_space.shape[0]; reference https://github.com/openai/gym/tree/master/gym/spaces
n_actions = env.action_space.n

# Architecture: simple feed-forward dense net
inp = Input(shape=(1,) + env.observation_space.shape)
fl1 = Flatten()(inp)
dn1 = Dense(100, activation='relu')(fl1)
dn2 = Dense(100, activation='relu')(dn1)
otp = Dense(n_actions, activation='linear')(dn2)
DQNModel = Model(inputs=inp, outputs=otp)

memory = SequentialMemory(limit=50000, window_length=1)
# Exploration for DQN comes from the policy; random processes such as OrnsteinUhlenbeckProcess
# are used by the continuous-action agents instead,
# see https://github.com/keras-rl/keras-rl/blob/master/rl/random.py
policy = BoltzmannQPolicy()
agentDQN = DQNAgent(model=DQNModel, nb_actions=n_actions, memory=memory, nb_steps_warmup=10,
                    target_model_update=1e-2, policy=policy)
agentDQN.compile(Adam(lr=1e-3), metrics=['mae'])
agentDQN.fit(env, nb_steps=10000, visualize=False)
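# --- Not in the original source: an illustration of the comment above on how the action count
# is read off the two common gym space types (the spaces here are arbitrary examples).
from gym import spaces

discrete_space = spaces.Discrete(4)                       # DQN-style: a fixed set of actions
box_space = spaces.Box(low=-1.0, high=1.0, shape=(3,))    # DDPG/NAF-style: continuous vector
assert discrete_space.n == 4
assert box_space.shape[0] == 3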
def __init__(self, env, rl_lr, rl_memory_span): self.real_env = env act_space_shape = self.real_env.action_space.shape obs_space_shape = (self.real_env.observation_space.shape[0] + 1,) assert len(obs_space_shape) == 1 # Configure the Neural Networks of the RL-agent # 1. Actors: rl_num_hidden_layer_actor = 3 rl_num_neurons_per_layer_actor = 16 rl_actor1 = Sequential() # Actor1 is a Sequential Neural Network (MLP) rl_actor1.add(Flatten(input_shape=(1,) + obs_space_shape)) for i in range(rl_num_hidden_layer_actor): # Add the layers to the actor1 NN rl_actor1.add(Dense(rl_num_neurons_per_layer_actor, kernel_initializer=RandomUniform(minval=-1, maxval=1))) rl_actor1.add(Activation('relu')) rl_actor1.add(Dense(act_space_shape[0], kernel_initializer=RandomUniform(minval=-1, maxval=1))) rl_actor1.add(Activation('linear')) rl_actor2 = Sequential() # Actor2 is a Sequential Neural Network (MLP) rl_actor2.add(Flatten(input_shape=(1,) + obs_space_shape)) for i in range(rl_num_hidden_layer_actor): # Add the layers to the actor2 NN rl_actor2.add(Dense(rl_num_neurons_per_layer_actor, kernel_initializer=RandomUniform(minval=-1, maxval=1))) rl_actor2.add(Activation('relu')) rl_actor2.add( Dense(act_space_shape[0], kernel_initializer=RandomUniform(minval=-1, maxval=1))) rl_actor2.add(Activation('linear')) # 2. Critics: rl_num_hidden_layer_critic = 3 rl_num_neurons_per_layer_critic = 32 action_input1 = Input(shape=act_space_shape, name='action_input') observation_input1 = Input(shape=(1,) + obs_space_shape, name='observation_input') flattened_observation1 = Flatten()(observation_input1) rl_critic_nn1 = Concatenate()([action_input1, flattened_observation1]) for i in range(rl_num_hidden_layer_critic): rl_critic_nn1 = Dense(rl_num_neurons_per_layer_critic, kernel_initializer=RandomUniform(minval=-1, maxval=1))(rl_critic_nn1) rl_critic_nn1 = Activation('relu')(rl_critic_nn1) rl_critic_nn1 = Dense(1, kernel_initializer=RandomUniform(minval=-1, maxval=1))(rl_critic_nn1) rl_critic_nn1 = Activation('linear')(rl_critic_nn1) rl_critic1 = Model(inputs=[action_input1, observation_input1], outputs=rl_critic_nn1) action_input2 = Input(shape=act_space_shape, name='action_input') observation_input2 = Input(shape=(1,) + obs_space_shape, name='observation_input') flattened_observation2 = Flatten()(observation_input2) rl_critic_nn2 = Concatenate()([action_input2, flattened_observation2]) for i in range(rl_num_hidden_layer_critic): rl_critic_nn2 = Dense(rl_num_neurons_per_layer_critic, kernel_initializer=RandomUniform(minval=-1, maxval=1))(rl_critic_nn2) rl_critic_nn2 = Activation('relu')(rl_critic_nn2) rl_critic_nn2 = Dense(1, kernel_initializer=RandomUniform(minval=-1, maxval=1))(rl_critic_nn2) rl_critic_nn2 = Activation('linear')(rl_critic_nn2) rl_critic2 = Model(inputs=[action_input2, observation_input2], outputs=rl_critic_nn2) # 3. 
Set training parameters for the Agent and compile it rl_mem_size = int(rl_memory_span * round(1 / self.real_env.dt)) rl_memory1 = SequentialMemory(limit=rl_mem_size, window_length=1) rl_memory2 = SequentialMemory(limit=rl_mem_size, window_length=1) random_process1 = OrnsteinUhlenbeckProcess(size=act_space_shape[0], theta=.15, mu=0., sigma=.3) random_process2 = OrnsteinUhlenbeckProcess(size=act_space_shape[0], theta=.15, mu=0., sigma=.3) self.coop_agent = CoopActionOtherDDPG(nb_actions=act_space_shape[0], actor1=rl_actor1, actor2=rl_actor2, critic1=rl_critic1, critic2=rl_critic2, critic_action_input1=action_input1, critic_action_input2=action_input2, memory1=rl_memory1, memory2=rl_memory2, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process1=random_process1, random_process2=random_process2, gamma=.99, target_model_update=1e-3) self.coop_agent.compile(Adam(lr=rl_lr, clipnorm=1.), metrics=['mae'])
def main(): """Create environment, build models, train.""" #env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=3) #env = MarketEnv(("EUR", "CASH", "IDEALPRO", "USD"), max_quantity=20000, quantity_increment=20000, obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=5, afterhours=False) env = MarketEnv("BTC-USD", max_quantity=10, quantity_increment=1, obs_type='time', obs_size=30, obs_xform=xform.Basic(30, 4), episode_steps=STEPS_PER_EPISODE, client_id=3, loglevel=logging.DEBUG) obs_size = np.product(env.observation_space.shape) # Actor model dropout = 0.1 actor = Sequential([ Flatten(input_shape=(1, ) + env.observation_space.shape), BatchNormalization(), Dense(obs_size, activation='relu'), GaussianDropout(dropout), BatchNormalization(), Dense(obs_size, activation='relu'), GaussianDropout(dropout), BatchNormalization(), Dense(obs_size, activation='relu'), GaussianDropout(dropout), BatchNormalization(), Dense(1, activation='tanh'), ]) print('Actor model') actor.summary() action_input = Input(shape=(1, ), name='action_input') observation_input = Input(shape=(1, ) + env.observation_space.shape, name='observation_input') flattened_observation = Flatten()(observation_input) x = concatenate([action_input, flattened_observation]) x = BatchNormalization()(x) x = Dense(obs_size + 1, activation='relu')(x) x = GaussianDropout(dropout)(x) x = Dense(obs_size + 1, activation='relu')(x) x = GaussianDropout(dropout)(x) x = Dense(obs_size + 1, activation='relu')(x) x = GaussianDropout(dropout)(x) x = Dense(obs_size + 1, activation='relu')(x) x = GaussianDropout(dropout)(x) x = Dense(1, activation='linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print('\nCritic Model') critic.summary() memory = SequentialMemory(limit=EPISODES * STEPS_PER_EPISODE, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.5, mu=0., sigma=.5) agent = DDPGAgent( nb_actions=1, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=STEPS_PER_EPISODE * WARMUP_EPISODES, nb_steps_warmup_actor=STEPS_PER_EPISODE * WARMUP_EPISODES, random_process=random_process, gamma=0.95, target_model_update=0.01) agent.compile('rmsprop', metrics=['mae']) weights_filename = 'ddpg_{}_weights.h5f'.format(env.instrument.symbol) try: agent.load_weights(weights_filename) print( 'Using weights from {}'.format(weights_filename) ) # DDPGAgent actually uses two separate files for actor and critic derived from this filename except IOError: pass agent.fit(env, nb_steps=EPISODES * STEPS_PER_EPISODE, visualize=True, verbose=2, nb_max_episode_steps=STEPS_PER_EPISODE) agent.save_weights(weights_filename, overwrite=True)
def __init__(self, observation_space, action_space, filename='KerasDDPGAgent.h5f'): nb_actions = action_space.shape[0] # Actor network actor = Sequential() actor.add(Flatten(input_shape=(1, ) + observation_space.shape)) actor.add(Dense(256)) actor.add(Activation('relu')) actor.add(Dense(128)) actor.add(Activation('relu')) actor.add(Dense(64)) actor.add(Activation('relu')) actor.add(Dense(nb_actions)) actor.add(Activation('sigmoid')) print(actor.summary()) # Critic network action_input = Input(shape=(nb_actions, ), name='action_input') observation_input = Input(shape=(1, ) + observation_space.shape, name='observation_input') flattened_observation = Flatten()(observation_input) x = concatenate([action_input, flattened_observation]) x = Dense(256)(x) x = Activation('relu')(x) x = Dense(128)(x) x = Activation('relu')(x) x = Dense(64)(x) x = Activation('relu')(x) x = Dense(32)(x) x = Activation('relu')(x) x = Dense(1)(x) x = Activation('linear')(x) critic = Model(inputs=[action_input, observation_input], outputs=x) print(critic.summary()) # Setup Keras RL's DDPGAgent memory = SequentialMemory(limit=100000, window_length=1) random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0., sigma=.2, size=nb_actions) self.agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, batch_size=128, nb_steps_warmup_critic=128, nb_steps_warmup_actor=128, random_process=random_process, gamma=.75, target_model_update=1e-2, delta_clip=2.) self.agent.compile(Adam(lr=.01, clipnorm=2.), metrics=['mae']) self.filename = filename
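# --- Not in the original source: hypothetical persistence helpers for the wrapper class above
# (they belong inside the same class body and reuse the filename stored in __init__).
def save(self):
    self.agent.save_weights(self.filename, overwrite=True)

def load(self):
    self.agent.load_weights(self.filename)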
def __init__(self, first_player: bool, stop_ident_time=1e9, do_rl=False, learning_rate=0.01, activation_fcn='relu', learn_time_delta=0.2, rl_time_delta=0.1, epochs=2, fit_batch_size=20, learn_stack=LearningStack(), real_env=CoopPendulum(), rl_memory_span=50, wolf=0., win_lr_reduction=1, wolf_stop_rl=False): """ Sets various parameters, configures the ident, actor and critic NN and compiles the agent""" super(PartnerApproximatingLearner, self).__init__( first_player) # Call to __init__ of parent class Controller self.learn_stack = learn_stack # Controller specific LearningStack in which to save the experiences self.loosing_lr = learning_rate self.rl_lr = .001 # hyper-parameter self.win_lr_reduction = win_lr_reduction self.wolf = wolf self.wolf_stop_rl = wolf_stop_rl seed = np.random.randint(0, int(1e6)) + int( first_player ) * 100 # -> first player gets different seed than second # Configure neural network for identification: num_hidden_layer_ident = 3 num_neurons_per_layer_ident = 16 act_space_shape = real_env.action_space.shape obs_space_shape = real_env.observation_space.shape ident_nn = Sequential() ident_nn.add( Dense(num_neurons_per_layer_ident, kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed), input_shape=obs_space_shape)) for i in range(num_hidden_layer_ident - 1): # Add the layers to the identification NN ident_nn.add( Dense(num_neurons_per_layer_ident, kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + i))) ident_nn.add(Activation(activation_fcn)) ident_nn.add( Dense(act_space_shape[0], kernel_initializer=RandomUniform(minval=-0.0001, maxval=0.0001, seed=seed + 9))) ident_nn.add(Activation('linear')) opt = Adam(lr=learning_rate) # hyper-parameter ident_nn.compile(optimizer=opt, loss='mse') # hyper-parameter # Use the neural network inside a NNController for easy evaluation of the output: self.ident_ctrl = StaticNNController( first_player=(not self.first_player), neural_net=ident_nn) # Set other identification parameters self.ident_time_delta = learn_time_delta # simulation time between training the other_model with experience self.last_ident_time = 0 # last time ident NN was trained self.epochs = epochs # number of training epochs when its time to identify again self.fit_batch_size = fit_batch_size # size of mini batch that the batch is split into for training by Keras self.stop_ident_time = stop_ident_time # Time at which no training should occur anymore. Used for testing self.do_rl = do_rl if do_rl: self.rl_env = deepcopy(real_env) self.last_rl_time = -1 self.rl_time_delta = rl_time_delta self.rl_env.set_ctrl_other(self.ident_ctrl) try: self.u_limit = self.rl_env.action_space_u1 if first_player else self.rl_env.action_space_u2 except AttributeError: # rl_env does not have individual limits self.u_limit = self.rl_env.action_space # Configure the Neural Networks of the RL-agent # 1. Actor: rl_num_hidden_layer_actor = 3 rl_num_neurons_per_layer_actor = 16 rl_actor = Sequential( ) # Actor is a Sequential Neural Network (MLP) rl_actor.add(Flatten(input_shape=(1, ) + obs_space_shape)) for i in range(rl_num_hidden_layer_actor ): # Add the layers to the actor NN rl_actor.add( Dense(rl_num_neurons_per_layer_actor, kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + 10 + i))) rl_actor.add(Activation(activation_fcn)) rl_actor.add( Dense(act_space_shape[0], kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + 19))) rl_actor.add(Activation('linear')) # 2. 
Critic: rl_num_hidden_layer_critic = 3 rl_num_neurons_per_layer_critic = 32 action_input = Input(shape=act_space_shape, name='action_input') observation_input = Input(shape=(1, ) + obs_space_shape, name='observation_input') flattened_observation = Flatten()(observation_input) rl_critic_nn = Concatenate()([action_input, flattened_observation]) for i in range(rl_num_hidden_layer_critic): rl_critic_nn = Dense(rl_num_neurons_per_layer_critic, kernel_initializer=RandomUniform( minval=-1, maxval=1, seed=seed + 20 + i))(rl_critic_nn) rl_critic_nn = Activation(activation_fcn)(rl_critic_nn) rl_critic_nn = Dense( 1, kernel_initializer=RandomUniform(minval=-1, maxval=1, seed=seed + 29))(rl_critic_nn) rl_critic_nn = Activation('linear')(rl_critic_nn) rl_critic = Model(inputs=[action_input, observation_input], outputs=rl_critic_nn) # 3. Set training parameters for the Agent and compile it rl_frames_per_train = 200 rl_mem_size = int( rl_memory_span * (round(1 / self.rl_time_delta) * rl_frames_per_train)) rl_memory = SequentialMemory(limit=rl_mem_size, window_length=1) random_process = OrnsteinUhlenbeckProcess(size=act_space_shape[0], theta=.15, mu=0., sigma=.3) self.rl_agent = DDPGAgent(nb_actions=act_space_shape[0], actor=rl_actor, critic=rl_critic, critic_action_input=action_input, memory=rl_memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=.99, target_model_update=1e-3) self.rl_agent.compile(Adam(lr=self.rl_lr, clipnorm=1.), metrics=['mae']) self.rl_actor_ctrl = StaticNNController( first_player=self.first_player, neural_net=rl_actor)
log_filename_pre = '../results/Swimmer3/' process_noise_std = 0.00001 * 20 theta = 0.15 GAMMA = 1.0 # GAMMA of our cumulative reward function STEPS_PER_EPISODE = 1600 # No. of time-steps per episode # configure and compile our agent by using built-in Keras optimizers and the metrics! # allocate the memory by specifying the maximum no. of samples to store memory = SequentialMemory(limit=800000, window_length=1) # random process for exploration noise #random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, dt=0.01, mu=0., sigma=.2) random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=theta, dt=0.01, mu=0., sigma=.35, sigma_min=0.05, n_steps_annealing=1500000) # define the DDPG agent agent = DDPGAgent(nb_actions=nb_actions, actor=actor, critic=critic, critic_action_input=action_input, memory=memory, nb_steps_warmup_critic=100, nb_steps_warmup_actor=100, random_process=random_process, gamma=GAMMA, target_model_update=5e-4)
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='relu')(x)
x = Dense(1, activation='linear')(x)
critic = Model(inputs=(action_input, observation_input), outputs=x)
print(critic.summary())

# Define a memory buffer for the agent, which allows it to learn from past experiences
memory = SequentialMemory(limit=5000, window_length=window_length)

# Create a random process for exploration during training;
# this is essential for the DDPG algorithm
random_process = OrnsteinUhlenbeckProcess(theta=0.5,
                                          mu=0.0,
                                          sigma=0.1,
                                          dt=env.physical_system.tau,
                                          sigma_min=0.05,
                                          n_steps_annealing=85000,
                                          size=2)

# Create the agent for DDPG learning
agent = DDPGAgent(
    # Pass the previously defined characteristics
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    random_process=random_process,
    # Define the overall training parameters
except Exception:
    print("...failed.")
    memory = PrioritizedExperience(memory_size=2**14,
                                   alpha=alpha0,
                                   beta=beta0,
                                   window_length=window_size)

try:
    print("Trying to load 'OU process'", end="")
    random_process = pickle.load(open("random_process.pkl", "rb"))
    print("...done.")
except Exception:
    print("...failed.")
    random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                              theta=.15,
                                              mu=0.,
                                              sigma=.2,
                                              n_steps_annealing=nb_steps)

# Skip the warm-up phase if the loaded replay memory is already sufficiently filled
memory_filled = memory.tree.filled_size()
if memory_filled > 1024:
    warmup_steps = 0
else:
    warmup_steps = 1024

agent = DDPG_PERAgent(nb_actions=nb_actions,
                      actor=actor,
                      critic=critic,
                      critic_action_input=models.action_input,
                      memory=memory,
                      nb_steps_warmup_critic=warmup_steps,
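# --- Not part of the original snippet: a hedged sketch of the matching checkpoint
# --- step, assuming the same pickle-based scheme the loading code above implies.
# --- The helper name and the "memory.pkl" file name are assumptions; only
# --- "random_process.pkl" appears in the original loading code.
import pickle

def save_exploration_state(memory, random_process):
    """Persist the replay memory and OU process so a later run can resume with a
    filled buffer (and therefore skip the warm-up steps, as checked above)."""
    with open("memory.pkl", "wb") as f:
        pickle.dump(memory, f)
    with open("random_process.pkl", "wb") as f:
        pickle.dump(random_process, f)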
def controllera(t, joints, links, joint2, joint3, joint4, joint5,
                rewarda_ros, joint1, agent, graph1, session1):
    if agent.value is None:
        # import keras-rl in the NRP through the virtual env
        import site, os
        site.addsitedir(os.path.expanduser(
            '~/.opt/tensorflow_venv/lib/python2.7/site-packages'))
        from keras.models import Model, Sequential
        from keras.layers import Dense, Activation, Flatten, Input, concatenate
        from keras.optimizers import Adam, RMSprop
        from rl.agents import DDPGAgent
        from rl.memory import SequentialMemory
        from rl.random import OrnsteinUhlenbeckProcess
        from keras import backend as K
        from tensorflow import Session, Graph
        K.clear_session()

        obs_shape = (6, )
        nb_actions = 5

        # create the nets for the rl agent
        # actor net
        graph1.value = Graph()
        with graph1.value.as_default():
            session1.value = Session()
            with session1.value.as_default():
                actor = Sequential()
                actor.add(Flatten(input_shape=(1, ) + obs_shape))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(32))
                actor.add(Activation('relu'))
                actor.add(Dense(nb_actions))
                actor.add(Activation('sigmoid'))
                clientLogger.info('actor net init')

                # critic net
                action_input = Input(shape=(nb_actions, ), name='action_input')
                observation_input = Input(shape=(1, ) + obs_shape,
                                          name='observation_input')
                flattened_observation = Flatten()(observation_input)
                x = concatenate([action_input, flattened_observation])
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(64)(x)
                x = Activation('relu')(x)
                x = Dense(1)(x)
                x = Activation('linear')(x)
                critic = Model(inputs=[action_input, observation_input], outputs=x)
                clientLogger.info('critic net init')

                # instantiate the rl agent
                memory = SequentialMemory(limit=1000, window_length=1)
                random_process = OrnsteinUhlenbeckProcess(theta=.15, mu=0.,
                                                          sigma=.2, size=nb_actions)
                agent.value = DDPGAgent(nb_actions=nb_actions,
                                        actor=actor,
                                        critic=critic,
                                        critic_action_input=action_input,
                                        memory=memory,
                                        nb_steps_warmup_critic=10,
                                        nb_steps_warmup_actor=10,
                                        random_process=random_process,
                                        gamma=.99,
                                        batch_size=5,
                                        target_model_update=1e-3,
                                        delta_clip=1.)
                agent.value.training = True
                clientLogger.info('rl agent init')

                # compile first, then restore weights if a checkpoint exists
                agent.value.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])
                clientLogger.info('agent compiled - ready to use')
                PATH = '/home/user/WORK/NRP/NRP-local/Experiments/bf_manipulation_demo/ddpg_weights.h5'
                if os.path.isfile(PATH):
                    print('loading weights')
                    agent.value.load_weights(PATH)
                    clientLogger.info('weights loaded')

    #### run steps
    # graph1.value = Graph()
    with graph1.value.as_default():
        # session1.value = Session()
        with session1.value.as_default():
            import math
            import numpy as np

            angle_lower = links.value.pose[5].position.x
            angle_vel_lower = links.value.pose[7].position.x
            angle_upper = links.value.pose[9].position.x
            angle_vel_upper = links.value.pose[12].position.x
            # clientLogger.info('humerus_angle ', links.value.pose[15].position.y)
            # clientLogger.info('humerus_ang_vel ', angle_vel_lower)
            # clientLogger.info('radius_angle ', angle_upper)
            # clientLogger.info('radius_ang_vel ', angle_vel_lower)
            observation = np.array([
                math.cos(angle_lower),
                math.sin(angle_lower),
                angle_vel_lower,
                math.cos(angle_upper),
                math.sin(angle_upper),
                angle_vel_upper
            ])

            # get movement action from the agent and publish it to the robot
            action = agent.value.forward(observation)
            clientLogger.info('agent stepped forward')

            # move robot
            joint1.send_message(std_msgs.msg.Float64(action[0]))
            joint2.send_message(std_msgs.msg.Float64(-action[1]))
            joint3.send_message(std_msgs.msg.Float64(action[2]))
            joint4.send_message(std_msgs.msg.Float64(action[3]))
            joint5.send_message(std_msgs.msg.Float64(action[4]))

            # Euclidean distance between the two tracked link positions
            reward = math.sqrt(
                math.pow(links.value.pose[57].position.x - links.value.pose[4].position.x, 2) +
                math.pow(links.value.pose[57].position.y - links.value.pose[4].position.y, 2) +
                math.pow(links.value.pose[57].position.z - links.value.pose[4].position.z, 2))
            clientLogger.info('REWARD IS:', reward)
            rewarda_ros.send_message(reward)
            ## the reward in x would have to be minimized to move downwards!
            # -(angle_lower**2 + 0.1*angle_vel_lower**2 +
            #   angle_upper**2 + 0.1*angle_vel_upper**2 +
            #   0.001*np.sum(np.power(action, 2)))

            # learn from the reward
            agent.value.backward(reward)
            clientLogger.info('agent stepped backward')
            agent.value.step += 1
            if agent.value.step % 20 == 0:
                clientLogger.info('saving weights')
                PATH = '/home/user/Desktop/keras_learning_weights/ddpg_weights_a.h5'
                agent.value.save_weights(PATH, overwrite=True)
            clientLogger.info('-------one step done')
flattened_observation = Flatten()(observation_input)
x = Dense(400)(flattened_observation)
x = Activation('relu')(x)
x = Concatenate()([x, action_input])
x = Dense(300)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
print(critic.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras
# optimizer and even the metrics!
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=.15, mu=0., sigma=.1)
agent = DDPGAgent(nb_actions=nb_actions,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  nb_steps_warmup_critic=1000,
                  nb_steps_warmup_actor=1000,
                  random_process=random_process,
                  gamma=.99,
                  target_model_update=1e-3,
                  processor=MujocoProcessor())
agent.compile([Adam(lr=1e-4), Adam(lr=1e-3)], metrics=['mae'])

# Okay, now it's time to learn something! We visualize the training here for show,
# but this slows down training considerably.
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
opti_critic = Adam(lr=LR_CRITIC)

# #### SET UP THE AGENT #####
# Initialize the replay buffer ##
memory = SequentialMemory(limit=REPLAY_BUFFER_SIZE, window_length=1)
# window_length: mainly useful for Atari-style games, where several consecutive
# frames are stacked so the agent can infer quantities such as the ball's velocity

# Random process (exploration) ##
random_process = OrnsteinUhlenbeckProcess(theta=THETA, mu=0, sigma=SIGMA, size=action_size)

# DDPG agent parameters ##
agent = DDPGAgent(nb_actions=action_size,
                  actor=actor,
                  critic=critic,
                  critic_action_input=action_input,
                  memory=memory,
                  random_process=random_process,
                  gamma=DISC_FACT,
                  target_model_update=TARGET_MODEL_UPDATE,
                  batch_size=BATCH_SIZE)
# keras-rl expects the optimizer list in the order [actor_optimizer, critic_optimizer]
agent.compile(optimizer=[opti_actor, opti_critic])
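# --- Not part of the original snippet: a minimal sketch of how the compiled agent
# --- could be trained and evaluated afterwards with keras-rl's standard API. It
# --- assumes `agent` and `env` from the surrounding script; the step count, episode
# --- count and weight file name are illustrative placeholders.
import numpy as np

agent.fit(env, nb_steps=100000, visualize=False, verbose=1)
agent.save_weights('ddpg_weights.h5f', overwrite=True)

# Evaluate the greedy policy (no exploration noise) for a few episodes
history = agent.test(env, nb_episodes=5, visualize=False)
print('Mean episode reward:', np.mean(history.history['episode_reward']))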