class KerasCEMAgent(object):
    '''
    The cross-entropy method learning agent as described in
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf
    '''

    def __init__(self, opts):
        self.metadata = {
            'discrete_actions': True,
        }
        self.opts = opts

    def configure(self, observation_space_shape, nb_actions):
        if self.opts.model_type == 1:
            # Option 1: simple model
            model = Sequential()
            model.add(Flatten(input_shape=(1,) + observation_space_shape))
            model.add(Dense(nb_actions))
            model.add(Activation('softmax'))
            print(model.summary())
        elif self.opts.model_type == 2:
            # Option 2: deep network
            model = Sequential()
            model.add(Flatten(input_shape=(1,) + observation_space_shape))
            model.add(Dense(16))
            model.add(Activation('relu'))
            model.add(Dense(16))
            model.add(Activation('relu'))
            model.add(Dense(16))
            model.add(Activation('relu'))
            model.add(Dense(nb_actions))
            model.add(Activation('softmax'))
            print(model.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras
        # optimizer and even the metrics!
        memory = EpisodeParameterMemory(limit=1000, window_length=1)
        self.agent = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
                              batch_size=50, nb_steps_warmup=2000, train_interval=50,
                              elite_frac=0.05)
        self.agent.compile()

    def train(self, env, nb_steps, visualize, verbosity):
        # Okay, now it's time to learn something! We visualize the training here for show,
        # but this slows down training quite a lot. You can always safely abort the
        # training prematurely using Ctrl + C.
        self.agent.fit(env, nb_steps=nb_steps, visualize=visualize, verbose=verbosity)

    def test(self, env, nb_episodes, visualize):
        # Evaluate the algorithm for the requested number of episodes.
        self.agent.test(env, nb_episodes=nb_episodes, visualize=visualize)

    def load_weights(self, load_file):
        self.agent.load_weights(load_file)

    def save_weights(self, save_file, overwrite):
        # After training is done, we save the best weights.
        self.agent.save_weights(save_file, overwrite=overwrite)
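# For readers unfamiliar with the method cited in the docstring above, here is a
# minimal, self-contained sketch of the cross-entropy loop (illustrative only, not
# the keras-rl internals): sample parameter vectors from a Gaussian, score each by
# episode reward, and refit the Gaussian to the top `elite_frac` of the batch.
import numpy as np

def cem_loop(evaluate, dim, iterations=20, batch_size=50, elite_frac=0.05):
    """Illustrative CEM: `evaluate` maps a parameter vector to an episode reward."""
    mean, std = np.zeros(dim), np.ones(dim)
    n_elite = max(1, int(batch_size * elite_frac))
    for _ in range(iterations):
        thetas = np.random.randn(batch_size, dim) * std + mean  # sample candidates
        rewards = np.array([evaluate(t) for t in thetas])       # score by reward
        elite = thetas[rewards.argsort()[-n_elite:]]            # keep the best
        mean, std = elite.mean(axis=0), elite.std(axis=0)       # refit distribution
    return mean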
def main(env_name, nb_steps):
    # Get the environment and extract the number of actions.
    env = gym.make(env_name)
    np.random.seed(123)
    env.seed(123)

    nb_actions = env.action_space.n
    input_shape = (1,) + env.observation_space.shape
    model = create_nn_model(input_shape, nb_actions)

    # Finally, we configure and compile our agent.
    memory = EpisodeParameterMemory(limit=450, window_length=1)
    agent = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
                     batch_size=50, nb_steps_warmup=2000, train_interval=50,
                     elite_frac=0.05)
    agent.compile()
    agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)

    # After training is done, we save the best weights.
    agent.save_weights('cem_{}_params.h5f'.format(env_name), overwrite=True)

    # Finally, evaluate the agent.
    history = agent.test(env, nb_episodes=100, visualize=False)
    rewards = np.array(history.history['episode_reward'])
    print(("Test rewards (#episodes={}): mean={:>5.2f}, std={:>5.2f}, "
           "min={:>5.2f}, max={:>5.2f}")
          .format(len(rewards), rewards.mean(), rewards.std(),
                  rewards.min(), rewards.max()))
def reinforce_train_cem(self, steps=60000, visualize=False, verbose=1,
                        nb_steps_warmup=10000,
                        save_path=r"D:\Data\markets\weights",
                        save_weights_name="cem_CADJPY_weights.h5f",
                        log_interval=1000):
    memory = EpisodeParameterMemory(limit=200, window_length=1)
    nb_actions = self.env.action_space.n
    agent = CEMAgent(
        model=self.model,
        nb_actions=nb_actions,
        memory=memory,
        nb_steps_warmup=nb_steps_warmup,
        processor=MultiInputProcessor(nb_inputs=len(self.model.inputs)))
    agent.compile()
    agent.fit(self.env, nb_steps=steps, visualize=visualize, verbose=verbose,
              log_interval=log_interval)

    pathlib.Path(save_path).mkdir(parents=True, exist_ok=True)
    file_path = os.path.join(save_path, save_weights_name)
    agent.save_weights(filepath=file_path, overwrite=True)
def create_cem_agent(env):
    '''Create a CEM agent for the given environment.'''
    model = create_deep_model(env)
    nb_actions = env.action_space.n
    memory = EpisodeParameterMemory(limit=1000, window_length=1)
    cem = CEMAgent(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        batch_size=50,
        nb_steps_warmup=2000,
        train_interval=50,
        elite_frac=0.05)
    cem.compile()
    return cem
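# Hypothetical usage of the factory above (assumes `create_environment` and
# `create_deep_model` are defined elsewhere in the same repo, as in the original):
env = create_environment()
cem = create_cem_agent(env)
cem.fit(env, nb_steps=100000, visualize=False, verbose=1)
cem.save_weights('cem_params.h5f', overwrite=True)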
def main(): """Build model and train on environment.""" env = MarketEnv(("ES", "FUT", "GLOBEX", "USD"), obs_xform=xform.BinaryDelta(3), episode_steps=STEPS_PER_EPISODE, client_id=3) #env = MarketEnv(("AAPL", "STK", "SMART", "USD"), obs_xform=xform.BinaryDelta(3), episode_steps=STEPS_PER_EPISODE, client_id=4) nb_actions = 3 # Keras-RL CEM is a discrete agent # Option 1 : Simple model model = Sequential([ Flatten(input_shape=(1,) + env.observation_space.shape), Dense(nb_actions), Activation('softmax') ]) # Option 2: deep network # hidden_nodes = reduce(operator.imul, env.observation_space.shape, 1) # model = Sequential([ # Flatten(input_shape=(1,) + env.observation_space.shape), # Dense(hidden_nodes), # Activation('relu'), # Dense(hidden_nodes), # Activation('relu'), # Dense(hidden_nodes), # Activation('relu'), # Dense(nb_actions), # Activation('softmax') # ]) print(model.summary()) param_logger = CEMParamLogger('cem_{}_params.json'.format(env.instrument.symbol)) callbacks = [ param_logger, FileLogger('cem_{}_log.json'.format(env.instrument.symbol), interval=STEPS_PER_EPISODE) ] theta_init = param_logger.read_params() # Start with last saved params if present if theta_init is not None: print('Starting with parameters from {}:\n{}'.format(param_logger.params_filename, theta_init)) memory = EpisodeParameterMemory(limit=EPISODES, window_length=1) # Remember the parameters and rewards for the last `limit` episodes. cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=EPISODES, nb_steps_warmup=WARMUMP_EPISODES * STEPS_PER_EPISODE, train_interval=TRAIN_INTERVAL_EPISODES, elite_frac=0.2, theta_init=theta_init, processor=DiscreteProcessor(), noise_decay_const=0, noise_ampl=0) """ :param memory: Remembers the parameters and rewards for the last `limit` episodes. :param int batch_size: Randomly sample this many episode parameters from memory before taking the top `elite_frac` to construct the next gen parameters from. :param int nb_steps_warmup: Run for this many steps (total) to fill memory before training :param int train_interval: Train (update parameters) every this many episodes :param float elite_frac: Take this top fraction of the `batch_size` randomly sampled parameters from the episode memory to construct new parameters. """ cem.compile() cem.fit(env, nb_steps=STEPS_PER_EPISODE * EPISODES, visualize=True, verbose=2, callbacks=callbacks) cem.save_weights('cem_{}_weights.h5f'.format(env.instrument.symbol), overwrite=True)
def create(env):
    np.random.seed(config.current.domain_seed)
    env.seed(config.current.domain_seed)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    # Option 1: simple model
    # model = Sequential()
    # model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    # model.add(Dense(nb_actions))
    # model.add(Activation('softmax'))

    # Option 2: deep network
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('softmax'))

    # Finally, we configure and compile our agent. You can use every built-in Keras
    # optimizer and even the metrics!
    memory = EpisodeParameterMemory(limit=1000, window_length=1)
    cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                   nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
    cem.compile()
    return cem
def test_single_cem_input():
    model = Sequential()
    model.add(Flatten(input_shape=(2, 3)))
    model.add(Dense(2))

    memory = EpisodeParameterMemory(limit=10, window_length=2)
    agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5,
                     batch_size=4, train_interval=50)
    agent.compile()
    agent.fit(MultiInputTestEnv((3,)), nb_steps=100)
def test_multi_cem_input():
    input1 = Input(shape=(2, 3))
    input2 = Input(shape=(2, 4))
    x = Concatenate()([input1, input2])
    x = Flatten()(x)
    x = Dense(2)(x)
    model = Model(inputs=[input1, input2], outputs=x)

    memory = EpisodeParameterMemory(limit=10, window_length=2)
    processor = MultiInputProcessor(nb_inputs=2)
    agent = CEMAgent(model, memory=memory, nb_actions=2, nb_steps_warmup=5,
                     batch_size=4, processor=processor, train_interval=50)
    agent.compile()
    agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=100)
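# The MultiInputProcessor above splits each composite observation across the two
# model inputs. For other preprocessing needs, keras-rl's Processor base class can
# be subclassed; a minimal sketch (the class name and the scaling are illustrative,
# not part of the tests above):
import numpy as np
from rl.core import Processor

class ScaleObservationProcessor(Processor):
    """Hypothetical processor: rescale raw observations before they reach
    the model. Plug it into the agent via CEMAgent(..., processor=...)."""

    def __init__(self, scale):
        self.scale = float(scale)

    def process_observation(self, observation):
        # Called on every raw observation coming from the environment.
        return np.asarray(observation, dtype=np.float32) / self.scale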
print(env.observation_space.shape)

# # Option 1: simple model
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(124))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# cem.fit(env, nb_steps=200000, visualize=False, verbose=2)
# cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)
cem.load_weights('cem_{}_params.h5f'.format(ENV_NAME))
cem.test(env, nb_episodes=10, visualize=True)
model.add(Activation('relu'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()
cem.load_weights('cem_{}_params.h5f'.format("citizen-0"))

for move in range(100):
    action = cem.forward(env.citizens[0].vision)
    print(action)
    env.step(action)
    print(env.citizens[0].score)
    time.sleep(0.05)

print(env)
def main(options):
    # Store args.
    model_type = options.model_type
    train_interval_cem = options.train_interval_cem
    batch_size_cem = options.batch_size_cem
    steps_cem = options.steps_cem
    batch_size_props = options.batch_size_props
    steps_props = options.steps_props
    trunc_thres = options.trunc_thres
    Lmax = options.Lmax
    delta = options.delta

    # CEM: init environment.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    model = initModel(model_type, nb_actions, env.observation_space.shape)
    memory = initMemory()

    cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
                   batch_size=batch_size_cem, nb_steps_warmup=1000,
                   train_interval=train_interval_cem, elite_frac=0.05)
    cem.compile()
    callback_cem = cem.fit(env, nb_steps=steps_cem, visualize=False, verbose=0)
    cem.save_weights('cem_dumps/cem_{}_{}_ti_{}_bs_{}_steps_{}.h5f'.format(
        ENV_NAME, model_type, train_interval_cem, batch_size_cem, steps_cem),
        overwrite=True)
    # cem.test(env, nb_episodes=1, visualize=False)

    # PROPS: init environment.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]
    model = initModel(model_type, nb_actions, env.observation_space.shape)
    memory = initMemory()

    bound_opts = {
        'analytic_jac': True,
        'normalize_weights': True,
        'truncate_weights': True,
        'truncate_thresh': trunc_thres
    }
    props = PROPSAgent(model=model, nb_actions=nb_actions, memory=memory,
                       Lmax=Lmax, delta=delta, bound_opts=bound_opts,
                       batch_size=batch_size_props)
    props.compile()
    callback_props = props.fit(env, nb_steps=steps_props, visualize=False, verbose=0)
    props.save_weights(
        'props_dumps/props_{}_{}_bs_{}_steps_{}_thres_{}_Lmax_{}_delta_{}.h5f'.format(
            ENV_NAME, model_type, batch_size_props, steps_props, trunc_thres,
            Lmax, delta),
        overwrite=True)
    # props.test(env, nb_episodes=1, visualize=False)

    df_cem = pd.DataFrame({'data': callback_cem.history['episode_reward']})
    # plt.plot(callback_cem.history['episode_reward'])
    plt.plot(df_cem.rolling(window=train_interval_cem).mean())

    df_props = pd.DataFrame({'data': callback_props.history['episode_reward']})
    # plt.plot(callback_props.history['episode_reward'])
    plt.plot(df_props.rolling(window=batch_size_props).mean())

    plt.legend(['cem', 'props'], loc='upper left')
    # plt.show()
    plt.savefig('plots/{}_{}_bs_{}_thres_{}_Lmax_{}_delta_{}.jpeg'.format(
        ENV_NAME, model_type, batch_size_props, trunc_thres, Lmax, delta))
n_action = len(env_player.action_space)

memory = EpisodeParameterMemory(limit=10000, window_length=1)

model = Sequential()
model.add(Flatten(input_shape=(1, 10)))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(n_action))
model.add(Activation('softmax'))

agent = CEMAgent(model=model, nb_actions=n_action, memory=memory, batch_size=50,
                 nb_steps_warmup=1000, train_interval=50, elite_frac=0.05,
                 noise_ampl=4)
agent.compile()

# Training
env_player.play_against(
    env_algorithm=agent_training,
    opponent=random_opponent,
    env_algorithm_kwargs={"agent": agent, "nb_steps": NB_TRAINING_STEPS,
                          "filename": TRAINING_OPPONENT},
)
model.save("model_%d" % NB_TRAINING_STEPS)

# Evaluation
print("Results against random player:")
env_player.play_against(
def run_agent(agent):
    print("started new process")

    import tensorflow as tf
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    WINDOW_LENGTH = 1
    num_actions = 3
    view_shape = (21, 21)
    input_shape = (WINDOW_LENGTH,) + view_shape

    env = RestrictedViewTronEnv(agent, 10)

    model = Sequential()
    model.add(Permute((2, 3, 1), input_shape=input_shape))
    model.add(Conv2D(16, (3, 3), padding="same"))
    model.add(Activation("relu"))
    model.add(Conv2D(32, (3, 3), padding="same"))
    model.add(Activation("relu"))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(Activation("relu"))
    model.add(Dense(num_actions))
    model.add(Activation('softmax'))

    np.random.seed(2363)

    # policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=2.,
    #                               value_min=.1, value_test=.1, nb_steps=1000000 // 10)
    processor = TronProcessor()
    memory = EpisodeParameterMemory(limit=1000000, window_length=WINDOW_LENGTH)
    cem = CEMAgent(model, nb_actions=num_actions, memory=memory,
                   nb_steps_warmup=50000 // 5, train_interval=4)
    # dqn.compile(Adam(lr=.00025), metrics=["mae"])
    cem.compile()

    weights_filename = 'tmp/dqn_test_weights.h5f'
    checkpoint_weights_filename = 'tmp/dqn_test_weights_{step}.h5f'
    log_filename = 'tmp/dqn_test_log.json'
    callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000 // 10)]
    callbacks += [FileLogger(log_filename, interval=10000)]

    def train(transfer=False):
        print(cem.get_config())  # TODO: save to file
        if transfer:
            cem.load_weights(weights_filename)
        cem.fit(env, callbacks=callbacks, nb_steps=1750000 // 10, log_interval=10000)
        cem.save_weights(weights_filename, overwrite=True)
        cem.test(env, nb_episodes=20, visualize=True)

    def opponent():
        cem.load_weights('tmp/dqn_test_weights.h5f')
        cem.test(env, nb_episodes=200000, visualize=False)

    def test():
        cem.load_weights('tmp/dqn_test_weights.h5f')
        cem.test(env, nb_episodes=20, visualize=True)

    # opponent()
    train()  # True
    out = keras.layers.Concatenate()(dense_out2)
    return keras.models.Model(inp, out)

model = model_fn()
# print(model.summary())

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=se.action_space, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
history = cem.fit(se, nb_steps=50000, visualize=False, verbose=2)

rewards = [x for x in history.history['episode_reward'] if x > 0]

import matplotlib.pyplot as plt
# Note: convolving with np.ones(100) gives a 100-episode sliding-window *sum*
# of the positive rewards (divide by 100 for a moving average).
plt.plot(np.convolve(np.ones(100), rewards, 'valid'))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=10000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=1000,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05,
               noise_decay_const=0.0, noise_ampl=1.0, processor=MujocoProcessor())
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the best weights.
cem.save_weights('cem_CAV_params.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
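# The noise_decay_const / noise_ampl arguments above control extra exploration
# noise added to the sampled parameters. A sketch of the idea (not keras-rl's
# exact code): the amplitude decays exponentially with training progress, so
# with noise_decay_const=0.0, as used here, it stays constant.
import numpy as np

def extra_noise_ampl(step, noise_ampl=1.0, noise_decay_const=0.0):
    # A positive noise_decay_const shrinks exploration noise over time.
    return noise_ampl * np.exp(-noise_decay_const * step)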
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the best weights.
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
cem.test(env, nb_episodes=5, visualize=True)
def main(params=None):
    """Performs training and evaluation with the given params. Returns the model."""
    if params is None:
        params = {
            'model_type': 'dqn_agent',
            'l1_out': 128,
            'l2_out': 64,
            'gamma': 0.5,
            'target_model_update': 1,
            'delta_clip': 0.01,
            'nb_steps_warmup': 1000
        }
    model_type = 'dqn_agent'

    env_player = SimpleRLPlayer(battle_format="gen8randombattle")
    env_player2 = SimpleRLPlayer(battle_format="gen8randombattle")
    opponent = RandomPlayer(battle_format="gen8randombattle")
    second_opponent = MaxDamagePlayer(battle_format="gen8randombattle")

    # Output dimension
    n_action = len(env_player.action_space)

    model_params = params
    model_params['n_actions'] = n_action
    model = get_model(model_params)

    # Alternative hand-built model:
    # model = Sequential()
    # model.add(Dense(128, activation="elu", input_shape=(1, 10)))
    # # Our embedding has shape (1, 10), which affects our hidden layer
    # # dimension and output dimension.
    # # Flattening resolves potential issues that would arise otherwise.
    # model.add(Flatten())
    # model.add(Dense(64, activation="elu"))
    # model.add(Dense(n_action, activation="linear"))
    # elu activation is similar to relu:
    # https://ml-cheatsheet.readthedocs.io/en/latest/activation_functions.html#elu

    # Determine memory type.
    if params['model_type'] in {'dqn_agent', 'sarsa_agent'}:
        memory = SequentialMemory(limit=NB_TRAINING_STEPS, window_length=1)
    else:
        memory = EpisodeParameterMemory(limit=10000, window_length=1)

    # Simple epsilon greedy.
    # What is a linear annealed policy?
    # - it gives gradually decreasing thresholds for the epsilon greedy policy
    # - it acts as a wrapper around epsilon greedy to feed in a custom threshold
    pol_steps = NB_TRAINING_STEPS
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=pol_steps,
    )
    policy_boltz = BoltzmannQPolicy(tau=1)
    policy = policy_boltz

    # Defining our agent.
    if params['model_type'] == 'dqn_agent':
        dqn = DQNAgent(
            model=model,
            nb_actions=len(env_player.action_space),
            policy=policy,
            memory=memory,
            nb_steps_warmup=params['nb_steps_warmup'],
            gamma=params['gamma'],
            target_model_update=params['target_model_update'],
            delta_clip=params['delta_clip'],
            enable_double_dqn=params['enable_double_dqn__'],
            enable_dueling_network=params['enable_double_dqn__'],
            dueling_type=params['dueling_type__'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    elif params['model_type'] == 'sarsa_agent':
        dqn = SARSAAgent(model=model,
                         nb_actions=len(env_player.action_space),
                         policy=policy,
                         nb_steps_warmup=params['nb_steps_warmup'],
                         gamma=params['gamma'],
                         delta_clip=params['delta_clip'])
        dqn.compile(Adam(lr=0.00025), metrics=["mae"])
    else:
        # CEMAgent:
        # https://towardsdatascience.com/cross-entropy-method-for-reinforcement-learning-2b6de2a4f3a0
        dqn = CEMAgent(model=model,
                       nb_actions=len(env_player.action_space),
                       memory=memory,
                       nb_steps_warmup=params['nb_steps_warmup'])
        # CEMAgent has a different compile signature (no optimizer or metrics).
        dqn.compile()

    # Opponent DQN.
    dqn_opponent = DQNAgent(
        model=model,
        nb_actions=len(env_player.action_space),
        policy=policy,
        memory=memory,
        nb_steps_warmup=params['nb_steps_warmup'],
        gamma=params['gamma'],
        target_model_update=params['target_model_update'],
        delta_clip=params['delta_clip'],
        enable_double_dqn=params['enable_double_dqn__'],
        enable_dueling_network=params['enable_double_dqn__'],
        dueling_type=params['dueling_type__'])
    dqn_opponent.compile(Adam(lr=0.00025), metrics=["mae"])

    # rl_opponent = TrainedRLPlayer(model)

    # Training
    rounds = 4
    n_steps = NB_TRAINING_STEPS // rounds
    for k in range(rounds):
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )
        env_player.play_against(
            env_algorithm=dqn_training,
            opponent=second_opponent,
            env_algorithm_kwargs={
                "dqn": dqn,
                "nb_steps": n_steps
            },
        )

    name = params["name"] + "_model"
    model.save(name)
    # loaded_model = tf.keras.models.load_model(name)

    # Evaluation
    print("Results against random player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    print("\nResults against max player:")
    env_player.play_against(
        env_algorithm=dqn_evaluation,
        opponent=second_opponent,
        env_algorithm_kwargs={
            "dqn": dqn,
            "nb_episodes": NB_EVALUATION_EPISODES
        },
    )

    return model
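# The linear annealing described in the comments inside main() above can be
# pictured as a straight-line interpolation of epsilon over training; a sketch
# (not keras-rl's implementation) of the value LinearAnnealedPolicy would feed
# to EpsGreedyQPolicy at a given step:
def annealed_eps(step, value_max=1.0, value_min=0.05, nb_steps=10000):
    # Epsilon falls linearly from value_max to value_min over nb_steps,
    # then stays at value_min.
    frac = min(step / float(nb_steps), 1.0)
    return value_max + frac * (value_min - value_max)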
class KerasCEMAgent(AbstractAgent):
    def __init__(self, env, timesteps_per_episode=10001):
        super().__init__(env, timesteps_per_episode)
        self.evaluating = False
        self.action_size = env.action_space.n
        self.state_size = env.num_states

        self.model = self._build_compile_model()
        memory = EpisodeParameterMemory(limit=1000, window_length=1)
        self.agent = CEMAgent(model=self.model, nb_actions=self.action_size,
                              memory=memory, batch_size=50, nb_steps_warmup=2000,
                              train_interval=50, elite_frac=0.05)

    def run(self) -> {str: float}:
        """
        The agent's training method.
        Returns: a dictionary - {"episode_reward_mean": __, "episode_reward_min": __,
                                 "episode_reward_max": __, "episode_len_mean": __}
        """
        self.agent.compile()
        history = self.agent.fit(self.env, nb_steps=ITER_NUM, visualize=False, verbose=1)
        if len(history.history) > 0:
            episode_reward = history.history["episode_reward"]
            nb_episode_steps = history.history["nb_episode_steps"]
        else:
            episode_reward, nb_episode_steps = [0], [0]  # TODO - placeholder

        result = {EPISODE_REWARD_MEAN: np.array(episode_reward),
                  EPISODE_STEP_NUM_MEAN: np.array(nb_episode_steps),
                  EPISODE_REWARD_MIN: np.empty([]),
                  EPISODE_REWARD_MAX: np.empty([]),
                  EPISODE_VARIANCE: np.empty([])}
        return result

    def _build_compile_model(self):
        # Simple net:
        # model = Sequential()
        # model.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        # model.add(Dense(self.action_size))
        # model.add(Activation('softmax'))
        # return model

        model = Sequential()
        model.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(16))
        model.add(Activation('relu'))
        model.add(Dense(self.action_size))
        model.add(Activation('softmax'))
        return model

    def compute_action(self, state) -> int:
        """
        Computes the best action from a given state.
        Returns: an int that represents the best action.
        """
        # state = np.array([[state]])
        return int(np.argmax(self.model.predict(state)))

    def stop_episode(self):
        pass

    def episode_callback(self, state, action, reward, next_state, terminated):
        pass

    def evaluate(self, visualize=False):
        self.agent.test(self.env, nb_episodes=5, visualize=visualize,
                        nb_max_episode_steps=60)

    def replay_experiences(self):
        pass
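# compute_action above feeds the state straight into model.predict, which expects
# a batch axis plus the window_length axis declared by Flatten(input_shape=(1,) + ...).
# A sketch of the reshape hinted at by the commented-out line, assuming a 1-D
# observation vector (the helper name is hypothetical):
import numpy as np

def best_action(model, state):
    # Model input is (batch, window_length) + obs_shape, so a single
    # observation must be wrapped to shape (1, 1, obs_dim).
    batch = np.asarray(state, dtype=np.float32)[np.newaxis, np.newaxis, ...]
    return int(np.argmax(model.predict(batch)))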
class KerasRlCemAgent(KerasRlAgent):
    """Keras-rl implementation of the cross-entropy method algorithm.

    See "https://learning.mpi-sws.org/mlss2016/slides/2016-MLSS-RL.pdf" and
    "https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.81.6579&rep=rep1&type=pdf"
    """

    class CemCallback(rl.callbacks.Callback):
        """Callback registered with keras-rl agents to propagate iteration and episode updates."""

        def __init__(self, cem_agent: KerasRlAgent, cem_context: core.CemTrainContext,
                     nb_steps: int):
            """
            Args:
                cem_agent: the agent to propagate iteration begin/end events to.
                cem_context: the train_context containing the iteration definitions.
                nb_steps: value set in the keras cem agent.
            """
            assert cem_agent
            assert cem_context
            assert nb_steps
            self._cem_agent: KerasRlAgent = cem_agent
            self._cem_context: core.CemTrainContext = cem_context
            self._nb_steps = nb_steps
            super().__init__()

        def on_episode_end(self, episode, logs=None):
            """Signals the base class the end / begin of a training iteration."""
            cc: core.CemTrainContext = self._cem_context
            episode = episode + 1
            if episode % cc.num_episodes_per_iteration == 0:
                self._cem_agent.on_train_iteration_end(math.nan)
                if self._cem_context.training_done:
                    self._cem_agent._agent.step = self._nb_steps
                else:
                    self._cem_agent.on_train_iteration_begin()

    def train_implementation(self, train_context: core.CemTrainContext):
        assert train_context
        cc: core.CemTrainContext = train_context
        train_env = self._create_env()
        keras_model = self._create_model(gym_env=train_env, activation='softmax')

        policy_buffer_size = 5 * cc.num_episodes_per_iteration
        self.log_api(f'EpisodeParameterMemory', f'(limit={policy_buffer_size}, window_length=1)')
        memory = EpisodeParameterMemory(limit=policy_buffer_size, window_length=1)

        num_actions = train_env.action_space.n
        self.log_api(f'CEMAgent',
                     f'(model=..., nb_actions={num_actions}, memory=..., ' +
                     f'nb_steps_warmup={cc.num_steps_buffer_preload}, ' +
                     f'train_interval={cc.num_episodes_per_iteration}, ' +
                     f'batch_size={cc.num_episodes_per_iteration}, ' +
                     f'elite_frac={cc.elite_set_fraction})')
        self._agent = CEMAgent(model=keras_model,
                               nb_actions=num_actions,
                               memory=memory,
                               nb_steps_warmup=cc.num_steps_buffer_preload,
                               batch_size=cc.num_episodes_per_iteration,
                               train_interval=cc.num_episodes_per_iteration,
                               elite_frac=cc.elite_set_fraction)
        self.log_api(f'agent.compile', '()')
        self._agent.compile()

        nb_steps = cc.num_iterations * cc.num_episodes_per_iteration * cc.max_steps_per_episode
        callback = KerasRlCemAgent.CemCallback(self, cc, nb_steps)
        self.on_train_iteration_begin()
        self.log_api(f'agent.fit', f'(train_env, nb_steps={nb_steps})')
        self._agent.fit(train_env, nb_steps=nb_steps, visualize=False, verbose=0,
                        callbacks=[callback])
        if not cc.training_done:
            self.on_train_iteration_end(math.nan)
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Option 1: simple model
    # model = Sequential()
    # model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    # model.add(Dense(nb_actions))
    # model.add(Activation('softmax'))

    # Option 2: deep network
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('softmax'))
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in Keras
    # optimizer and even the metrics!
    memory = EpisodeParameterMemory(limit=1000, window_length=1)

    if REWARD == "normal":
        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
        cem.compile()
        history_normal = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        cem.save_weights(os.path.join(LOG_DIR, 'cem_normal_{}_params.h5f'.format(ENV_NAME)),
                         overwrite=True)
        cem.test(env, nb_episodes=5, visualize=False)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        if not SMOOTH:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False)
        else:
            processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)

        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05,
                       processor=processor_noisy)
        cem.compile()
        history_noisy = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        if not SMOOTH:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_smooth_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv"))
        cem.test(env, nb_episodes=5, visualize=False)

    elif REWARD == "surrogate":
        if not SMOOTH:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
        else:
            processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)

        cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
                       nb_steps_warmup=2000, train_interval=50, elite_frac=0.05,
                       processor=processor_surrogate)
        cem.compile()
        history_surrogate = cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
        if not SMOOTH:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_smooth_{}_params.h5f'.format(ENV_NAME)),
                             overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv"))
        cem.test(env, nb_episodes=5, visualize=False)

    else:
        raise NotImplementedError
model.add(Reshape(env.observation_space.shape))
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=env.observation_space.shape))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(Flatten())
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

memory = EpisodeParameterMemory(limit=10000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=1000, batch_size=50, train_interval=50, elite_frac=0.1)
cem.compile()
cem.fit(env, nb_steps=100000000, visualize=False)

# dqn.load_weights('dqn_test_run_weights.h5f')
cem.save_weights('cem_{}_weights.h5f'.format('test_run'), overwrite=True)
# dqn.test(env, nb_episodes=5, visualize=True)
# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

modelL = Sequential()
modelL.add(LSTM(nb_actions, input_shape=(1,) + env.observation_space.shape))
modelL.add(Activation('softmax'))

memoryL = EpisodeParameterMemory(limit=1000, window_length=1)
cemL = CEMAgent(model=modelL, nb_actions=nb_actions, memory=memoryL, batch_size=50,
                nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cemL.compile()
histL = cemL.fit(env, nb_steps=50000, visualize=False, verbose=1)
cemL.save_weights('cemL_{}_params.h5f'.format(ENV_NAME), overwrite=True)
cemL.test(env, nb_episodes=5, visualize=True)

modelD = Sequential()
modelD.add(Flatten(input_shape=(1,) + env.observation_space.shape))
modelD.add(Dense(nb_actions))
modelD.add(Activation('softmax'))

memoryD = EpisodeParameterMemory(limit=1000, window_length=1)
cemD = CEMAgent(model=modelD, nb_actions=nb_actions,
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Reshape([1, nb_actions]))
# model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=1000, window_length=1)

# TODO: write your own agent!
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=10000, visualize=False, verbose=2)

# After training is done, we save the best weights.
# cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
cem.test(env, nb_episodes=5, visualize=True)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape, name='Duda-flatten'))
model.add(Dense(nb_actions, name='Duda-dense'))
model.add(Activation('softmax', name='Duda-relu'))
print(model.summary())

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()
print('\033[93m' + "Model is compiled" + '\033[0m')

cem.fit(env, nb_steps=1000, visualize=True, verbose=2)
print('\033[93m' + "Training" + '\033[0m')
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
cem.test(env, nb_episodes=5, visualize=True)

# observations = np.array((4, 10))
observations = []
for _ in range(10):
    observations.append(deepcopy(env.reset()))
observations = np.asarray(observations)
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=50,
               nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)
cem.test(env, nb_episodes=5, visualize=True)
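# To reuse the weights saved above without retraining, rebuild the same model and
# agent, then reload the parameters (a sketch assuming the file written by the run
# above exists on disk):
cem.load_weights('cem_{}_params.h5f'.format(ENV_NAME))
cem.test(env, nb_episodes=5, visualize=True)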
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = EpisodeParameterMemory(limit=MEMORY_LIMIT, window_length=WINDOW_LENGHT)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, batch_size=BATCH_SIZE,
               nb_steps_warmup=NB_STEPS_WARMUP, train_interval=TRAIN_INTERVAL,
               elite_frac=ELITE_FRAC)
cem.compile()

# Okay, now it's time to learn something! We visualize the training here for show, but this
# slows down training quite a lot. You can always safely abort the training prematurely using
# Ctrl + C.
cem.fit(env, nb_steps=NB_STEPS, visualize=VISUALIZE_TRAIN, verbose=VERBOSE)

# After training is done, we save the best weights.
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for NB_EPISODES episodes.
cem.test(env, nb_episodes=NB_EPISODES, visualize=VISUALIZE_TEST)