def parse_policy(args) -> Policy:
    pol: Policy = EpsGreedyQPolicy()
    if args.policy == 'LinearAnnealedPolicy':
        pol = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                                   value_min=.1, value_test=0.05,
                                   nb_steps=args.zeta_nb_steps)
    if args.policy == 'SoftmaxPolicy':
        pol = SoftmaxPolicy()
    if args.policy == 'EpsGreedyQPolicy':
        pol = EpsGreedyQPolicy()
    if args.policy == 'GreedyQPolicy':
        pol = GreedyQPolicy()
    if args.policy == 'BoltzmannQPolicy':
        pol = BoltzmannQPolicy()
    if args.policy == 'MaxBoltzmannQPolicy':
        pol = MaxBoltzmannQPolicy()
    if args.policy == 'BoltzmannGumbelQPolicy':
        pol = BoltzmannGumbelQPolicy()
    if args.policy == 'ZetaPolicy':
        pol = ZetaPolicy(zeta_nb_steps=args.zeta_nb_steps, eps=args.eps)
    return pol
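# A minimal, hypothetical sketch of wiring parse_policy to an argument parser.
# The flag names and defaults below are assumptions for illustration only; the
# original project is expected to define its own CLI and import the policy classes.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--policy', default='EpsGreedyQPolicy')
parser.add_argument('--zeta_nb_steps', type=int, default=100000)
parser.add_argument('--eps', type=float, default=0.1)
args = parser.parse_args()

policy = parse_policy(args)  # e.g. --policy MaxBoltzmannQPolicy -> MaxBoltzmannQPolicy()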
def main(model_dir, visualize, params):
    print(params)

    env = gym.make('CartPole-v1')
    env._max_episode_steps = 2000
    env = trl.env.TimeExpanded(env, 3)

    np.random.seed(params.seed)
    env.seed(params.seed)

    agent = trl.prototype.DQN(model_fn, model_dir, params=params)
    agent.train(
        env,
        lambda: input_fn(env),
        max_steps=params.max_steps,
        policy=MaxBoltzmannQPolicy(eps=0.9),
        memory=SequentialMemory(
            limit=params.memory_limit,
            window_length=1,
        ),
        target_model_update=params.target_model_update,
        gamma=params.gamma,
        warmup_steps=params.warmup_steps,
        batch_size=params.batch_size,
        summary_steps=params.summary_steps,
        save_steps=params.save_steps,
        visualize=visualize,
        seed=params.seed,
    )
def _main():
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.WARNING,  # if args.verbose > 0 else logging.INFO,
        format="%(levelname)-4.4s [%(name)s:%(lineno)s] %(message)s",
    )

    nb_actions = FREnv.action_space.n
    nb_steps = 1_000_000

    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + FREnv.observation_space.shape))
    model.add(Dense(nb_actions * 8))
    model.add(Activation("relu"))
    # model.add(Dense(nb_actions * 4))
    # model.add(Activation('relu'))
    model.add(Dense(nb_actions * 2))
    model.add(Activation("relu"))
    model.add(Dense(nb_actions))
    model.add(Activation("linear"))
    print(model.summary())

    memory = SequentialMemory(limit=50000, window_length=1)
    policy = LinearAnnealedPolicy(
        inner_policy=MaxBoltzmannQPolicy(),
        attr="eps",
        value_max=1,
        value_min=0.05,
        value_test=0,
        nb_steps=nb_steps // 2,
    )  # BoltzmannQPolicy()
    agent = AvailableAgent(
        model=model,
        gamma=0.9999,
        nb_actions=nb_actions,
        memory=memory,
        nb_steps_warmup=50,
        target_model_update=1e-2,
        policy=policy,
        test_policy=policy,
    )
    agent.compile(Adam(lr=1e-3), metrics=["mae"])

    if os.path.isfile(WEIGHTS_FILE):
        print(f"loading pre-trained weights from {WEIGHTS_FILE}")
        agent.load_weights(WEIGHTS_FILE)

    env = FREnv(team=AgentTeam(agent=agent, colors="blue"))
    agent.fit(env, nb_steps=nb_steps, visualize=False, verbose=1)
    agent.save_weights(WEIGHTS_FILE, overwrite=True)
    agent.test(env, nb_episodes=1, visualize=True)
def create_dqn(model, nb_actions):
    """Creates and compiles a DQN agent with an Adam optimizer."""
    memory = SequentialMemory(limit=100000, window_length=1)
    policy = MaxBoltzmannQPolicy(tau=10, eps=0.2)
    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   target_model_update=1e-2,
                   policy=policy,
                   gamma=0.995,
                   batch_size=64)
    dqn.compile(Adam(lr=5e-4, decay=0.0), metrics=['mae'])
    return dqn
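# A minimal usage sketch for the create_dqn helper above, assuming a Gym
# environment and a small Keras MLP. The environment, layer sizes, and step
# counts are illustrative assumptions, not taken from the original project.
import gym
from keras.models import Sequential   # or tensorflow.keras, depending on the keras-rl build
from keras.layers import Dense, Flatten

env = gym.make('CartPole-v1')
nb_actions = env.action_space.n

model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),  # matches window_length=1
    Dense(64, activation='relu'),
    Dense(nb_actions, activation='linear'),
])

dqn = create_dqn(model, nb_actions)
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
dqn.test(env, nb_episodes=5, visualize=False)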
def build_agent(self, mem_file=None, w_file=None):
    # Create a dummy env to get the size of the input/output.
    # Makes it simpler if we ever choose to update env shapes.
    env = TradingEnv([], "", [])
    np.random.seed(314)
    env.seed(314)
    nb_actions = env.action_space.n
    obs_dim = env.observation_space.shape[0]

    model = Sequential()
    model.add(LSTM(5, input_shape=(7, 4), return_sequences=True))  # 4 features + 1 bias term. 5 neurons
    model.add(Activation('tanh'))
    model.add(LSTM(4))
    model.add(Activation('tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(4))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))  # Best activation for BoltzmannQPolicy

    # policy = EpsGreedyQPolicy(eps=EPS_VAL)  # Off-policy
    policy = BoltzmannQPolicy()       # Off-policy
    test_policy = MaxBoltzmannQPolicy()  # On-policy

    memory = None
    if mem_file is None:
        memory = SequentialMemory(limit=50000, window_length=7)  # returns observations of len (7,)
    else:
        (memory, memory.actions, memory.rewards, memory.terminals,
         memory.observations) = pickle.load(open(mem_file, "rb"))

    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   memory=memory,
                   gamma=GAMMA_VAL,
                   nb_steps_warmup=100,
                   policy=policy,
                   test_policy=test_policy)
    dqn.compile("adam", metrics=['mse'])

    if w_file is not None:
        model.load_weights(w_file)

    return dqn, env, memory
def create_dqn(model, history_length):
    memory = SequentialMemory(limit=500000, window_length=history_length)
    policy = MaxBoltzmannQPolicy()
    dqn = DQNAgent(
        model=model,
        nb_actions=model.output_shape[1],
        memory=memory,
        policy=policy,
        processor=CustomProcessor(),
        nb_steps_warmup=512,
        enable_dueling_network=True,
        dueling_type='avg',
        target_model_update=5e2,
        batch_size=32,
    )
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn
def create_dqn(model, log_interval=50000, model_name='dqn_agent_checkpoint',
               file_log_path='./logs/log.txt', tensorboard_path='./logs/tensorboard/'):
    model_path = './models/' + model_name + '.h5'
    file_logger = FileLogger(file_log_path, interval=log_interval)
    checkpoint = ModelIntervalCheckpoint(model_path, interval=log_interval)
    tensorboard = TensorboardLogger(tensorboard_path)
    callbacks = [file_logger, checkpoint, tensorboard]

    # Use the 4 last observations
    history_length = 4
    memory = SequentialMemory(limit=500000, window_length=history_length)

    # Combine BoltzmannQPolicy and EpsGreedyQPolicy
    policy = MaxBoltzmannQPolicy()
    # Start epsilon at 1.0 and decrease it every step so the agent stops taking
    # random actions once the map has been explored
    policy = LinearAnnealedPolicy(inner_policy=policy, attr='eps', value_max=1.0,
                                  value_min=0.1, value_test=0.04,
                                  nb_steps=NUMBER_OF_STEPS)

    # Create an instance of DQNAgent from keras-rl
    dqn = DQNAgent(model=model,
                   nb_actions=env.action_space.n,
                   memory=memory,
                   policy=policy,
                   processor=CustomProcessor(),
                   nb_steps_warmup=512,
                   enable_dueling_network=True,
                   dueling_type='avg',
                   target_model_update=5e2,
                   batch_size=32)
    dqn.compile(Adam(lr=5e-4), metrics=['mae'])
    return dqn, callbacks
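# A hypothetical training sketch for the (dqn, callbacks) pair returned above.
# `env`, `model`, and NUMBER_OF_STEPS are assumed to exist as in the snippet; the
# final weights path is an illustrative placeholder.
dqn, callbacks = create_dqn(model)
dqn.fit(env, nb_steps=NUMBER_OF_STEPS, callbacks=callbacks, log_interval=50000, verbose=2)
dqn.save_weights('./models/dqn_agent_final.h5', overwrite=True)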
map = Dense(map_size * 2)(map)
map = Activation('tanh')(map)
map = Dense(map_size)(map)
map = Activation('tanh')(map)

merged = Concatenate()([map, position])
merged = Dense(nb_neuron_input * 2, activation='tanh')(merged)
merged = Dense(nb_neuron_input, activation='tanh')(merged)
merged = Dense(nb_neuron_output, activation='softmax')(merged)

model = Model(inputs=[inputs], outputs=[merged])
model.summary()
model.compile(Adam(), loss='mean_squared_error')

memory = SequentialMemory(limit=50000, window_length=1)
policy = MaxBoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_neuron_output, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'accuracy'])

metrics = Metrics(dqn, env)

#fileName = '1D_advanced_Sequential1000_BoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_EpsGreedyQ_10000steps(7)'
#fileName = '1D_advanced_Sequential1000_MaxBoltzmannQ_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_BoltzmannQPolicy_10000steps(7)'
#fileName = '1D_advanced_Sequential50000_MaxBoltzmannQ_1000000steps(0)'
fileName = '1D__Sequential50000_BoltzmannQ_1000000steps(0)'
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))
print(model.summary())

memory = SequentialMemory(limit=100000, window_length=1)

if args.mode == 'train':
    policy = LinearAnnealedPolicy(EpsStochasticPolicy(), attr='eps', value_max=1.,
                                  value_min=.25, value_test=.05, nb_steps=20000)
else:
    policy = EpsStochasticPolicy(eps=.25)
test_policy = MaxBoltzmannQPolicy(eps=0.1)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=200,
#                enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=policy)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
               enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2,
               policy=policy, test_policy=test_policy)
dqn.compile(Adam(lr=0.0003), metrics=['mae'])

if args.mode == 'resume':
def create_model(self):
    # Simple model where only one layer feeds into the next
    self._model = Sequential()

    # Get initializer for hidden layers
    init = tf.keras.initializers.RandomNormal(mean=.1, stddev=.02)

    # Input layer; this shape is one that just works
    self._model.add(
        Dense(512,
              input_shape=(1, 7814),
              activation="relu",
              use_bias=False,
              kernel_initializer=init,
              name='first_hidden'))

    # Hidden layers
    # Flattening resolves potential issues that would arise otherwise
    self._model.add(Flatten(name='flatten'))
    self._model.add(
        Dense(256,
              activation="relu",
              use_bias=False,
              kernel_initializer=init,
              name='second_hidden'))

    # Output layer
    self._model.add(
        Dense(len(self._ACTION_SPACE),
              use_bias=False,
              kernel_initializer=init,
              name='final'))
    # Increases speed: https://www.dlology.com/blog/one-simple-trick-to-train-keras-model-faster-with-batch-normalization/
    self._model.add(BatchNormalization())
    # Same as passing activation in the Dense layer, but lets us access the last layer:
    # https://stackoverflow.com/questions/40866124/difference-between-dense-and-activation-layer-in-keras
    self._model.add(Activation("linear"))

    # This is how many battles we'll remember before we start forgetting old ones
    self._memory = SequentialMemory(limit=max(num_battles, 10000), window_length=1)

    # Epsilon-annealed max-Boltzmann policy
    # This takes the output of our neural net and converts it to an action
    # Softmax is another probabilistic option: https://github.com/keras-rl/keras-rl/blob/master/rl/policy.py#L120
    self._policy = LinearAnnealedPolicy(
        MaxBoltzmannQPolicy(),
        attr="eps",
        value_max=1.0,
        value_min=0.05,
        value_test=0,
        nb_steps=num_battles,
    )

    # Defining our DQN
    self._dqn = DQNAgent(
        model=self._model,
        nb_actions=len(action_space),
        policy=self._policy,
        memory=self._memory,
        # The number of battles we go through before we start training:
        # https://hub.packtpub.com/build-reinforcement-learning-agent-in-keras-tutorial/
        nb_steps_warmup=max(1000, int(num_battles / 10)),
        # This is the discount factor for the value we learn - we care a lot about future rewards
        gamma=0.8,
        # This controls how much/when our target model updates: https://github.com/keras-rl/keras-rl/issues/55
        target_model_update=.01,
        # Helps define the Huber loss - clips values to -1 < x < 1:
        # https://srome.github.io/A-Tour-Of-Gotchas-When-Implementing-Deep-Q-Networks-With-Keras-And-OpenAi-Gym/
        delta_clip=1,
        enable_double_dqn=True,
    )
    self._dqn.compile(Adam(lr=0.01), metrics=["mae"])
model.add(Dense(128, activation="elu"))
model.add(Dropout(DROP))
model.add(Dense(nb_actions, activation="linear"))
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection on top of a Boltzmann policy
# (MaxBoltzmannQPolicy), which means that an exploratory action is selected with probability eps.
# We anneal eps from 1.0 to 0.05 over the course of 1M steps. This is done so that the agent
# initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value (0.001) that is used during testing, so that the
# agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(MaxBoltzmannQPolicy(), attr='eps', value_max=1.,
                              value_min=.05, value_test=.001, nb_steps=1000000)

# The trade-off between exploration and exploitation is difficult and an on-going research topic.
# If you want, you can experiment with the parameters or use a different policy. Another popular
# one is Boltzmann-style exploration:
# policy = BoltzmannQPolicy(tau=1.)
# Feel free to give it a try!
# policy = LinearAnnealedPolicy(BoltzmannQPolicy(), attr='tau', value_max=10., value_min=.1, value_test=.05, nb_steps=1000000)

dqn = DQNAgent(model=model, nb_actions=nb_actions,