def test_single_dqn_input():
    """Smoke-test ``DQNAgent`` on a single-input model, with and without double DQN.

    One model and one replay memory are built up front and shared by both
    agent variants; each variant is compiled with SGD and fitted for 10 steps.
    """
    # Window of 2 observations of shape (3,), flattened into a 2-unit output.
    model = Sequential()
    for layer in (Flatten(input_shape=(2, 3)), Dense(2)):
        model.add(layer)

    replay = SequentialMemory(limit=10, window_length=2)

    for use_double in (True, False):
        agent = DQNAgent(
            model,
            memory=replay,
            nb_actions=2,
            nb_steps_warmup=5,
            batch_size=4,
            enable_double_dqn=use_double,
        )
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv((3,)), nb_steps=10)
class DQN(BaseAgent):
    """Wrap a ``DQNAgent`` whose hyper-parameters are read from the global ``opt``."""

    def __init__(self, model, processor, policy, test_policy, num_actions):
        """Create the replay memory, build the wrapped agent and compile it.

        Args:
            model: Network handed to ``DQNAgent`` as ``model``.
            processor: Forwarded as the agent's ``processor``.
            policy: Forwarded as the agent's training ``policy``.
            test_policy: Forwarded as the agent's ``test_policy``.
            num_actions: Forwarded as ``nb_actions``.
        """
        # Replay memory sized and windowed from the global options.
        replay_memory = SequentialMemory(
            limit=opt.dqn_replay_memory_size,
            window_length=opt.dqn_window_length,
        )
        agent_settings = dict(
            model=model,
            nb_actions=num_actions,
            policy=policy,
            test_policy=test_policy,
            memory=replay_memory,
            processor=processor,
            batch_size=opt.dqn_batch_size,
            nb_steps_warmup=opt.dqn_nb_steps_warmup,
            gamma=opt.dqn_gamma,
            target_model_update=opt.dqn_target_model_update,
            enable_double_dqn=opt.enable_double_dqn,
            enable_dueling_network=opt.enable_dueling_network,
            train_interval=opt.dqn_train_interval,
            delta_clip=opt.dqn_delta_clip,
        )
        self.agent = DQNAgent(**agent_settings)

        adam = keras.optimizers.Adam(lr=opt.dqn_learning_rate)
        self.agent.compile(optimizer=adam, metrics=['mae'])

    def fit(self, env, num_steps, weights_path=None, visualize=False):
        """Train the wrapped agent on ``env`` for ``num_steps`` steps.

        Args:
            env: Environment passed to the agent's ``fit``.
            num_steps: Forwarded as ``nb_steps``.
            weights_path: When not ``None``, a ``ModelIntervalCheckpoint``
                writing to this path every 50000 steps is installed.
            visualize: Used for both ``visualize`` and ``test_visualize``.
        """
        if weights_path is None:
            checkpointing = []
        else:
            checkpointing = [
                ModelIntervalCheckpoint(weights_path, interval=50000, verbose=1)
            ]

        self.agent.fit(
            env=env,
            nb_steps=num_steps,
            action_repetition=opt.dqn_action_repetition,
            callbacks=checkpointing,
            log_interval=opt.log_interval,
            test_interval=opt.test_interval,
            test_nb_episodes=opt.test_nb_episodes,
            test_action_repetition=opt.dqn_action_repetition,
            visualize=visualize,
            test_visualize=visualize,
            verbose=1,
        )

    def test(self, env, num_episodes, visualize=False):
        """Run the wrapped agent's ``test`` on ``env`` for ``num_episodes`` episodes."""
        self.agent.test(
            env=env,
            nb_episodes=num_episodes,
            action_repetition=opt.dqn_action_repetition,
            verbose=2,
            visualize=visualize,
        )

    def save(self, out_dir):
        """Save the agent's weights to ``out_dir``, overwriting any existing file."""
        self.agent.save_weights(out_dir, overwrite=True)

    def load(self, out_dir):
        """Load the agent's weights from ``out_dir``."""
        self.agent.load_weights(out_dir)
def test_multi_dqn_input():
    """Smoke-test ``DQNAgent`` on a two-input functional model, with and without double DQN.

    The model, replay memory and ``MultiInputProcessor`` are created once and
    shared by both agent variants; each variant is fitted for 10 steps.
    """
    # Two input branches of shapes (2, 3) and (2, 4), concatenated and flattened.
    first_in = Input(shape=(2, 3))
    second_in = Input(shape=(2, 4))
    merged = Concatenate()([first_in, second_in])
    flat = Flatten()(merged)
    q_out = Dense(2)(flat)
    model = Model(inputs=[first_in, second_in], outputs=q_out)

    replay = SequentialMemory(limit=10, window_length=2)
    splitter = MultiInputProcessor(nb_inputs=2)

    for use_double in (True, False):
        agent = DQNAgent(
            model,
            memory=replay,
            nb_actions=2,
            nb_steps_warmup=5,
            batch_size=4,
            processor=splitter,
            enable_double_dqn=use_double,
        )
        agent.compile('sgd')
        agent.fit(MultiInputTestEnv([(3,), (4,)]), nb_steps=10)
def main():
    """Train a DQN agent on ``PentagoEnv`` and save its final weights.

    Seeds NumPy and the environment with 123, builds a small dense Q-network,
    trains for 50000 steps with visualization enabled, and writes the weights
    to ``dqn_<ENV_NAME>_weights.h5f`` (overwriting an existing file).
    """
    np.random.seed(123)
    env = PentagoEnv(SIZE)
    env.seed(123)
    nb_actions = env.action_space.n

    # The leading 1 in the input shape matches window_length=1 of the memory below.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(nb_actions))
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=5000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=1000, target_model_update=1e-2, policy=policy)
    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    dqn.compile(optimizer)

    # Okay, now it's time to learn something! We visualize the training here
    # for show, but this slows down training quite a lot.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
def main():
    """Train a DQN agent on ``PentagoEnv`` and save timestamped weights.

    Seeds NumPy and the environment with ``SEED``, builds a small dense
    Q-network, trains for 50000 steps with visualization enabled, and writes
    the weights under ``weights/`` using ``TAG`` and the current time in the
    file name.
    """
    # Local import: only this function needs it for the output directory.
    import os

    # Create env
    np.random.seed(SEED)
    env = PentagoEnv(SIZE, agent_starts=AGENT_STARTS)
    env.seed(SEED)
    nb_actions = env.action_space.n

    # Define model. The leading 1 in the input shape matches window_length=1.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(nb_actions))
    # summary() prints the table itself and returns None; print() around it
    # only emitted an extra "None" line.
    model.summary()

    # Configure and compile agent
    memory = SequentialMemory(limit=5000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=1000, target_model_update=1000, policy=policy)
    optimizer = RMSprop(lr=0.00025, epsilon=0.01)
    dqn.compile(optimizer)

    # Okay, now it's time to learn something! We visualize the training here
    # for show, but this slows down training quite a lot.
    dqn.fit(env, nb_steps=50000, visualize=True, verbose=1)

    # After training is done, we save the final weights. Make sure the target
    # directory exists so a long training run is not lost at the very end.
    if not os.path.isdir('weights'):
        os.makedirs('weights')
    # str(datetime.now()) contains spaces and colons, which are awkward in
    # file names and invalid on Windows; use a compact, filesystem-safe stamp.
    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    dqn.save_weights('weights/dqn-{}-weights-{}.h5f'.format(TAG, timestamp))
def __init__(self, model, processor, policy, test_policy, num_actions):
    """Create the replay memory, build the wrapped ``DQNAgent`` and compile it.

    All hyper-parameters other than the arguments are read from the global
    ``opt`` object.

    Args:
        model: Network handed to ``DQNAgent`` as ``model``.
        processor: Forwarded as the agent's ``processor``.
        policy: Forwarded as the agent's training ``policy``.
        test_policy: Forwarded as the agent's ``test_policy``.
        num_actions: Forwarded as ``nb_actions``.
    """
    # Replay memory sized and windowed from the global options.
    replay_memory = SequentialMemory(
        limit=opt.dqn_replay_memory_size,
        window_length=opt.dqn_window_length,
    )
    agent_settings = dict(
        model=model,
        nb_actions=num_actions,
        policy=policy,
        test_policy=test_policy,
        memory=replay_memory,
        processor=processor,
        batch_size=opt.dqn_batch_size,
        nb_steps_warmup=opt.dqn_nb_steps_warmup,
        gamma=opt.dqn_gamma,
        target_model_update=opt.dqn_target_model_update,
        enable_double_dqn=opt.enable_double_dqn,
        enable_dueling_network=opt.enable_dueling_network,
        train_interval=opt.dqn_train_interval,
        delta_clip=opt.dqn_delta_clip,
    )
    self.agent = DQNAgent(**agent_settings)

    adam = keras.optimizers.Adam(lr=opt.dqn_learning_rate)
    self.agent.compile(optimizer=adam, metrics=['mae'])
def train_dqn_model(layers, rounds=10000, run_test=False, use_score=False):
    """Train a dueling double-DQN agent on a malware gym environment.

    Args:
        layers: Passed to ``generate_dense_model`` to describe the network.
        rounds: Number of training steps handed to ``agent.fit`` as ``nb_steps``.
        run_test: When true, also run 100 episodes on the matching test environment.
        use_score: Selects the ``malware-score-*`` environment ids instead of
            the plain ``malware-*`` ones.

    Returns:
        Tuple ``(agent, model, history_train, history_test)``. The histories
        are the ``history`` attributes of the train/test environments;
        ``history_test`` is ``None`` unless ``run_test`` is true.
    """
    if use_score:
        train_env_id = 'malware-score-v0'
    else:
        train_env_id = 'malware-v0'
    env = gym.make(train_env_id)
    env.seed(123)
    nb_actions = env.action_space.n
    # "Experience" consists of where we were and where we are now.
    window_length = 1

    # Generate a policy model.
    model_input_shape = (window_length,) + env.observation_space.shape
    model = generate_dense_model(model_input_shape, layers, nb_actions)

    # BoltzmannQPolicy selects an action stochastically with a probability
    # generated by soft-maxing Q values.
    policy = BoltzmannQPolicy()

    # Memory can help a model during training. Only a single malware sample
    # (window_length=1) is considered for each "experience".
    memory = SequentialMemory(
        limit=32,
        ignore_episode_boundaries=False,
        window_length=window_length,
    )

    # DQN agent; the original author cites
    # http://arxiv.org/pdf/1312.5602.pdf and http://arxiv.org/abs/1509.06461
    agent_settings = dict(
        model=model,
        nb_actions=nb_actions,
        memory=memory,
        nb_steps_warmup=16,
        enable_double_dqn=True,
        enable_dueling_network=True,
        dueling_type='avg',
        target_model_update=1e-2,
        policy=policy,
        batch_size=16,
    )
    agent = DQNAgent(**agent_settings)

    # Any built-in Keras optimizer can be used here.
    agent.compile(RMSprop(lr=1e-3), metrics=['mae'])

    # Play the game. Learn something!
    agent.fit(env, nb_steps=rounds, visualize=False, verbose=2)
    history_train = env.history

    if not run_test:
        return agent, model, history_train, None

    # Set up the testing environment.
    if use_score:
        test_env_id = 'malware-score-test-v0'
    else:
        test_env_id = 'malware-test-v0'
    test_env = gym.make(test_env_id)

    # Evaluate the agent on a few episodes, drawing randomly from the test samples.
    agent.test(test_env, nb_episodes=100, visualize=False)
    history_test = test_env.history

    return agent, model, history_train, history_test
# Output layer: one linear unit per action.
for output_layer in (Dense(nb_actions), Activation('linear')):
    model.add(output_layer)
print(model.summary())

# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!
memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
# Alternative training policy left by the author: BoltzmannQPolicy(tau=0.05)
train_policy = EpsGreedyQPolicy()
test_policy = GreedyQPolicy()

# Settings shared by the plain and the dueling agent.
agent_settings = dict(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=100,
    target_model_update=1e-2,
    policy=train_policy,
    test_policy=test_policy,
)
if DUEL_DQN:
    # The dueling variant only adds two options and a different file prefix.
    agent_settings.update(enable_dueling_network=True, dueling_type='avg')
    filename_template = 'weights/duel_dqn_{}_weights_{}_{}_{}_{}.h5f'
else:
    filename_template = 'weights/dqn_{}_weights_{}_{}_{}_{}.h5f'

dqn = DQNAgent(**agent_settings)
filename = filename_template.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS,
                                    NUM_STEPS, TRIAL_ID)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Optionally, we can reload a previous model's weights and continue training from there
# FILENAME = 'weights/duel_dqn_variable_pendulum-v0_weights_4096_4_50000_2017-07-11_140316.h5f'
# Load the model weights
class DQNSecretary:
    """DQN agent for the ``Secretary`` environment with training, testing and weight persistence."""

    # Where trained weights are stored.
    weightdir = './data'
    weightfile = './data/dqn_{}_weights.h5'

    def __init__(self, n=100, recycle=True):
        """Create the environment, model and agent; optionally reload saved weights.

        Args:
            n: Forwarded to ``Secretary(n=n)``.
            recycle: When true and the weight file exists, load it and mark
                the model as already trained.

        Raises:
            Exception: Whatever ``load_weights`` raises is reported and re-raised.
        """
        print('モデルを作成します。')
        self.train_interval_logger = None

        # Get the environment and extract the number of actions.
        self.env = Secretary(n=n)
        self.env_name = 'secretary'
        self.weightfile = self.__class__.weightfile.format(self.env_name)
        self.nb_actions = self.env.action_space.n

        # Next, we build a very simple model: three hidden layers of 256 ReLU
        # units followed by one linear output per action. The leading 1 in the
        # input shape matches window_length=1 of the memory below.
        self.model = Sequential()
        self.model.add(
            Flatten(input_shape=(1, ) + self.env.observation_space.shape))
        for _ in range(3):
            self.model.add(Dense(256))
            self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

        # Finally, we configure and compile our agent.
        # You can use every built-in Keras optimizer and even the metrics!
        memory = SequentialMemory(limit=50000, window_length=1)
        policy = BoltzmannQPolicy(tau=1.)
        self.dqn = DQNAgent(model=self.model,
                            nb_actions=self.nb_actions,
                            memory=memory,
                            nb_steps_warmup=1000,
                            target_model_update=1e-2,
                            policy=policy)
        self.dqn.compile(Adam(lr=1e-3), metrics=[])

        self.__istrained = False
        print('モデルを作成しました。')

        if not recycle:
            return
        if not exists(self.weightfile):
            print('訓練済み重みが存在しません。訓練を行ってください。')
            return

        try:
            print('訓練済み重みを読み込みます。')
            self.dqn.load_weights(self.weightfile)
        except Exception:
            print('訓練済み重みの読み込み中にエラーが発生しました。')
            print('Unexpected error:', exc_info()[0])
            raise
        self.__istrained = True
        print('訓練済み重みを読み込みました。')

    def train(self,
              nb_steps=30000,
              verbose=1,
              visualize=False,
              log_interval=3000):
        """Train the agent, plot the logged progress and save the weights.

        Args:
            nb_steps: Number of training steps passed to ``fit``.
            verbose: ``1`` installs a ``TrainIntervalLogger2`` (whose records
                are plotted afterwards); values above 1 install a
                ``TrainEpisodeLogger``. In both cases ``fit`` itself is called
                with ``verbose=0``.
            visualize: Forwarded to ``fit``.
            log_interval: Forwarded to ``fit`` and to the interval logger.

        Returns:
            The value returned by the agent's ``fit``.

        Raises:
            RuntimeError: If the model is already marked as trained.
        """
        if self.__istrained:
            raise RuntimeError('このモデルは既に訓練済みです。')
        print('訓練を行うので、お待ちください。')

        # Run the training.
        # We visualize the training here for show, but this slows down
        # training quite a lot.
        callbacks = []
        if verbose == 1:
            self.train_interval_logger = TrainIntervalLogger2(
                interval=log_interval)
            callbacks.append(self.train_interval_logger)
            verbose = 0
        elif verbose > 1:
            callbacks.append(TrainEpisodeLogger())
            verbose = 0

        hist = self.dqn.fit(self.env,
                            nb_steps=nb_steps,
                            callbacks=callbacks,
                            verbose=verbose,
                            visualize=visualize,
                            log_interval=log_interval)
        self.__istrained = True

        if self.train_interval_logger is not None:
            self._plot_training_records()

        self._save_weights()
        return hist

    def _plot_training_records(self):
        """Plot episode reward and mean Q value per logged interval."""
        records = self.train_interval_logger.records
        interval = records['interval']
        episode_reward = records['episode_reward']
        mean_q = records['mean_q']

        # Nothing was logged (e.g. training shorter than one log interval):
        # min()/max() below would fail on empty data, so skip the figure.
        if len(interval) == 0:
            return

        # Left-pad mean_q with zeros so it lines up with the intervals.
        # NOTE(review): presumably the first intervals have no Q values yet
        # (warm-up) — confirm against TrainIntervalLogger2.
        if len(interval) > len(mean_q):
            mean_q = np.pad(mean_q, [len(interval) - len(mean_q), 0],
                            "constant")

        plt.figure()
        plt.plot(interval, episode_reward, marker='.', label='報酬')
        plt.plot(interval, mean_q, marker='.', label='Q値')
        plt.legend(loc='best', fontsize=10)
        plt.grid()
        plt.xlabel('interval')
        plt.ylabel('score')
        plt.title('訓練状況')
        # Aim for about 7 ticks; clamp the step to at least 1 because a span
        # narrower than 7 would give a zero step and make np.arange raise.
        tick_step = max(1, (max(interval) - min(interval)) // 7)
        plt.xticks(np.arange(min(interval), max(interval) + 1, tick_step))
        plt.show()

    def _save_weights(self):
        """Create the weight directory if needed and write the final weights."""
        if not exists(self.__class__.weightdir):
            try:
                mkdir(self.__class__.weightdir)
            except Exception:
                print('重み保存フォルダの作成中にエラーが発生しました。')
                print('Unexpected error:', exc_info()[0])
                raise
        try:
            # After training is done, we save the final weights.
            self.dqn.save_weights(self.weightfile, overwrite=True)
        except Exception:
            print('重みの保存中にエラーが発生しました。')
            print('Unexpected error:', exc_info()[0])
            raise

    def test(self, nb_episodes=10, visualize=True, verbose=1):
        """Evaluate the agent for ``nb_episodes`` episodes and return the result of ``dqn.test``."""
        hist = self.dqn.test(self.env,
                             nb_episodes=nb_episodes,
                             verbose=verbose,
                             visualize=visualize)
        return hist
attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=.00025), metrics=['mae']) if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in Keras callbacks! save_dir = "./saved_model/" + args.rl_agent if not os.path.exists(save_dir): os.makedirs(save_dir) weights_filename = 'dqn_{}_weights.h5f'.format(env_name_ram) checkpoint_weights_filename = 'dqn_' + env_name_ram + '_weights_{step}.h5f'
# Build the environment and seed both NumPy and the environment once.
# (The original repeated this seeding pair twice with identical values.)
env = MastermindEnv()
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Single hidden layer of 16 ReLU units and one linear output per action.
# The leading 1 in the input shape matches window_length=1 of the memory.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None; print() around it only
# added a stray "None" line.
model.summary()

policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model,
               nb_actions=nb_actions,
               memory=memory,
               nb_steps_warmup=10,
               target_model_update=1e-2,
               policy=policy)
# NOTE(original author): an error was reported around this point when
# importing the module in Colab and in a local Jupyter; the cause is not
# visible from this code.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)
attr="eps", value_max=1.0, value_min=0.05, value_test=0, nb_steps=10000, ) #loaded_model = tf.keras.models.load_model('model_20000') #model.load_weights('weights_DQN.h5') # Defining our DQN dqn = DQNAgent( model=model, nb_actions=len(env_player.action_space), policy=policy, memory=memory, nb_steps_warmup=1000, gamma=0.5, target_model_update=1, delta_clip=0.01, enable_double_dqn=True, ) dqn.compile(Adam(lr=0.00025), metrics=["mae"]) # Training env_player.play_against( env_algorithm=dqn_training, opponent=opponent, env_algorithm_kwargs={ "dqn": dqn, "nb_steps": NB_TRAINING_STEPS
value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: # policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(learning_rate=.00025), metrics=['mae']) if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that now you can use the built-in tensorflow.keras callbacks! weights_filename = f'dqn_{args.env_name}_weights.h5f' checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = f'dqn_{args.env_name}_log.json' callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000) ]
nb_actions = env.action_space.n
input_shape = (WINDOW_LENGTH, ) + INPUT_SHAPE

# Convolutional Q-network. Permute moves the window axis (axis 1 of the
# input) to the last position before the convolutions.
# NOTE(review): the output layer uses 'relu' rather than a linear activation;
# left untouched because saved weights are loaded into this model below.
model = Sequential()
for layer in (
        Permute((2, 3, 1), input_shape=input_shape),
        Convolution2D(32, (8, 8), strides=(4, 4), activation='relu'),
        Convolution2D(64, (4, 4), strides=(2, 2), activation='relu'),
        Convolution2D(64, (3, 3), strides=(1, 1), activation='relu'),
        Flatten(),
        Dense(512, activation='relu'),
        Dense(nb_actions, activation='relu'),
):
    model.add(layer)

# Disabled alternative kept from the original (it was parked in a string literal):
#   policy = LinearAnnealedPolicy(GreedyQPolicy(), attr='eps', value_max=1.,
#                                 value_min=.1, value_test=.05, nb_steps=10000)

memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=GreedyQPolicy(),
    memory=memory,
    processor=processor,
    nb_steps_warmup=500,
    gamma=.99,
    target_model_update=1e-2,
    train_interval=4,
    delta_clip=1.,
)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Evaluation only: restore the saved weights and play 10 visualized episodes.
dqn.load_weights(weights_filename)
dqn.test(env, nb_episodes=10, visualize=True)
import gym_ctc_marketmaker #env = gym.make("ctc-marketmaker-v0") env.setOrderbook(orderbook) #model = loadModel(name='model-sell-artificial-2') model = loadModel(name='model-sell-artificial-sine') model = createModel() nrTrain = 100000 nrTest = 10 policy = EpsGreedyQPolicy() memory = SequentialMemory(limit=5000, window_length=1) # nb_steps_warmup: the default value for that in the DQN OpenAI baselines implementation is 1000 dqn = DQNAgent(model=model, nb_actions=len(env.levels), memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # cbs_train = [] # cbs_train = [LivePlotCallback(nb_episodes=20000, avgwindow=20)] # dqn.fit(env, nb_steps=nrTrain, visualize=True, verbose=2, callbacks=cbs_train) # saveModel(model=model, name='model-sell-artificial-sine') cbs_train = [] cbs_test = [] cbs_test = [ActionPlotCallback(nb_episodes=nrTest)] dqn.test(env, nb_episodes=nrTest, visualize=True,
buttons = NoisyNetDense(nb_actions, activation='linear')(dense) model = Model(inputs=frame,outputs=buttons) print(model.summary()) memory = PrioritizedMemory(limit=1000000, alpha=.6, start_beta=.4, end_beta=1., steps_annealed=30000000, window_length=WINDOW_LENGTH) processor = AtariProcessor() #This is the important difference. Rather than using an E Greedy approach, where #we keep the network consistent but randomize the way we interpret its predictions, #in NoisyNet we are adding noise to the network and simply choosing the best value. policy = GreedyQPolicy() #N-step loss with n of 3 dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, enable_double_dqn=True, enable_dueling_network=True, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1., n_step=3) #Prioritized Memories typically use lower learning rates dqn.compile(Adam(lr=.00025/4), metrics=['mae']) folder_path = '../model_saves/NoisyNstepPDD/' if args.mode == 'train': dqn.load_weights(folder_path + 'noisynet_pdd_dqn_MsPacmanDeterministic-v4_weights_10000000.h5f') weights_filename = folder_path + 'final_noisynet_nstep_pdd_dqn_{}_weights.h5f'.format(args.env_name) checkpoint_weights_filename = folder_path + 'final_noisynet_nstep_pdd_dqn_' + args.env_name + '_weights_{step}.h5f' log_filename = folder_path + 'final_noisynet_nstep_pdd_dqn_' + args.env_name + '_REWARD_DATA.txt' callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=5000000)] callbacks += [TrainEpisodeLogger(log_filename)] dqn.fit(env, callbacks=callbacks, nb_steps=30000000, verbose=0, nb_max_episode_steps=20000)
# Remaining layers: two hidden blocks of 16 ReLU units, then one linear
# output per action.
for layer in (
        Dense(16), Activation('relu'),
        Dense(16), Activation('relu'),
        Dense(nb_actions), Activation('linear'),
):
    model.add(layer)

# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
    enable_dueling_network=True,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Run 5 visualized episodes before any weights have been loaded in this excerpt.
dqn.test(env, nb_episodes=5, visualize=True)

# Three-second countdown.
for i in reversed(range(1, 4)):
    print('Sentient agent in', i)
    sleep(1)

# Prefer the default weights file; fall back to the dated snapshot if it is missing.
fname = f'dqn_{ENV_NAME}_weights.h5f'.lower()
if os.path.isfile(fname) is False:
    fname = f'dqn_{ENV_NAME}_weights_20190321.h5f'.lower()
def main():
    """Train a DQN agent on ``environment_name``, save its weights and evaluate it.

    Training runs for 500000 steps with periodic weight checkpoints,
    a ``TensorboardCallback`` and a JSON ``FileLogger``. Rendering during
    training is enabled when the literal argument ``visualize`` is present in
    ``sys.argv``. Afterwards the final weights are saved and the agent is
    tested for 10 episodes.
    """
    # Get the environment and extract the number of actions.
    print("Using environment", environment_name)
    environment = gym.make(environment_name)
    environment = DiscreteWrapper(environment)
    np.random.seed(666)
    nb_actions = environment.action_space.n

    # Build the model.
    model = build_model((WINDOW_LENGTH, ) + INPUT_SHAPE, nb_actions)
    # summary() prints the table itself and returns None; print() around it
    # only added a stray "None" line.
    model.summary()

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
    processor = DuckieTownProcessor()

    # Select a policy. We use eps-greedy action selection, which means that a
    # random action is selected with probability eps. We anneal eps from 1.0
    # to 0.1 over the course of 400k steps (the original example used 1M), so
    # the agent initially explores the environment (high eps) and then
    # gradually sticks to what it knows (low eps). We also set a dedicated eps
    # value that is used during testing; it is 0.05 so that the agent still
    # performs some random actions and cannot get stuck.
    policy = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps',
        value_max=1.,
        value_min=.1,
        value_test=.05,
        nb_steps=400000)

    # The trade-off between exploration and exploitation is difficult and an
    # on-going research topic. Another popular choice is Boltzmann-style
    # exploration, e.g. BoltzmannQPolicy(tau=1.).
    dqn = DQNAgent(model=model,
                   nb_actions=nb_actions,
                   policy=policy,
                   memory=memory,
                   processor=processor,
                   nb_steps_warmup=50000,
                   gamma=.99,
                   target_model_update=10000,
                   train_interval=4,
                   delta_clip=1.)
    dqn.compile(optimizers.Adam(lr=.00025), metrics=['mae'])

    weights_filename = 'dqn_{}_weights.h5f'.format(environment_name)

    # Okay, now it's time to learn something! Notice that the built-in Keras
    # callbacks can be used here.
    # NOTE(review): the original comment says the interrupt exception is
    # captured so training can be aborted early; that handling is not visible
    # in this function.
    checkpoint_weights_filename = 'dqn_' + environment_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(environment_name)
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000),
        TensorboardCallback(),
        FileLogger(log_filename, interval=100),
    ]
    dqn.fit(
        environment,
        callbacks=callbacks,
        # The original example trained for 1750000 steps.
        nb_steps=500000,
        log_interval=10000,
        visualize="visualize" in sys.argv)

    # After training is done, we save the final weights one more time.
    dqn.save_weights(weights_filename, overwrite=True)

    # Finally, evaluate our algorithm for 10 episodes.
    dqn.test(environment, nb_episodes=10, visualize=False)
model.add(Dense(100, activation='relu')) model.add(Dense(100, activation='relu')) model.add(Dense(100, activation='relu')) model.add(Dense(nb_actions, activation='linear')) print("Model Summary: ", model.summary()) memory = SequentialMemory(limit=100000, window_length=1) #100000 # Boltzmann Q Policy policy = EpsGreedyQPolicy() # DQN Agent: Finally, we configure and compile our agent. You can use every built-in Keras optimizer and even the metrics! dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1e4, target_model_update=1e3, policy=policy, enable_double_dqn=True, batch_size=64) # nb_steps_warmup >= nb_steps 2000 # DQN stores the experience in the memory buffer for the first nb_steps_warmup. This is done to get the required size of batch during experience replay. # When number of steps exceeds nb_steps_warmup then the neural network would learn and update the weight. # Neural Compilation dqn.compile(Adam(lr=1e-4), metrics=['mae']) callbacks = [ModelCheckpoint('dqn_ea_weights.h5f', 1620)] # Fit the model: training for nb_steps = number of generations dqn.fit(env, callbacks=callbacks, nb_steps=162e30,
# Next, we build a simple neural network model with a single hidden layer.
# The leading 1 in the input shape matches window_length=1 of the memory.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table itself and returns None; print() around it only
# added a stray "None" line.
model.summary()

# Next, configure and compile our agent. The policy is epsilon-greedy, and the
# memory is sequential because we want to store the result of each action
# taken together with the reward it obtained.
# The progress messages use print() calls, which behave the same on Python 2
# for a single string argument and are required on Python 3.
print('preparing')
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

print('fitting')
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

# Now test the reinforcement learning model.
print('dtesting')
dqn.test(env, nb_episodes=50, visualize=True)
print('done')
# Three hidden layers of 16 ReLU units and one linear output per action.
# The leading 1 in the input shape matches window_length=1 of the memory.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for _ in range(3):
    model.add(Dense(16))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

# Finally, we configure and compile our agent. You can use every built-in
# Keras optimizer and even the metrics!
memory = SequentialMemory(limit=300000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load previously trained weights (no training happens in this excerpt).
dqn.load_weights('dqn_{}_weights_model4.h5f'.format(ENV_NAME))

# Redirect stdout to capture the test results in `mystdout`.
old_stdout = sys.stdout
sys.stdout = mystdout = io.StringIO()
try:
    # Evaluate our algorithm for a few episodes.
    dqn.test(env, nb_episodes=200, visualize=False)
finally:
    # Always restore stdout, even if the evaluation raises; otherwise every
    # later print (including the traceback context) would vanish into the buffer.
    sys.stdout = old_stdout
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=NUM_STEPS, window_length=1) # train_policy = BoltzmannQPolicy(tau=0.05) train_policy = EpsGreedyQPolicy() test_policy = GreedyQPolicy() # Compile the agent based on method specified. We use .upper() to convert to # upper case for comparison if METHOD.upper() == 'DUEL_DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'SARSA': # SARSA does not require a memory. agent = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=train_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'CEM':
del self.rewardbuf[0] self.rewards[self.episode] = rw self.avgrewards[self.episode] = np.mean(self.rewardbuf) self.plot() self.episode += 1 def plot(self): self.grphinst.set_ydata(self.rewards) self.grphavg.set_ydata(self.avgrewards) plt.draw() plt.pause(0.01) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy, enable_double_dqn=False) dqn.compile(Adam(lr=0.002, decay=2.25e-05), metrics=['mse']) cbs = [EpsDecayCallback(eps_poilcy=policy, decay_rate=0.975)] cbs += [LivePlotCallback(nb_episodes=4000, avgwindow=20)] dqn.fit(env, nb_steps=1000000, visualize=False, verbose=2, callbacks=cbs) dqn.save_weights('{}/dqn_{}_weights.h5f'.format(outdir, ENV_NAME), overwrite=True) # evaluate the algorithm for 100 episodes. #dqn.test(env, nb_episodes=100, visualize=True)
# Three hidden layers of HLS ReLU units and one linear output per action.
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
for _ in range(3):
    model.add(Dense(HLS))
    model.add(Activation('relu'))
model.add(Dense(n_action))
model.add(Activation('linear'))
# How did it go? summary() prints the table itself and returns None, so it is
# called directly; the Python 2 `print model.summary()` statement was a syntax
# error on Python 3 and printed an extra "None" on Python 2.
model.summary()

# Configure the RL agent.
# NOTE(review): no window_length is passed to SequentialMemory here; some
# keras-rl releases require it — confirm against the installed version.
memory = SequentialMemory(limit=50000)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=n_action, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(optimizer=RMSprop(), metrics=['mae'])

# Training
dqn.fit(env, nb_steps=10000, verbose=2, visualize=False)

# Visualize testing
dqn.test(env, nb_episodes=5, visualize=True)
Dense(nb_hidden), Activation("relu"), Dense(nb_hidden), Activation("relu"), Dense(nb_actions), Activation("linear") ]) memory = SequentialMemory(limit=memory_size, window_length=window_length) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=nb_steps_warmup, target_model_update=target_model_update, policy=policy, gamma=gamma, batch_size=batch_size, train_interval=train_interval, enable_double_dqn=True, delta_clip=delta_clip) dqn.compile(Adam(), metrics=["mae"]) # Whether to use existing model with the same hyper-parameters, skip its training and only do testing. if use_previous and logger_file.format( type="train", model_name=model_name).split("/")[-1] in existing_models: print("Model exists, skipping training...\n") dqn.load_weights( model_file_prev.format(model_name=model_name, type="weights"))
policy = LinearAnnealedPolicy( EpsGreedyQPolicy(), attr="eps", value_max=1.0, value_min=0.05, value_test=0, nb_steps=10000, ) # load saved model into DQNAgent class trained_dqn_agent = DQNAgent( model=loaded_model, nb_actions=18, policy=policy, memory=memory, nb_steps_warmup=NB_STEPS_WARMUP, gamma=0.5, target_model_update=1, delta_clip=0.01, enable_double_dqn=True, ) ############################################################################## # set random seeds tf.random.set_seed(0) np.random.seed(0) # This is the function that will be used to train the dqn def dqn_training(player, dqn, nb_steps, filename):
nb_episodes_memory = 1000 try: memory = pickle.load(open("memory.pkl", "rb")) except (FileNotFoundError, EOFError): memory = SequentialMemory(limit=nb_episode_steps * nb_episodes_memory, window_length=1) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! #policy = BoltzmannQPolicy() policy = EpsGreedyQPolicy(eps=0.01) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) try: dqn.load_weights('dqn_{}_weights.h5f'.format(ENV_NAME)) except (OSError): logger.warning("File not found") n = 0 while True: n += 1 logger.info(f'Iteration #{n}') # Run some training
# In[ ]: model.summary() # In[ ]: memory = SequentialMemory(limit=2000, window_length=1) policy = BoltzmannQPolicy(tau=1.) #dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, # target_model_update=1e-2, policy=policy) dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=1000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # In[ ]: ENV_NAME = "aftersubmissionv19" weights_filename = 'dqn_{}_weights.h5f'.format(ENV_NAME) checkpoint_weights_filename = 'dqn_' + ENV_NAME + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(ENV_NAME) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=25000)
print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! # train_policy = BoltzmannQPolicy() train_policy = EpsGreedyQPolicy(eps=1.0) test_policy = GreedyQPolicy() # Compile the agent based on method specified. We use .upper() to convert to # upper case for comparison if METHOD.upper() == 'DUEL_DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, enable_dueling_network=True, dueling_type='avg', target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'DQN': memory = SequentialMemory(limit=NUM_STEPS, window_length=1) agent = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=train_policy, test_policy=test_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'SARSA': # SARSA does not require a memory. agent = SarsaAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=train_policy) agent.compile(Adam(lr=1e-3, clipnorm=1.0), metrics=['mae']) elif METHOD.upper() == 'CEM':
processor = AtariProcessor() policy = policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1250000) dqn = DQNAgent(model=model, nb_actions=n_actions, policy=policy, memory=memory, processor=processor, enable_double_dqn=False, enable_dueling_network=False, nb_steps_warmup=50000, gamma=.99, target_model_update=10000, train_interval=4, delta_clip=1.) dqn.compile(Adam(lr=.00025), metrics=['mae']) # folder_path = './model_saves/Vanilla/' weights_filename = 'dqn_{}_weights.h5f'.format(env_name) checkpoint_weights_filename = 'dqn_' + env_name + '_weights_{step}.h5f' log_filename = 'dqn_{}_log.json'.format(env_name) callbacks = [ ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)
# Exploration policy: eps-greedy takes a random action with probability eps.
# eps is annealed from 1.0 down to 0.1 over 1M steps, so the agent explores
# heavily at first and relies on its learned values later. A separate eps of
# 0.05 is used for testing so that some random actions are still taken.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# Boltzmann-style exploration is a possible alternative (left disabled):
# policy = BoltzmannQPolicy(tau=1.)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    window_length=WINDOW_LENGTH,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    delta_range=(-1., 1.),
    target_model_update=10000,
    train_interval=4,
)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Output file names are derived from the environment name.
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)

    # Periodic weight checkpoints plus a JSON training log.
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000),
        FileLogger(log_filename, interval=100),
    ]

    # Training is executed under cProfile so profiling statistics are printed
    # once fit() returns.
    import cProfile
    cProfile.run('dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)')
    # Unprofiled equivalent:
    # dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)
# Replay buffer.
memory = SequentialMemory(limit=memory_limit, window_length=window_length)

# Training policy: eps-greedy with eps annealed from `eps` down to 0 over
# nb_steps - 2000 steps; testing uses a purely greedy policy.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(eps=eps),
    attr='eps',
    value_max=eps,
    value_min=0,
    value_test=0,
    nb_steps=nb_steps - 2000,
)
test_policy = GreedyQPolicy()

# Agent construction and compilation.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=window_length + batch_size,
    target_model_update=target_model_update,
    policy=policy,
    test_policy=test_policy,
    batch_size=batch_size,
    train_interval=train_interval,
    gamma=gamma,
)
dqn.compile(Adam(lr=lr), metrics=['mae'])

# Train, then print how many episode rewards were recorded.
history = dqn.fit(
    env,
    nb_steps=nb_steps,
    visualize=False,
    verbose=1,
    action_repetition=action_repetition,
)
print('\n\n {} \n\n'.format(len(history.history['episode_reward'])))
def main():
    """Load a saved Keras model, wrap it in a DQN agent and test it on CarRacing.

    The model architecture is read from ``./saved/model2.json`` and its weights
    from ``./saved/model2.h5``. The agent is then compiled and evaluated for
    five episodes with rendering enabled.

    The training loop iterates over ``range(0)`` and therefore never executes;
    raise the range to re-enable training and checkpointing.

    NOTE(review): this function was reconstructed from flattened source. The
    training loop is assumed to end after the "Saved model to disk" print;
    confirm against the original file.
    NOTE(review): the model is loaded from ``./saved/`` but written to the
    current directory; confirm that this is intended.
    """
    env = CarRacing()
    env.reset()

    # Load the architecture from JSON. The context manager guarantees the
    # file handle is released even if reading fails.
    with open('./saved/model2.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = tf.keras.models.model_from_json(loaded_model_json)

    # Load weights into the freshly built model.
    loaded_model.load_weights("./saved/model2.h5")
    print("Loaded model from disk")
    loaded_model.compile(loss='mse', optimizer='adam')
    model = loaded_model

    # Earlier in-code model definitions, kept for reference:
    # print("State Space {}".format(env.P[331]))
    # model = Sequential(name='rvoum')
    # model.add(Reshape((96,96,3),input_shape = (1,96,96,3)))
    # model.add(Conv2D(filters=32, kernel_size=(3, 3), strides = 3,activation="relu", input_shape=(1,96,96,3)))
    # model.add(Flatten())
    # model.add(Dense(3,activation="selu"))
    # #model.add(Embedding(500,6,input_length = 1, name='Embedding'))
    # model.add(Reshape((3,),name='Reshape'))
    # model = Sequential()
    # model.add(Reshape((96,96,3),input_shape = (1,96,96,3)))
    # model.add(Conv2D(filters=6, kernel_size=(7, 7), strides=3, activation='relu', input_shape=(96, 96, 3)))
    # model.add(MaxPooling2D(pool_size=(2, 2)))
    # model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
    # model.add(MaxPooling2D(pool_size=(2, 2)))
    # model.add(Flatten())
    # model.add(Dense(216, activation='relu'))
    # model.add(Dense(16, activation=None))
    # model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.0001, epsilon=1e-8))
    # model.summary()

    policy = EpsGreedyQPolicy()
    memory = SequentialMemory(limit=5000, window_length=1)
    nb_actions = 16
    dqn = DQNAgent(model=model, memory=memory, nb_actions=nb_actions,
                   nb_steps_warmup=10, target_model_update=1e-5, policy=policy)
    dqn.compile(Adam(lr=0.0001), metrics=['mse'])

    log_interval = 1e4
    # range(0) disables training; each iteration would train for 3000 steps
    # and then save the model architecture and weights.
    for i in range(0):
        print(i, "\n")
        dqn.fit(env, nb_steps=3000, log_interval=log_interval, verbose=1,
                nb_max_episode_steps=1000, action_repetition=3)
        env.close()
        model_json = model.to_json()
        with open("model2.json", "w") as json_file:
            json_file.write(model_json)
        model.save_weights("model2.h5")
        print("Saved model to disk")

    # Evaluation run.
    env.init_test()
    dqn.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=10000,
             action_repetition=2)
    env.close()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(16)) model.add(Activation('relu')) model.add(Dense(nb_actions)) model.add(Activation('linear')) print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! memory = SequentialMemory(limit=50000, window_length=1) policy = EpsGreedyQPolicy(eps=0.1) dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=100, target_model_update=1e-2, policy=policy) dqn.compile(Adam(lr=1e-3), metrics=['mae']) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=50000, visualize=False, verbose=2, nb_max_episode_steps=300) import rl.callbacks class EpisodeLogger(rl.callbacks.Callback): def __init__(self): self.observations = {} self.rewards = {} self.actions = {} def on_episode_begin(self, episode, logs):
motor_parameter=dict(r_a=15e-3, r_e=15e-3, l_a=1e-3, l_e=1e-3), load_parameter=dict(a=0, b=.1, c=.1, j_load=0.04), # Pass a string (with extra parameters) ode_solver='euler', solver_kwargs={}, # Pass a Class with extra parameters reference_generator=WienerProcessReferenceGenerator(reference_state='i', sigma_range=(3e-3, 3e-2)) ) env = FlattenObservation(env) nb_actions = env.action_space.n window_length = 1 model = Sequential() model.add(Flatten(input_shape=(window_length,) + env.observation_space.shape)) model.add(Dense(16, activation='relu')) model.add(Dense(16, activation='relu')) model.add(Dense(4, activation='relu')) model.add(Dense(nb_actions, activation='linear')) memory = SequentialMemory(limit=15000, window_length=window_length) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=0.2), 'eps', 0.2, 0.01, 0, 20000) dqn = DQNAgent( model=model, policy=policy, nb_actions=nb_actions, memory=memory, gamma=0.9, batch_size=128, train_interval=1, memory_interval=1 ) dqn.compile(Adam(lr=1e-4), metrics=['mse']) dqn.fit(env, nb_steps=200000, action_repetition=1, verbose=2, visualize=True, nb_max_episode_steps=50000, log_interval=10000) dqn.test(env, nb_episodes=3, nb_max_episode_steps=50000, visualize=True)
print_stats=False) env = ClassifyEnv(MODE, imb_rate, X_train, y_train) memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1) policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=EPS_MAX, value_min=EPS_MIN, value_test=0.05, nb_steps=EPS_STEPS) dqn = DQNAgent(model=model, policy=policy, nb_actions=2, memory=memory, processor=processor, nb_steps_warmup=WARMUP_STEPS, gamma=GAMMA, target_model_update=TARGET_MODEL_UPDATE, train_interval=4, delta_clip=1, batch_size=BATCH_SIZE, enable_double_dqn=DOUBLE_DQN) dqn.compile(Adam(lr=LR)) metrics = Metrics(X_val, y_val) dqn.fit(env, nb_steps=TRAINING_STEPS, log_interval=LOG_INTERVAL, callbacks=[metrics], verbose=0) y_pred = make_predictions(dqn.target_model, X_test) stats = calculate_metrics(y_test, y_pred) # Get stats as dictionairy
# Output layer: one linear unit per action.
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# Replay memory and policies shared by both agent variants.
memory = SequentialMemory(limit=NUM_STEPS, window_length=1)
# train_policy = BoltzmannQPolicy(tau=0.05)
train_policy = EpsGreedyQPolicy()
test_policy = GreedyQPolicy()

# Arguments common to the plain and the dueling agent.
agent_kwargs = dict(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=100,
    target_model_update=1e-2,
    policy=train_policy,
    test_policy=test_policy,
)
if DUEL_DQN:
    # The dueling variant only adds the dueling-network options.
    agent_kwargs.update(enable_dueling_network=True, dueling_type='avg')
    filename_template = 'weights/duel_dqn_{}_weights_{}_{}_{}_{}.h5f'
else:
    filename_template = 'weights/dqn_{}_weights_{}_{}_{}_{}.h5f'

dqn = DQNAgent(**agent_kwargs)
filename = filename_template.format(ENV_NAME, LAYER_SIZE, NUM_HIDDEN_LAYERS, NUM_STEPS, TRIAL_ID)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Load the model weights.
# NOTE(review): `filename` is computed above but the weights are loaded from
# `FILENAME`, which must be defined elsewhere; confirm which one is intended.
dqn.load_weights(FILENAME)
nb_actions = env.action_space.n model = tf.keras.Sequential() model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))) model.add(Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))) model.add(Dense(nb_actions)) print(model.summary()) memory = SequentialMemory(limit=200000, window_length=1) policy = CustomEpsGreedy(max_eps=0.6, min_eps=0.1, eps_decay=0.9997) agent = DQNAgent( nb_actions=nb_actions, model=model, memory=memory, policy=policy, gamma=0.99, batch_size=64) agent.compile(optimizer=Adam(lr=1e-3), metrics=['mae']) if mode == 'train': #tensorboard_callback = TensorBoard(log_dir="~/tflog/") # early_stopping = EarlyStopping(monitor='episode_reward', patience=0, verbose=1) history = agent.fit(env, nb_steps=200000, visualize=False, nb_max_episode_steps=500, log_interval=500,
# Fix the NumPy and environment seeds.
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Q-network: flattened observation -> 32 -> 64 -> one linear output per action.
model = Sequential()
for layer in (
    Flatten(input_shape=(1, ) + env.observation_space.shape),
    Dense(32),
    Activation('relu'),
    Dense(64),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
):
    model.add(layer)

# Agent: eps-greedy exploration with sequential replay memory.
policy = EpsGreedyQPolicy(eps=0.01)
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=100,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mse'])

# Train and plot the reward obtained in each episode.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)
plt.plot(history.history['episode_reward'])
plt.show()
# Fully connected head on top of the last convolutional output.
dense = Dense(512, activation='relu')(Flatten()(cv3))
buttons = Dense(nb_actions, activation='linear')(dense)
model = Model(inputs=frame, outputs=buttons)
print(model.summary())

# Prioritized experience replay (PER) memory.
memory = PrioritizedMemory(
    limit=1000000,
    alpha=.6,
    start_beta=.4,
    end_beta=1.,
    steps_annealed=10000000,
    window_length=WINDOW_LENGTH,
)
processor = AtariProcessor()

# eps-greedy exploration, annealed from 1.0 to 0.1; eps is 0.05 when testing.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1250000,
)

# Double DQN with a dueling network.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    processor=processor,
    enable_double_dqn=True,
    enable_dueling_network=True,
    nb_steps_warmup=1000,
    gamma=.99,
    target_model_update=10000,
    train_interval=4,
    delta_clip=1.,
)

# Prioritized memories typically use lower learning rates.
dqn.compile(Adam(lr=.00025/4), metrics=['mae'])

folder_path = './'
mode = 'train'

if mode == 'train':
    # Output file names, all placed under folder_path.
    weights_filename = folder_path + 'pdd_dqn_{}_weights.h5f'.format(env_name)
    checkpoint_weights_filename = folder_path + 'pdd_dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = folder_path + 'pdd_dqn_' + env_name + '_REWARD_DATA.txt'
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=500000),
        TrainEpisodeLogger(),
    ]
nb_actions = env.action_space.n

# Q-network: three hidden ReLU layers of 16 units, linear output per action.
model = Sequential()
for layer in (
    Flatten(input_shape=input_shape),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    Dense(nb_actions, activation='linear'),
):
    model.add(layer)
model.summary()

# Agent with Boltzmann exploration and sequential replay memory.
memory = SequentialMemory(limit=50000, window_length=window_length)
policy = BoltzmannQPolicy()
agent = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
agent.compile(Adam())

import matplotlib.pyplot as plt

# Keep the history returned by fit so it can be plotted below.
history = agent.fit(env, nb_steps=10000, visualize=False, verbose=1)
# agent.test(env, nb_episodes=5, visualize=True)

# Plot the number of steps taken in each episode.
plt.subplot(2, 1, 1)
plt.plot(history.history["nb_episode_steps"])
plt.ylabel("step")
# Q-network: three hidden Dense(16)+ReLU stages, then one linear output per action.
model = Sequential()
for layer in (
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
):
    model.add(layer)
print(model.summary())

# Configure and compile the agent.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train with rendering enabled.
dqn.fit(env, nb_steps=50000, visualize=True, verbose=2)

# Persist the final weights.
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Evaluate for 5 episodes.
dqn.test(env, nb_episodes=5, visualize=True)
# Last hidden stage and the linear output layer.
model.add(Dense(16))
# model.add(Dropout(d))
model.add(Activation('relu'))
model.add(Dense(nb_actions, activation='linear'))
print(model.summary())

# Configure and compile the agent.
memory = SequentialMemory(limit=2500, window_length=1)
policy = BoltzmannQPolicy()

# Dueling network enabled; dueling_type may be one of {'avg', 'max', 'naive'}.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=2,
    enable_dueling_network=True,
    dueling_type='avg',
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Start from previously saved weights.
dqn.load_weights('duel_dqn_MountainCar-v0_weights_201806252113.h5f')

# Training and saving are currently disabled:
# dqn.fit(env, nb_steps=10000000, visualize=False, verbose=2)

# dqn.save_weights('duel_dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
# Exploration policy: eps-greedy takes a random action with probability eps.
# eps is annealed from 1.0 down to 0.1 over 1M steps, so the agent explores
# heavily at first and relies on its learned values later. A separate eps of
# 0.05 is used for testing so that some random actions are still taken.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=1000000,
)

# Boltzmann-style exploration is a possible alternative (left disabled):
# policy = BoltzmannQPolicy(tau=1.)

dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    window_length=WINDOW_LENGTH,
    memory=memory,
    processor=processor,
    nb_steps_warmup=50000,
    gamma=.99,
    delta_range=(-1., 1.),
    reward_range=(-1., 1.),
    target_model_update=10000,
    train_interval=4,
)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

if args.mode == 'train':
    # Output file names are derived from the environment name.
    weights_filename = 'dqn_{}_weights.h5f'.format(args.env_name)
    checkpoint_weights_filename = 'dqn_' + args.env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(args.env_name)

    # Periodic weight checkpoints plus a JSON training log.
    callbacks = [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000),
        FileLogger(log_filename, interval=100),
    ]
    dqn.fit(env, callbacks=callbacks, nb_steps=1750000, log_interval=10000)

    # Save the final weights once training has finished.
    dqn.save_weights(weights_filename, overwrite=True)
# Q-network: three hidden Dense(16)+ReLU stages, then one linear output per action.
model = Sequential()
for layer in (
    Flatten(input_shape=(1, ) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
):
    model.add(layer)
print(model.summary())

# Dueling DQN agent ('avg' dueling type) with Boltzmann exploration.
memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    enable_dueling_network=True,
    dueling_type='avg',
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train with rendering, save the weights, then evaluate for 5 episodes.
dqn.fit(env, nb_steps=1000, visualize=True, verbose=2)
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
dqn.test(env, nb_episodes=5, visualize=True)
from keras.optimizers import Adam
import gym
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

# Environment and size of its discrete action space.
env = gym.make('MountainCar-v0')
nb_actions = env.action_space.n

# Q-network: three hidden Dense(16)+ReLU stages, then one linear output per action.
model = Sequential()
for layer in (
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
):
    model.add(layer)

# Agent with eps-greedy exploration (eps=0.001).
memory = SequentialMemory(limit=30000, window_length=1)
policy = EpsGreedyQPolicy(eps=0.001)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    gamma=0.99,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train without rendering, then run a single rendered test episode.
history = dqn.fit(env, nb_steps=30000, visualize=False, verbose=2)
dqn.test(env, nb_episodes=1, visualize=True)
# Select a policy. We use eps-greedy action selection, which means that a random action is selected # with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that # the agent initially explores the environment (high eps) and then gradually sticks to what it knows # (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.05 # so that the agent still performs some random actions. This ensures that the agent cannot get stuck. policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1, value_test=.05, nb_steps=1000000) # The trade-off between exploration and exploitation is difficult and an on-going research topic. # If you want, you can experiment with the parameters or use a different policy. Another popular one # is Boltzmann-style exploration: #policy = BoltzmannQPolicy(tau=1.) # Feel free to give it a try! dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor, nb_steps_warmup=20000, gamma=.99, target_model_update=10000, train_interval=1, delta_clip=1.) dqn.compile(Adam(lr=.00025), metrics=['mae']) agents.append(dqn) mdqn = IndieMultiAgent(agents) callback_multi_test={} for id,agent in enumerate(agents): callbacks = [(TestLogger())] history=History() callbacks += [history] callback_multi_test[agent]=callbacks if args.mode == 'train': # Okay, now it's time to learn something! We capture the interrupt exception so that training # can be prematurely aborted. Notice that you can the built-in Keras callbacks! callback_multi={}