def run_sarsa():
    global N_NODE_NETWORK
    env = SnakeGymDiscrete()
    nb_actions = env.action_space.n

    # initialize randomness
    np.random.seed(123)
    env.seed(123)

    # create model
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(N_NODE_NETWORK))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()
    sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                       policy=policy)
    sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)
    sarsa.save_weights('sarsa_SnakeGymDiscrete_weights.h5f', overwrite=True)
    sarsa.test(env, nb_episodes=5, visualize=True)
def main():
    model = Sequential()
    model.add(Flatten(input_shape=(1, 7)))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=20, activation='relu'))
    model.add(Dense(units=6, activation='linear'))
    logger.info(model.summary())

    steps = 1E9
    interval = steps // 100

    # policy = MyPolicy()
    policy = BoltzmannQPolicy()
    agent = SARSAAgent(model=model, nb_actions=6, policy=policy,
                       train_interval=10, nb_steps_warmup=10)
    adam = Adam()
    sgd = SGD(lr=1e-3, momentum=0, decay=0, nesterov=False)
    agent.compile(optimizer=adam, metrics=['mse'])

    env = MyEnv()
    agent.fit(env, steps, verbose=2, visualize=True)

    fp = Path(__file__).resolve().parent / 'sarsa_weights.h5f'
    agent.save_weights(fp, overwrite=True)
    logger.info('Done')
def main():
    # binance = DataReader()
    env = BinanceEnv()
    # binance.get_recent_trades()
    # env.next_observation()
    # binance_market = BinanceMarket()
    # binance_market.long()
    # time.sleep(3)
    # binance_market.close_long()
    # time.sleep(3)
    # binance_market.short()
    # time.sleep(3)
    # binance_market.close_short()
    # binance_market.update_positions()
    # print(binance_market.balance)

    # episodes = 10
    # for episode in range(1, episodes + 1):
    #     # At each beginning, reset the game
    #     state = env.reset()
    #     # set done to False
    #     done = False
    #     # set score to 0
    #     score = 0
    #     # while the game is not finished
    #     while not done:
    #         # visualize each step
    #         env.render()
    #         # choose a random action
    #         action = random.randint(0, 5)
    #         # execute the action
    #         n_state, reward, done, info = env.step(action)
    #         # keep track of rewards
    #         score += reward
    #     print('episode {} score {}'.format(episode, score))

    model = agent(env.observation_space.shape[0], env.action_space.n)
    policy = EpsGreedyQPolicy()
    sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
    sarsa.compile('adam', metrics=['mse', 'accuracy'])

    # sarsa.load_weights('sarsa_weights_bnb_07.h5f')
    env.is_testing = False
    sarsa.fit(env, nb_steps=100000, visualize=False, verbose=1)
    sarsa.save_weights('sarsa_weights_bnb_07_1.h5f', overwrite=True)

    # sarsa.load_weights('sarsa_weights_bnb_07_1.h5f')
    # env.simulator = False
    env.is_testing = True
    scores = sarsa.test(env, nb_episodes=1, visualize=False)
    print('Average score over test games: {}'.format(
        np.mean(scores.history['episode_reward'])))

    _ = sarsa.test(env, nb_episodes=10, visualize=True)

    # Roll the trained Q-network forward manually; model.predict returns Q-values,
    # so the action is the argmax over them.
    obs = env.reset()
    for i in range(2000):
        q_values = model.predict(np.asarray(obs).reshape(1, 1, -1))
        action = int(np.argmax(q_values[0]))
        obs, rewards, done, info = env.step(action)
        env.render()
# Build a neural net with 3 hidden layers
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


# Instantiate the 3-hidden-layer network for this environment
model = agent(env.observation_space.shape[0], env.action_space.n)
policy = EpsGreedyQPolicy()

# Create a keras-rl SARSA agent driven by the [state > action > reward] loop
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
# Pick the optimizer and metrics used to fit the Q-network
sarsa.compile('adam', metrics=['mse'])

# sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)
sarsa.load_weights('cartpolekerassarsa.h5f')
scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))
sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
sarsa.compile(Adam(lr=lrn_rate), metrics=['mae'])

# Set up callbacks for result collection and realtime visualization through TensorBoard.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))
tpl = TrainEpisodeLogger()

# Finally, perform the training. visualize=False trains without rendering the game,
# which speeds up the training process.
sarsa.fit(env, nb_steps=nb_steps, visualize=False, verbose=2,
          callbacks=[tensorboard, tpl],
          nb_max_episode_steps=nb_max_episode_steps)

# Save the model weights.
sarsa.save_weights('sarsa_%d_%s_weights.h5f' % (scale, ENV_NAME), overwrite=True)

# Save the training results.
metrics = []


def dict_to_list(dc):
    re = []
    for key in dc:
        re.append(dc[key])
    return re


tt = dict_to_list(tpl.rewards_mean)
mm = np.array(tt[:-1])
kk = dict_to_list(tpl.metrics_at_end)
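# A minimal follow-up sketch, not from the original script: persisting the per-episode
# reward means gathered above. It assumes `mm` is the 1-D NumPy array built from
# tpl.rewards_mean just before this point; the CSV filename is an illustrative choice.
np.savetxt('sarsa_%d_%s_rewards.csv' % (scale, ENV_NAME), mm, delimiter=',')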
class DistopiaSARSA:
    def __init__(self, env_name='distopia-initial4-v0', in_path=None, out_path=None,
                 terminate_on_fail=False, reconstruct=False):
        self.ENV_NAME = env_name
        self.filename = self.ENV_NAME
        self.init_paths(in_path, out_path)
        self.init_env(terminate_on_fail)
        self.init_model(reconstruct)
        self.compile_agent()

    def init_paths(self, in_path, out_path):
        self.in_path = in_path  # if self.in_path != None else './'
        self.out_path = out_path if out_path is not None else './'
        self.log_path = "./logs/{}".format(time.time())
        os.mkdir(self.log_path)

    def init_env(self, terminate_on_fail):
        self.env = gym.make(self.ENV_NAME)
        self.env.terminate_on_fail = terminate_on_fail
        self.env.record_path = "{}/ep_".format(self.log_path)
        self.env = gym.wrappers.Monitor(self.env, "recording", force=True)
        np.random.seed(234)
        self.env.seed(234)
        self.nb_actions = np.sum(self.env.action_space.nvec)
        self.num_actions = self.env.NUM_DIRECTIONS
        self.num_blocks = self.env.NUM_DISTRICTS * self.env.BLOCKS_PER_DISTRICT

    def init_model(self, reconstruct=False):
        if self.in_path is not None:
            if reconstruct:
                self.construct_model()
            else:
                with open("{}/{}.yaml".format(self.in_path, self.filename), 'r') as yaml_file:
                    model_yaml = yaml_file.read()
                self.model = model_from_yaml(model_yaml)
                self.model.load_weights("{}/{}.h5".format(self.in_path, self.filename))
        else:
            # Next, we build a very simple model.
            self.construct_model()
            self.save_model()
        print(self.model.summary())

    def construct_model(self):
        self.model = Sequential()
        self.model.add(Flatten(input_shape=(1,) + self.env.observation_space.shape))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        self.model.add(Dense(64))
        self.model.add(Activation('relu'))
        # self.model.add(Dense(16))
        # self.model.add(Activation('relu'))
        self.model.add(Dense(self.nb_actions))
        self.model.add(Activation('linear'))

    def save_model(self):
        if self.out_path is not None:
            with open(self.filename + ".yaml", 'w+') as yaml_file:
                yaml_file.write(self.model.to_yaml())
            self.model.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME))

    def compile_agent(self):
        # Finally, we configure and compile our agent. You can use every built-in Keras
        # optimizer and even the metrics!
        processor = DistopiaProcessor(self.num_blocks, self.num_actions)
        # memory = SequentialMemory(limit=50000, window_length=1)
        # policy = PatchedBoltzmannQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        # test_policy = PatchedGreedyQPolicy(num_actions=self.num_actions, num_blocks=self.num_blocks)
        policy = BoltzmannQPolicy()
        test_policy = GreedyQPolicy()
        self.sarsa = SARSAAgent(model=self.model, processor=processor,
                                nb_actions=self.nb_actions, nb_steps_warmup=1000,
                                policy=policy, test_policy=test_policy, gamma=0.9)
        self.sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

    def train(self, max_steps=100, episodes=100):
        # Okay, now it's time to learn something! Visualizing the training slows it down
        # quite a lot, so it is disabled here. You can always safely abort the training
        # prematurely using Ctrl + C.
        self.env._max_steps = max_steps
        # for i in range(episodes):
        self.env.current_step = 0
        n_steps = max_steps * episodes
        logger = FileLogger(filepath='{}/{}.json'.format(self.out_path, self.ENV_NAME))
        self.sarsa.fit(self.env, nb_steps=n_steps, nb_max_episode_steps=max_steps,
                       visualize=False, verbose=1, callbacks=[logger])
        # self.env.reset()
        # After training is done, we save the final weights.
        self.sarsa.save_weights('{}/{}.h5'.format(self.out_path, self.ENV_NAME),
                                overwrite=True)

    def test(self):
        # Finally, evaluate our algorithm for 5 episodes.
        self.sarsa.test(self.env, nb_episodes=5, nb_max_start_steps=0, visualize=True)
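# Hedged usage sketch (not part of the original file): constructing the trainer with its
# defaults, training, and evaluating. The output path and episode counts are assumptions.
if __name__ == '__main__':
    trainer = DistopiaSARSA(env_name='distopia-initial4-v0', out_path='./')
    trainer.train(max_steps=100, episodes=100)
    trainer.test()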
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(learning_rate=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualizing the training would slow it down
# quite a lot, so it is disabled here. You can always safely abort the training
# prematurely using Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights(f'sarsa_{ENV_NAME}_weights.h5f', overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
def train():
    # Get the environment and extract the number of actions.
    env = gym.make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Next, we build a very simple model.
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(16))
    model.add(Activation('relu'))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))
    print(model.summary())

    # SARSA does not require a memory.
    policy = BoltzmannQPolicy()

    # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False)
    # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True)
    if not SMOOTH:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True)
    else:
        processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False)
        processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True)

    if REWARD == "normal":
        sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions,
                                  nb_steps_warmup=10, policy=policy)
        sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae'])
        history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2)
        sarsa_normal.save_weights(
            os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)),
            overwrite=True)
        sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2)
        pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv"))

    elif REWARD == "noisy":
        sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                 policy=policy, processor=processor_noisy)
        sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae'])
        history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_noisy.save_weights(
                os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv"))
        else:
            sarsa_noisy.save_weights(
                os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_noisy.history).to_csv(
                os.path.join(LOG_DIR, "noisy_smooth.csv"))
        sarsa_noisy.test(env, nb_episodes=10, visualize=False)

    elif REWARD == "surrogate":
        sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                                     policy=policy, processor=processor_surrogate)
        sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae'])
        history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2)
        if not SMOOTH:
            sarsa_surrogate.save_weights(
                os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(
                os.path.join(LOG_DIR, "surrogate.csv"))
        else:
            sarsa_surrogate.save_weights(
                os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)),
                overwrite=True)
            pandas.DataFrame(history_surrogate.history).to_csv(
                os.path.join(LOG_DIR, "surrogate_smooth.csv"))
        sarsa_surrogate.test(env, nb_episodes=10, visualize=False)
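# Hedged sketch of the module-level configuration that train() above assumes. Every value
# here is a placeholder, not taken from the original script.
ENV_NAME = 'CartPole-v0'   # assumed environment id
LOG_DIR = './logs'         # assumed output directory
ERR_N, ERR_P = 0.1, 0.1    # assumed noise rates passed to CartpoleProcessor
REWARD = 'normal'          # selects a branch: 'normal', 'noisy', or 'surrogate'
SMOOTH = False             # toggles the smoothed processors and file names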
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualizing the training would slow it down
# quite a lot, so it is disabled here. You can always safely abort the training
# prematurely using Ctrl + C.
sarsa.fit(env, nb_steps=500000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights(
    'learning_tools/learning_nn/keras-rl/sarsa_{}_weights.h5f'.format(CONFIG_FILE[7:-4]),
    overwrite=True)

env.close()
del env

# Finally, evaluate our algorithm for 5 episodes.
# sarsa.test(env, nb_episodes=5, visualize=True)
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

policy = EpsGreedyQPolicy()
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)
sarsa.compile('adam', metrics=['mse'])

sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

scores = sarsa.test(env, nb_episodes=100, visualize=False)
print('Average score over 100 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))

sarsa.save_weights('sarsa_weights.h5f', overwrite=True)

_ = sarsa.test(env, nb_episodes=2, visualize=True)
env.close()
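# Hedged preamble sketch for the snippet above: the imports and environment setup it
# assumes but does not show. 'CartPole-v1' is an assumption; any discrete-action Gym
# environment with a flat observation vector fits the same shapes.
import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Flatten

env = gym.make('CartPole-v1')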
class DQN:
    def __init__(self,
                 env="CartPole-v1",
                 emulateOculus=True,
                 visualize=True,
                 teachingFilesPath=None,
                 policyValues={
                     "inner_policy": EpsGreedyQPolicy(),
                     "attr": "eps",
                     "value_max": 0.75,
                     "value_min": .01,
                     "value_test": .0,
                     "nb_steps": 50000
                 },
                 dobotEmulation=False):
        self.policyValues = policyValues
        os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical_devices-------------", len(physical_devices))
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25

        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        # To get an idea about the number of variables affecting the environment
        print('States', self.states)
        # To get an idea about the number of possible actions, e.g. [right, left]
        print('Actions', self.actions)

        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning, reset the game
        #     state = self.env.reset()
        #     # set done to False
        #     done = False
        #     # set score to 0
        #     score = 0
        #     # while the game is not finished
        #     while not done:
        #         # visualize each step
        #         self.env.render()
        #         # choose a random action
        #         action = random.choice([0, 1])
        #         # execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)
        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0
        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model, policy=self.policy,
                                nb_actions=self.actions)
        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t
        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions
        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
            path = ('model/checkpoint/' + datetime.now().strftime("%Y%m%d-%H%M%S")
                    + self.saveFileName)
            self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")
        # With stateful=False, LSTM states are reset together after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env, nb_episodes=nb_episodes, visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,
            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )
        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games: {}'.format(
            np.mean(scores.history['episode_reward'])))
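# Hedged usage sketch (not in the original source): running the CartPole configuration of
# the class above. Step counts come from the policyValues defaults; everything else is an
# assumption about how the class is meant to be driven.
if __name__ == '__main__':
    trainer = DQN(env="CartPole-v1")
    trainer.fit(visualize=False)
    trainer.test(nb_episodes=2)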
                     verbose=2, nb_max_episode_steps=500, callbacks=[tb])  # 20 s episodes

    # print history
    print("history contents: ", hist.history.keys())  # episode_reward, nb_episode_steps, nb_steps

    # summarize history for accuracy
    import matplotlib.pyplot as plt
    plt.plot(hist.history['episode_reward'])
    plt.plot(hist.history['nb_episode_steps'])
    plt.title('learning')
    plt.xlabel('episode')
    plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
    plt.show()

    # save history
    with open('_experiments/history_' + filename + '.pickle', 'wb') as handle:
        pickle.dump(hist.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # After training is done, we save the final weights.
    sarsa.save_weights('h5f_files/dqn_{}_weights.h5f'.format(filename), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    sarsa.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)

if mode == 'test':
    sarsa.load_weights('h5f_files/dqn_{}_weights.h5f'.format(filename))
    sarsa.test(env, nb_episodes=10, visualize=True, nb_max_episode_steps=400)  # 40 s episodes
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualizing the training would slow it down
# quite a lot, so it is disabled here. You can always safely abort the training
# prematurely using Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
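# A minimal follow-up sketch, not from the original script: reloading the weights saved
# above into a freshly compiled agent and evaluating without retraining. It assumes the
# same `model`, `env`, `nb_actions`, and ENV_NAME are still in scope.
sarsa_eval = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                        policy=BoltzmannQPolicy())
sarsa_eval.compile(Adam(lr=1e-3), metrics=['mae'])
sarsa_eval.load_weights('sarsa_{}_weights.h5f'.format(ENV_NAME))
sarsa_eval.test(env, nb_episodes=5, visualize=False)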
    # calculate agent reward based on forward model loss
    r_intr = (fwd_loss[0] ** 0.5) / 100
    # print "r_intr = %d" % r_intr

    # TODO: could use a ratio for intrinsic vs environment reward
    reward = r_intr  # + env_reward
    # print "reward = %f" % reward

    # apply reward
    action = agent.backward(reward, done)

    if done:
        inverse_model.save_weights(inv_weights_fname, overwrite=True)
        forward_model.save_weights(fwd_weights_fname, overwrite=True)
        agent.save_weights(agent_weights_fname, overwrite=True)
        break

env.close()
exit()

#=========================================================================#
# NOTES:
# x start with random actions + test observations
# x train/test inverse model
# x train/test forward model
# x calculate agent reward (based on forward model)
# x change agent to sarsa
# x add forward pass
# x add backward pass
# x save/load weights
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Okay, now it's time to learn something! Visualizing the training would slow it down
# quite a lot, so it is disabled here. You can always safely abort the training
# prematurely using Ctrl + C.
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=2)

# After training is done, we save the final weights.
sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 10 episodes.
sarsa.test(env, nb_episodes=10, visualize=True)
                    kernel_initializer=weight_initializer)(hiddenLayer)
outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)
model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    # After training is done, we save the final weights.
    sarsa.save_weights(file_path, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
    print("Time taken to train: {0}".format(endTime - startTime))
                   policy=policy, test_policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Load weights
try:
    # dqn.load_weights(weights_filename)
    sarsa.load_weights(weights_filename)
except OSError:
    print("no saved weights found")

# Okay, now it's time to learn something! Visualizing the training would slow it down
# quite a lot, so it is disabled here. You can always safely abort the training
# prematurely using Ctrl + C.
# dqn.fit(env, nb_steps=5000000, visualize=False, verbose=2)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1, callbacks=[WandbCallback()])

# After training is done, we save the final weights.
# dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
# dqn.test(env, nb_episodes=5, visualize=True)
sarsa.test(env, nb_episodes=5, visualize=True)

# Save weights
# dqn.save_weights(weights_filename, overwrite=True)
sarsa.save_weights(weights_filename, overwrite=True)
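# Hedged note on the callback used above: WandbCallback comes from the Weights & Biases
# Keras integration and needs an initialized run before fit() is called. The project name
# below is an assumption, not from the original script.
import wandb
from wandb.keras import WandbCallback

wandb.init(project='sarsa-keras-rl')  # assumed project name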