# Build a neural net with 3 hidden layers
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

# Instantiate the network for this environment
model = agent(env.observation_space.shape[0], env.action_space.n)

policy = EpsGreedyQPolicy()

# Create a SARSA (State-Action-Reward-State-Action) agent
sarsa = SARSAAgent(model=model, policy=policy, nb_actions=env.action_space.n)

# Choose the optimizer and the metric tracked during training
sarsa.compile('adam', metrics=['mse'])

# To train from scratch instead of loading saved weights:
# sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)
sarsa.load_weights('cartpolekerassarsa.h5f')

scores = sarsa.test(env, nb_episodes=10, visualize=False)
print('Average score over 10 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))

sarsa.save_weights('cartpolekerassarsa.h5f', overwrite=True)
sarsa.test(env, nb_episodes=2, visualize=True)
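# The snippet above assumes the imports and environment are already in place;
# the weights filename suggests CartPole. A minimal sketch of that setup
# (with keras-rl2 the Keras imports come from tensorflow.keras; with the
# original keras-rl they come from keras):
import numpy as np
import gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

env = gym.make('CartPole-v1')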
# (The fragment begins mid-model; the head of this Dense call is
# reconstructed, and the layer width is an assumption.)
hiddenLayer = Dense(16, activation='relu',
                    kernel_initializer=weight_initializer)(hiddenLayer)
outputLayer = Dense(nb_actions, activation='linear')(hiddenLayer)
model = Model(inputLayer, outputLayer)
print(model.summary())

# SARSA does not require a replay memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

if loadFromExisting:
    sarsa.load_weights(file_path)
else:
    startTime = time.time()
    sarsa.fit(env, nb_steps=nSteps, visualize=True, verbose=1)
    endTime = time.time()
    # After training is done, we save the final weights.
    sarsa.save_weights(file_path, overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)

if not loadFromExisting:
    print("Time taken to train: {0}".format(endTime - startTime))
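# The snippet above also assumes the functional-API input side, the weight
# initializer, and the imports exist beforehand. A minimal sketch of that
# assumed preamble (initializer choice and shapes are guesses):
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.optimizers import Adam
from rl.agents import SARSAAgent
from rl.policy import BoltzmannQPolicy

weight_initializer = GlorotNormal()
inputLayer = Input(shape=(1, env.observation_space.shape[0]))
hiddenLayer = Flatten()(inputLayer)  # the snippet's first Dense call chains onto this tensor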
env = gym.make('CartPole-v1')
states = env.observation_space.shape[0]
actions = env.action_space.n

def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1, states)))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model

model = agent(env.observation_space.shape[0], env.action_space.n)

from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

sarsa = SARSAAgent(model=model, policy=EpsGreedyQPolicy(),
                   nb_actions=env.action_space.n)
# The agent must be compiled before it can be tested.
sarsa.compile('adam', metrics=['mse'])

# To train from scratch instead of loading saved weights:
# sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)
# sarsa.save_weights('1-sarsa_weights.h5f', overwrite=True)

sarsa.load_weights('1-sarsa_weights.h5f')
scores = sarsa.test(env, nb_episodes=100, visualize=True)
print('Average score over 100 test games: {}'.format(
    np.mean(scores.history['episode_reward'])))
# Collect a dict's values into a list
# (the function header was missing from the fragment; reconstructed from the
# calls below)
def dict_to_list(dc):
    re = []
    for key in dc:
        re.append(dc[key])
    return re

tt = dict_to_list(tpl.rewards_mean)
mm = np.array(tt[:-1])
kk = dict_to_list(tpl.metrics_at_end)
jj = np.array(kk[:-1])
metrics = np.column_stack((mm, jj))

import pickle
with open('sarsa_%d_%s_metrics.p' % (scale, ENV_NAME), "wb") as f:
    pickle.dump(metrics, f)

# Load model for testing
sarsa.load_weights('/home/am/Desktop/set_tests/final/sarsa_%d_%s_weights.h5f'
                   % (scale, ENV_NAME))

# Set up monitoring tools to record the testing episodes
from gym import monitoring
from gym.wrappers import Monitor

def episode5(episode_id):
    # Record only the first five episodes
    return episode_id < 5

# rec = StatsRecorder(env, "sarsa_1")
# rec.capture_frame()
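# The episode5 predicate above is presumably meant for Monitor's
# video_callable argument; a minimal sketch of that wiring (the output
# directory name is an assumption):
env = Monitor(env, 'sarsa_recordings', video_callable=episode5, force=True)
sarsa.test(env, nb_episodes=5, visualize=False)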
# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())

# SARSA does not require a replay memory.
policy = BoltzmannQPolicy()
sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                   policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Reuse saved weights when they exist.
model_fn = 'sarsa_{}_weights.h5f'.format(ENV_NAME)
if os.path.isfile(model_fn):
    sarsa.load_weights(model_fn)

# Okay, now it's time to learn something! Training is commented out here
# because saved weights are loaded above; visualizing it slows things down
# quite a lot, and you can always safely abort training prematurely with
# Ctrl+C.
# sarsa.fit(env, nb_steps=50000, nb_max_episode_steps=500, visualize=False, verbose=2)

# After training is done, we save the final weights.
# sarsa.save_weights('sarsa_{}_weights.h5f'.format(ENV_NAME), overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True)
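# This snippet follows the layout of keras-rl's sarsa_cartpole example and
# assumes a preamble like the following (imports as in the earlier sketches,
# plus Activation and os; ENV_NAME and the seeds are assumptions):
ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n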
class DQN:
    def __init__(self,
                 env="CartPole-v1",
                 emulateOculus=True,
                 visualize=True,
                 teachingFilesPath=None,
                 # NB: a mutable default argument like this dict is shared
                 # across instances.
                 policyValues={
                     "inner_policy": EpsGreedyQPolicy(),
                     "attr": "eps",
                     "value_max": 0.75,
                     "value_min": .01,
                     "value_test": .0,
                     "nb_steps": 50000
                 },
                 dobotEmulation=False):
        self.policyValues = policyValues
        os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
        physical_devices = tf.config.experimental.list_physical_devices('GPU')
        print("physical GPUs:", len(physical_devices))
        # Assumes at least one GPU is present.
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
        self.episodeLength = 25

        if env == "CartPole-v1":
            self.env = gym.make('CartPole-v1')
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.n
            self.saveFileName = 'sarsa_weights.h5f'
            logdir = "logs/CartPoleV1/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        elif env == "Dobot":
            self.env = dobotGym.dobotGym(emulateOculus=emulateOculus,
                                         episodeLength=self.episodeLength,
                                         visualize=visualize,
                                         teachingFilesPath=teachingFilesPath,
                                         dobotEmulation=dobotEmulation)
            self.states = self.env.observation_space.shape[0]
            self.actions = self.env.action_space.shape[0]
            self.saveFileName = 'sarsa_weights_dobot.h5f'
            logdir = "logs/Dobot/" + datetime.now().strftime("%Y%m%d-%H%M%S")
            self.tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
            self.visualize = True
        else:
            raise TypeError("Wrong env")

        # Number of variables describing the environment state
        print('States', self.states)
        # Number of possible actions, e.g. [right, left]
        print('Actions', self.actions)

        # # A random-policy sanity check (kept for reference):
        # episodes = 10
        # for episode in range(1, episodes + 1):
        #     # At each beginning, reset the game
        #     state = self.env.reset()
        #     done = False
        #     score = 0
        #     # While the game is not finished
        #     while not done:
        #         # Visualize each step
        #         self.env.render()
        #         # Choose a random action
        #         action = random.choice([0, 1])
        #         # Execute the action
        #         n_state, reward, done, info = self.env.step(action)
        #         # Keep track of rewards
        #         score += reward
        #     print('episode {} score {}'.format(episode, score))

        # Not working :(
        # self.agent = self.agentDDP(self.states, self.actions)
        # self.agent = self.NAFAgent(self.states, self.actions)
        # self.policy = EpsGreedyQPolicy()

        self.savingFreq = 100
        self.actualSaving = 0
        self.model = self.agentSarsa(self.states, self.actions)
        self.policy = LinearAnnealedPolicy(
            inner_policy=self.policyValues["inner_policy"],
            attr=self.policyValues["attr"],
            value_max=self.policyValues["value_max"],
            value_min=self.policyValues["value_min"],
            value_test=self.policyValues["value_test"],
            nb_steps=self.policyValues["nb_steps"])
        self.agent = SARSAAgent(model=self.model,
                                policy=self.policy,
                                nb_actions=self.actions)

        # Workarounds so that tf.keras's checkpointing machinery accepts the
        # keras-rl agent:
        self.agent._is_graph_network = True

        def t():
            return False

        self.agent._in_multi_worker_mode = t
        self.agent.save = self.saveAgentWeights

        def lenmeh():
            return self.actions
        # self.agent.__len__ = lenmeh

    def saveAgentWeights(self, path, overwrite=True):
        # Throttle saving: only write weights every savingFreq calls.
        if self.actualSaving < self.savingFreq:
            self.actualSaving += 1
            return None
        else:
            self.actualSaving = 0
            path = ('model/checkpoint/' +
                    datetime.now().strftime("%Y%m%d-%H%M%S") + self.saveFileName)
            self.agent.save_weights(path, overwrite)

    def agentSarsa(self, states, actions):
        self.model = Sequential()
        self.model.add(LSTM(42, activation='sigmoid', input_shape=(1, states)))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(42, activation='sigmoid'))
        self.model.add(Dense(24, activation='sigmoid'))
        self.model.add(Dense(12, activation='sigmoid'))
        self.model.add(Dense(actions, activation='linear'))
        self.path = fileOperation.saveToFolder(self.model.to_json(),
                                               name='modelShape',
                                               folder="model\\checkpoint")
        # With stateful=False, LSTM states are reset together after each batch.
        # model.add(Flatten(input_shape=(1, states)))
        # dot_img_file = '/model_1.png'
        # keras.utils.plot_model(self.model, to_file=dot_img_file, show_shapes=True)
        # model.reset_states()
        return self.model

    def load(self):
        path = fileOperation.openDialogFunction(".h5f")
        self.agent.compile('adam', metrics=['mse'])
        self.agent.load_weights(path)
        self.agent.compile('adam', metrics=['mse'])

    def test(self, nb_episodes=2):
        _ = self.agent.test(self.env,
                            nb_episodes=nb_episodes,
                            visualize=self.visualize)

    def fit(self, visualize=False):
        checkpoint_filepath = 'model/checkpoint/'
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_filepath,
            save_weights_only=False,
            save_freq=25)
        self.agent.compile('adam', metrics=['mse'])
        self.agent.fit(
            self.env,
            nb_steps=self.policyValues["nb_steps"],
            log_interval=self.episodeLength,
            visualize=visualize,
            verbose=1,
            nb_max_start_steps=1,
            start_step_policy=self.model.reset_states,
            # callbacks=[PlotLossesKeras()])
            callbacks=[self.tensorboard_callback, model_checkpoint_callback],
        )
        scores = self.agent.test(self.env, nb_episodes=5, visualize=visualize)
        print('Average score over 5 test games: {}'.format(
            np.mean(scores.history['episode_reward'])))
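# A minimal, hypothetical driver for the class above (names follow the class
# API defined here; nothing in this block comes from the original source):
if __name__ == '__main__':
    dqn = DQN(env="CartPole-v1")
    dqn.fit(visualize=False)
    dqn.test(nb_episodes=2)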
# (The opening of this fit call was truncated in the fragment; the head and
# the nb_steps value are reconstructed assumptions.)
hist = sarsa.fit(env, nb_steps=50000, visualize=False,
                 verbose=2, nb_max_episode_steps=500,
                 callbacks=[tb])  # 20-second episodes

# Print history contents: episode_reward, nb_episode_steps, nb_steps
print("history contents:", hist.history.keys())

# Summarize the training history
import matplotlib.pyplot as plt
plt.plot(hist.history['episode_reward'])
plt.plot(hist.history['nb_episode_steps'])
plt.title('learning')
plt.xlabel('episode')
plt.legend(['episode_reward', 'nb_episode_steps'], loc='upper left')
plt.show()

# Save the history
with open('_experiments/history_' + filename + '.pickle', 'wb') as handle:
    pickle.dump(hist.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

# After training is done, we save the final weights.
sarsa.save_weights('h5f_files/dqn_{}_weights.h5f'.format(filename),
                   overwrite=True)

# Finally, evaluate our algorithm for 5 episodes.
sarsa.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=500)

if mode == 'test':
    sarsa.load_weights('h5f_files/dqn_{}_weights.h5f'.format(filename))
    sarsa.test(env, nb_episodes=10, visualize=True,
               nb_max_episode_steps=400)  # 40-second episodes
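# The `tb` callback used in the fit call above is not defined in the fragment;
# a plausible definition (the log directory is an assumption) is Keras's
# TensorBoard callback, which can be passed through keras-rl's callbacks list:
from tensorflow.keras.callbacks import TensorBoard
tb = TensorBoard(log_dir='logs/' + filename)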
# (The constructor head was truncated in the fragment; SARSAAgent is assumed
# from context.)
agent = SARSAAgent(model=model, nb_actions=nb_actions,
                   nb_steps_warmup=3, policy=policy)
agent.compile(Adam(lr=1e-3), metrics=['mae'])
agent.reset_states()

# =========================================================================#
# Re-use weights if possible
if os.path.isfile(inv_weights_fname):
    inverse_model.load_weights(inv_weights_fname)
if os.path.isfile(fwd_weights_fname):
    forward_model.load_weights(fwd_weights_fname)
if os.path.isfile(agent_weights_fname):
    agent.load_weights(agent_weights_fname)
# else:
#     # FIXME: this bit seems necessary or the agent does nothing;
#     # it probably initializes internal values
#     agent.fit(env, nb_steps=20, visualize=False)

agent.training = True  # IMPORTANT: without this flag the agent doesn't learn
# =========================================================================#

episode_count = 1000
reward = 0
done = False

for i in range(episode_count):
    print("episode=%d" % i)
    obs_now = env.reset()
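    # (Continuation sketch: the remainder of the loop body was truncated in
    # the fragment; with keras-rl's low-level forward/backward API it would
    # plausibly look like this, not the original code.)
    done = False
    while not done:
        action = agent.forward(obs_now)        # choose an action from the current policy
        obs_now, reward, done, _ = env.step(action)
        agent.backward(reward, terminal=done)  # SARSA update for the last transition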
# memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicyC4(env)

# dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
#                nb_steps_warmup=10, target_model_update=1e-2,
#                policy=policy, test_policy=policy)
# dqn.compile(Adam(lr=1e-3), metrics=['mae'])

sarsa = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10,
                   policy=policy, test_policy=policy)
sarsa.compile(Adam(lr=1e-3), metrics=['mae'])

# Load weights if a previous run saved any
try:
    # dqn.load_weights(weights_filename)
    sarsa.load_weights(weights_filename)
except OSError:
    print("no saved weights found")

# Okay, now it's time to learn something! Visualizing training slows it down
# quite a lot, so it is disabled here. You can always safely abort training
# prematurely using Ctrl+C.
# dqn.fit(env, nb_steps=5000000, visualize=False, verbose=2)
sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1,
          callbacks=[WandbCallback()])

# After training is done, we save the final weights.
# dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME), overwrite=True)
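# The fragment ends before the trained SARSA weights are saved; the save
# presumably mirrors the load above and the commented-out DQN line:
sarsa.save_weights(weights_filename, overwrite=True)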