def remember():
    # Store one transition, posted as JSON, in the agent's replay memory.
    data = request.get_json()
    state = np.array(data['state'])
    action = data['action']
    reward = data['reward']
    next_state = np.array(data['next_state'])
    done = data['done']
    DQNAgent.remember(state, action, reward, next_state, done)
    return 'ok'
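A minimal client-side sketch of how a transition might be sent to this handler, assuming it is exposed as a Flask-style HTTP POST endpoint; the URL, the payload values, and the use of the `requests` library are assumptions, not part of the original snippet.

import requests

# Hypothetical endpoint URL; the original snippet does not show the route decorator.
URL = "http://localhost:5000/remember"

transition = {
    "state": [0.0, 1.0, 0.0, 0.0],
    "action": 2,
    "reward": 1.0,
    "next_state": [0.0, 0.0, 1.0, 0.0],
    "done": False,
}
# The handler above stores this transition in the agent's replay memory.
requests.post(URL, json=transition)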
class DQNPlatform:
    def __init__(self):
        self.frame_skip = 4
        self.exp_scale = 0.1
        self.time_scale = 1
        # self.env = gym.make("Pong-v0")
        # self.env.unwrapped.frameskip = self.frame_skip
        self.env = PenaltyEnvironment()
        self.steps_to_train = int(self.time_scale * 5e7)
        self.agent = DQNAgent(num_actions=self.env.action_space.n,
                              experience_replay_capacity=1e6 * self.exp_scale,
                              frame_skip=self.frame_skip,
                              starting_experience=5e4 * self.exp_scale,
                              discount=0.99,
                              batch_size=32,
                              update_frequency=4 * self.time_scale,
                              target_update_frequency=1e4 * self.time_scale,
                              starting_epsilon=1,
                              final_epsilon=0.1,
                              final_epsilon_step=1e6 * self.time_scale)
        self.steps_played = 0
        self.env.reset()
        self.sequence = StateSequence([84, 84], 4)
        self.processor = StateProcessor()

    def train(self, sess):
        for n in range(self.steps_to_train):
            state = self.sequence.get_sequence()
            action = self.agent.select_action(sess, state)
            obs, reward, done, info = self.env.step(action)
            self.sequence.append_obs(self.processor.process(sess, obs))
            state_prime = self.sequence.get_sequence()
            self.agent.learn(sess, state, action, reward, state_prime, done)
            self.env.render()
            if done:
                self.env.reset()
            self.steps_played += 1
            if n % 10000 == 0:
                print(self.steps_played / self.steps_to_train * 100, "%")

    def test(self, sess):
        self.agent.epsilon = 0
        done = False
        self.env.reset()
        while not done:
            state = self.sequence.get_sequence()
            action = self.agent.select_action(sess, state)
            obs, reward, done, info = self.env.step(action)
            self.sequence.append_obs(self.processor.process(sess, obs))
            self.env.render()
            sleep(0.1)
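A minimal driver sketch for the class above; the TensorFlow 1.x session setup is an assumption inferred from the `sess` parameter of train() and test(), not something shown in the original snippet.

import tensorflow as tf

platform = DQNPlatform()
with tf.Session() as sess:
    # Initialize network variables before training (assumed to be required here).
    sess.run(tf.global_variables_initializer())
    platform.train(sess)
    platform.test(sess)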
def __init__(self, env, qnet):
    self.TrainPolicy = GreedyEpsPolicy(0.8)
    self.TestPolicy = GreedyEpsPolicy(0.0)
    DQNAgent.__init__(self, env, qnet,
                      gamma=0.9,
                      train_policy=self.TrainPolicy,
                      test_policy=self.TestPolicy,
                      steps_between_train=1000,
                      episodes_between_train=1,
                      train_sample_size=50,
                      train_rounds=40,
                      trains_between_updates=1)
class T9Handler(tornado.websocket.WebSocketHandler):
    def initialize(self):
        # Initialize game parameters.
        self.env = T9Desk()
        state_size = self.env.observation_space
        action_size = self.env.action_space_size
        self.agent = DQNAgent(state_size, action_size)
        self.agent.load("T9-dqn.h5")

    def open(self):
        print("WebSocket opened")
        self.initialize()
        # self.write_message(u"Dimas sends greetings!")
        print(self.env.state_json)
        self.write_message(self.env.state_json)

    def on_message(self, message):
        data = json.loads(message)
        print('MESSAGE:', message)
        print('DATA:', data)
        # print('DATA:', data[0], data[1])
        # self.env.tuzdyk = {'p1': 1, 'p2': 2}
        # self.write_message(u"Your message was: " + message)
        if data[0] == 'action_':
            action = data[1]
            who_moves = self.env.who_move
            if (action > 10) & who_moves:
                action -= 10
            elif (action < 10) & who_moves:
                action += 10
            if action < 10:
                self.env.step(action)
            self.write_message(self.env.state_json)
        elif data[0] == 'action':
            action = data[1]
            pr, op = self.env.who_moves_str
            state, reward, done, _ = self.env.step(action)
            self.write_message(self.env.state_json)
            self.env.render()
            # <insert agent>
            action_space = self.env.action_space[pr]
            # action = action_space[np.random.randint(0, action_space.size)]
            action = self.agent.act(state) + 1
            # <\insert agent>
            time.sleep(2)
            self.env.step(action)
            self.write_message(self.env.state_json)
            self.env.render()

    def on_close(self):
        print("WebSocket closed")
def main(_):
    env = gym.make(FLAGS.env_name)
    num_actions = env.action_space.n
    num_features = env.observation_space.shape[0]
    print("num_actions", num_actions, "num_features", num_features)
    agent = DQNAgent(
        num_actions,
        num_features,
        learning_rate=FLAGS.learning_rate,
        reward_decay=FLAGS.reward_decay,
        e_greedy=FLAGS.e_greedy,
        replace_target_iter=FLAGS.replace_target_iter,
        memory_size=FLAGS.memory_size,
        batch_size=FLAGS.batch_size,
        e_greedy_increment=None,
    )
    eps = 0
    score_list = []
    while True:
        eps += 1
        obs = env.reset()
        obs = obs.reshape(-1, num_features)
        score = 0
        for step in range(FLAGS.max_step):
            # print("step", step)
            # action = randint(0, num_actions - 1)
            # action = agent.take_random_action()
            action = agent.choose_action(obs)
            obs_, rew, done, info = env.step(action)
            score += rew
            obs_ = obs_.reshape(-1, num_features)
            agent.store_transition(obs, action, rew, obs_)
            agent.train_model()
            if FLAGS.display:
                env.render()
            # print("x :", obs_[10], "y :", obs_[16])
            if step < 80:
                continue
            # if step > 160:
            #     break
            # delta_obs = obs - obs_
            obs = obs_
            # print('delta_obs', delta_obs, 'rew', rew, 'done', done, 'info', info)
            # time.sleep(0.1)
            if done or info['ale.lives'] < 3:
                print(eps, "th episode with reward ", score)
                score_list.append(score)
                np.save(FLAGS.path_name + 'dqn_mspacma.npy', score_list)
                break
        if np.mean(score_list[-10:]) > 1000:
            agent.save_model(FLAGS.path_name + 'dqn_mspacman.h5')
            break
def __init__(self, host, game, player, max_steps=4, verbose=False):
    self._client = mqtt.Client()
    self._game = game
    self._player = player
    self._verbose = verbose
    self._agent = DQNAgent(game, True)
    self._repeated_actions = 0
    self._steps = 0
    self._max_actions = max_steps
    self._version = 'v0.1.2'
    self._client.connect(host)
    self._client.subscribe('Ctrl/Zumo/#', 0)
    self._client.on_message = self.on_message
def train(self, show=True, print_scores=True, n_generations=100, update_epsilon=True,
          model_path='', random_state=False, max_value=30):
    self.n_generations = n_generations
    for i in range(1, n_generations + 1):
        game = Game(show=show, max_value=max_value, batch_size=16)
        game.add_player(0, random_state=random_state)
        if self.mode == "naive":
            agent = NaiveQAgent(n_moves, n_states, self.model, self.epsilon,
                                self.alpha, self.gamma)
        elif self.mode == "dqn":
            agent = DQNAgent(n_moves, n_states, self.model, self.epsilon,
                             self.alpha, self.gamma)
        else:
            print("Invalid mode")
            return
        agent.max_value = max_value
        game.players[0].set_agent(agent)
        game.players[0].age = 20
        score = game.run()
        if update_epsilon:
            self.updateEpsilon()
        if print_scores:
            print("Score at the %s-th iteration : %s" % (i, score))
        else:
            if not i % 50:
                print("%s-th iteration" % i)
        if model_path and not i % 50:
            self.save_model(model_path)
        self.scores.append(score)
def main():
    network = DuelingCNN(C.FILENAME, [5], (11, 11, 3), C.NUM_ACTIONS, C.Q_LEARNING_RATE)
    target = network.copy()
    replay = PrioritizedReplayBuffer(C.REPLAY_CAPACITY, alpha=C.ALPHA)
    agt = DQNAgent(network, replay)
    agent_list = [agt, DullAgent(), DullAgent(), DullAgent()]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)
    total_time = 0
    for i_episode in range(1000000):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            total_time += 1
            if total_time % C.UPDATE_RATE == 0:
                target = network.copy()
            if i_episode > 3:
                train(network, target, replay)
        print('Episode {} finished'.format(i_episode))
        if i_episode % 20 == 0:
            network.save()
    env.close()
def train():
    agent = DQNAgent(state_size, action_size)  # initialise agent
    # choice = raw_input("Name weight: ")
    # filename = output_dir + "weights_" + choice + ".hdf5"
    # agent.load(filename)
    batch_size = 32
    n_episodes = 10001  # n games we want agent to play (default 1001)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    done = False
    for e in range(n_episodes):  # iterate over new episodes of the game
        state = env.reset()  # reset state at start of each new episode of the game
        state = np.reshape(state, [1, state_size])
        score = 0
        # time represents a frame of the game; the goal is to keep the pole upright
        # for as long as possible up to the range limit, e.g. 500 or 5000 timesteps
        for time in range(100):
            # env.render()
            # action is either 0 or 1 (move cart left or right); decide on one here
            action = agent.act(state)
            # agent interacts with env, gets feedback; 4 state data points,
            # e.g. pole angle, cart position
            next_state, reward, done, _ = env.step(action)
            # reward = reward if not done else -1000  # reward +1 per frame with pole upright
            next_state = np.reshape(next_state, [1, state_size])
            # remember the previous timestep's state, action, reward, etc.
            agent.remember(state, action, reward, next_state, done)
            state = next_state  # set "current state" for upcoming iteration to the next state
            score = score + reward
            if done:  # episode ends if agent drops pole or we reach the step limit
                # print the episode's score and agent's epsilon
                print("episode: {}/{}, score: {}, e: {:.2}, time: {}, x: {:.2}"
                      .format(e, n_episodes, score, agent.epsilon, time, state[0, 0]))
                break  # exit loop
        if len(agent.memory) > batch_size:
            # train the agent by replaying the experiences of the episode
            agent.replay(batch_size)
        if e % 500 == 0:
            agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
    # print(env.get_traj())
    env.plot_traj()
def main():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    set_global_seed(0)
    env = rlcard.make('limit-holdem', config={'record_action': True})
    human_agent = HumanAgent(env.action_num)
    dqn_agent = DQNAgent(env.action_num, env.state_shape[0],
                         hidden_neurons=[1024, 512, 1024, 512])
    dqn_agent.load(sys.argv[1])
    env.set_agents([human_agent, dqn_agent])
    play(env)
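A sketch of what the play() helper called above might look like, assuming rlcard's Env.run(is_training=False) interface; the loop structure and prompts below are assumptions, not the original implementation.

def play(env):
    while True:
        # Run one complete hand; payoffs[0] corresponds to the human seat set above.
        trajectories, payoffs = env.run(is_training=False)
        print("Your payoff:", payoffs[0])
        if input("Play again? (y/n) ").strip().lower() != "y":
            break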
def main():
    env = Game()
    env.start()
    agent = DQNAgent(env)
    MAX_EPISODES = 500
    MAX_STEPS = 5000
    BATCH_SIZE = 32
    episode_rewards = mini_batch_train(env, agent, MAX_EPISODES, MAX_STEPS, BATCH_SIZE)
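A generic sketch of what a mini_batch_train helper of this shape commonly does; the agent method names (get_action, update_buffer, update) are assumptions borrowed from the torch-based snippet later in this collection, not a confirmed API for this particular agent.

def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
    episode_rewards = []
    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done = env.step(action)
            agent.update_buffer(state, action, reward, next_state, done)
            agent.update(batch_size=batch_size)  # one learning step per env step (assumed)
            episode_reward += reward
            state = next_state
            if done:
                break
        episode_rewards.append(episode_reward)
    return episode_rewards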
def run(environment, model_name, key=None):
    tdir = tempfile.mkdtemp()
    env = gym.make(environment)
    env = gym.wrappers.Monitor(env, tdir, force=True)
    agent = DQNAgent(env, trained_model=model_name)
    EPISODES = 100
    for episode in range(EPISODES):
        state, reward, done = env.reset(), 0.0, False
        action = agent.action(state, reward, done, episode, training=False)
        while not done:
            # env.render()
            next_state, reward, done, _ = env.step(action)
            state = next_state
            action = agent.action(state, reward, done, episode, training=False)
    env.close()
    if key:
        gym.upload(tdir, api_key=key)
    shutil.rmtree(tdir)
def play():
    env = gym.make('MsPacman-v0')
    agent = DQNAgent(LEARNING_RATE, IMG_ROWS, IMG_COLS, IMG_CHANNELS, INITIALIZE_STDDEV)
    print("Now we load weight")
    agent.model.load_weights(WEIGHT_PATH + "model.h5")
    print("Weight load successfully")

    # Skip the first frames of the intro animation.
    step = 0
    x_t = env.reset()
    while step < 80:
        env.render()
        env.step(0)
        step += 1

    loss = 0
    total_reward = 0
    epsilon = INITIAL_EPSILON
    env.render()
    x_t, _, _, _ = env.step(0)
    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t, (IMG_ROWS, IMG_COLS), mode='constant')
    x_t = skimage.exposure.rescale_intensity(x_t, out_range=(0, 255))
    # Stack the same frame four times to build the initial state.
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape((1, s_t.shape[0], s_t.shape[1], s_t.shape[2]))

    for step in range(MAX_STEPS):
        env.render()
        # Choose the greedy action from the network's Q-value prediction.
        a_t = np.zeros([ACTIONS])
        q = agent.model.predict(s_t)
        print("TIMESTEP", step, "/ ACTION_PREDICTION", q)
        action_index = np.argmax(q)
        a_t[action_index] = 1
        # Run the selected action and observe the next state and reward.
        x_t1_colored, r_t, terminal, info = env.step(action_index)
        total_reward += r_t
        x_t1 = process_image(x_t1_colored)
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
        s_t = s_t1
        # print info
        print("TIMESTEP", step, "/ ACTION", action_index, "/ REWARD", r_t,
              "/ Loss ", loss, "/ EPSILON", epsilon)
        if terminal:
            break

    print("Game ended, Total rewards: " + str(total_reward))
def main(args):
    env = gym.make(args.env)
    writer = SummaryWriter(comment="CartPole-v0-DQN")
    totalReward = []
    actionDim = env.action_space.n
    stateDim = env.observation_space.shape[0]
    hiddenDim = args.hiddenDim
    buffer = UniformReplayBuffer(args.maxCapacity, env.observation_space.shape,
                                 np.float32, np.long)
    dqnAgent = DQNAgent(buffer, stateDim, actionDim, hiddenDim, args)
    stepCounter = 0
    epsilon = args.epsStart
    for e in range(args.numberOfEpisode):
        state = env.reset()
        episodeReward = 0
        done = False
        while not done:
            stepCounter += 1
            action = dqnAgent.GetAction(state, epsilon)
            nextState, reward, done, _ = env.step(action)
            buffer.push_transition(Transition(state, action, reward, nextState, done))
            episodeReward += reward
            if stepCounter > 2 * args.batchSize:
                dqnAgent.Update(stepCounter)
                epsilon = max(epsilon * args.epsDecay, args.epsStop)
            state = nextState
        totalReward.append(episodeReward)
        meanReward = float(np.mean(totalReward[-100:]))
        writer.add_scalar("episodeReward", episodeReward, stepCounter)
        writer.add_scalar("meanReward", meanReward, stepCounter)
        writer.add_scalar("epsilon", epsilon, stepCounter)
        writer.add_scalar("episodes", e, stepCounter)
        print("Eps:{} Steps:{} Mean Reward: {} Episode Reward: {} Epsilon: {}".format(
            e, stepCounter, meanReward, episodeReward, epsilon))
def main():
    game.set_mode('manual')
    game.setup(skip_setup)
    if debug:
        game.play()
    if mqtt:
        server = Server("127.0.0.1", game, player, verbose=True)
        server.play(games_total)
    else:
        game.set_mode('random')
        agent = DQNAgent(game, skip_training)
        agent.train(games_start, games_total)
        agent.validate(games_total, validation_games, validation_max_steps)
        agent.play(games_total, game_max_steps)
def main():
    np.set_printoptions(suppress=True, formatter={'float_kind': '{:0.2f}'.format})
    env_fns = [make_env('MountainCar-v0', i) for i in range(4)]
    env = SyncVectorEnv(env_fns)
    try:
        state_size = env.observation_space.shape[1]
        action_size = env.action_space[0].n
        NUM_EPISODES = 1000
        STEPS_PER_EPISODE = 200
        batch_size = 32
        eps_mean_reward = [0.0] * NUM_EPISODES
        agent = DQNAgent(state_size, action_size)
        start_time = datetime.now()
        for ep_count in range(NUM_EPISODES):
            episode_rew = 0
            state = env.reset()
            if ep_count == 0:
                print("ep={} state.shape={}".format(ep_count, state.shape))
            # state = np.reshape(state, [-1, state_size])
            ep_start_time = datetime.now()
            for time in range(STEPS_PER_EPISODE):
                # env.render()
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                episode_rew += np.sum(reward)
                # next_state = np.reshape(next_state, [-1, state_size])
                if time == 0:
                    print("ep={} time={} action.len={} next_state.shape={} elaps_time={}".format(
                        ep_count, time, len(action), next_state.shape,
                        (datetime.now() - ep_start_time)))
                # Add each sub-environment transition to the DQN buffer.
                for idx in range(0, env.num_envs):
                    agent.memorize(state[idx], action[idx], reward[idx],
                                   next_state[idx], done[idx])
                state = next_state
                if time >= STEPS_PER_EPISODE - 1:
                    eps_mean_reward[ep_count] = np.mean(episode_rew) / time
                    print("ep: {}/{}, mean_avg_reward: {}, exec_time= {}".format(
                        ep_count, NUM_EPISODES, eps_mean_reward[ep_count],
                        (datetime.now() - ep_start_time)))
                # Update the DQN model if there are enough samples.
                if len(agent.memory) > batch_size and time % 8 == 0:
                    agent.replay(batch_size)
            # if ep_count % 2 == 0:
            #     agent.save(str(os.path.join(save_path, 'ma-foraging-dqn.h5')))
        print("Finish train DQN Agent with {} episodes in {}".format(
            NUM_EPISODES, (datetime.now() - start_time)))
    finally:
        env.close()
class ReinforcedTablicPlayer(TablicPlayer):
    def __init__(self, gamma):
        self.agent = DQNAgent(gamma)

    def load_model(self, model_path):
        self.agent = torch.load(model_path)

    def save_model(self, model_path):
        torch.save(self.agent, model_path)

    @classmethod
    def take_to_state_action(cls, state_vector, played_card, take):
        take_vector = Tablic.get_take_vector(played_card, take)
        result = np.concatenate((take_vector, state_vector))
        return torch.from_numpy(result).type(torch.cuda.FloatTensor)

    @classmethod
    def get_valid_state_actions(cls, game):
        hand = game.get_hand(game.current_player)
        observation = game.get_observation_vector(game.current_player)
        valid_takes = list(Tablic.get_valid_takes(game.table, hand))
        valid_state_actions = torch.zeros([len(valid_takes), 80]).type(torch.cuda.FloatTensor)
        for ind, (played_card, take) in enumerate(valid_takes):
            valid_state_actions[ind] = cls.take_to_state_action(observation, played_card, take)
        return valid_takes, valid_state_actions

    def find_best_play_from_state_actions(self, takes, state_actions):
        with torch.no_grad():
            takes_value = self.agent.forward(state_actions)
            best_take_ind = torch.argmax(takes_value)
        return takes[best_take_ind]

    def get_random_play_from_state_actions(self, valid_takes, valid_state_actions):
        return random.choice(valid_takes)

    def find_best_play(self, game):
        return self.find_best_play_from_state_actions(*self.get_valid_state_actions(game))

    def get_random_play(self, game):
        return random.choice(list(Tablic.get_all_valid_takes(
            game.table, game.get_hand(game.current_player))))
def main():
    env = gym.make("FightingiceDataNoFrameskip-v0",
                   java_env_path="/home/rurito/lesson/ken/FTG4.50")
    # HACK: this should eventually be derived automatically from the action set.
    action_size = 56
    learning_rate = 0.1
    batch_size = 10
    episode = 3
    gamma = 0.1
    greedy_value = 0.3
    p2 = "MctsAi"
    env = Observer(env, p2)
    agent = DQNAgent(learning_rate, action_size, greedy_value)
    agent.model.load_model('param.hdf5')
    # agent = RoleBaseAgent()
    trainer = Trainer(env, agent)
    trainer.train(episode, batch_size, gamma)
def train(environment, model_name=None, key=None):
    tdir = tempfile.mkdtemp()
    env = gym.make(environment)
    env = gym.wrappers.Monitor(env, tdir, force=True)
    agent = DQNAgent(env)
    EPISODES = 5000
    for episode in range(EPISODES):
        state, reward, done = env.reset(), 0.0, False
        action = agent.action(state, reward, done, episode)
        while not done:
            # env.render()
            next_state, reward, done, _ = env.step(action)
            agent.store(state, action, reward, next_state, done)
            state = next_state
            action = agent.action(state, reward, done, episode)
        if model_name and (episode == EPISODES - 1 or episode % 10 == 0):
            agent.save_model(filename=model_name)
    env.close()
    if key:
        gym.upload(tdir, api_key=key)
    shutil.rmtree(tdir)
def replay():
    if len(DQNAgent.memory) >= SAMPLE_SIZE:
        print("----------------------------------------")
        print("---> Starting experience replay...")
        start_time = time.time()
        losses = []
        accuracies = []
        # DQNAgent.replay()
        for i in range(0, 100):
            loss, acc = DQNAgent.replay()
            losses.append(sum(loss) / len(loss))
            accuracies.append(sum(acc) / len(acc))
            # print(i, '-', 'Loss:', (sum(loss) / len(loss)), 'Accuracy:', (sum(acc) / len(acc)))
        # losses, accuracies = DQNAgent.replay()
        print('--->', 'Loss:', (sum(losses) / len(losses)),
              'Accuracy:', (sum(accuracies) / len(accuracies)))
        # print(DQNAgent.epsilon)
        elapsed_time = round(time.time() - start_time, 2)
        print("---> Experience replay took: ", elapsed_time, " seconds")
        # print("----------------------------------------")
    return 'ok'
def main():
    env = Game()
    env.start()
    agent = DQNAgent(env)
    state = torch.from_numpy(np.zeros((4, 160, 240)))
    i = 0
    MAX_EPISODES = 20
    episodes = []
    scores = []
    # episodes = np.arange(500)
    # scores = np.random.randn(1, 500)
    while i <= MAX_EPISODES:
        action = agent.get_action(state)
        observation, reward, done = env.step(action)
        prev_state = state
        state = torch.cat((state[1:], torch.from_numpy(np.array([observation]))), axis=0)
        agent.update_buffer(prev_state, action, reward, state, done)
        if done:
            i += 1
            episodes.append(i)
            scores.append(env.result)
            agent.update(batch_size=20)
            env.reset()
    print('Episodes')
    print(episodes)
    print('Scores:')
    print(scores)
    plt.scatter(episodes, scores, s=1)
    plt.xlabel('Episodes')
    plt.ylabel('Score')
    plt.title('Deep Q-Learning Agent')
    plt.savefig('score-500-episodes.png')
def main():
    parser = argparse.ArgumentParser('Train or Evaluate a DQN Agent for OpenAI '
                                     'Gym Atari Environments')
    parser.add_argument('--env', '-e', default=ENV_NAME)
    parser.add_argument('--evaluate', action='store_true', default=False)
    parser.add_argument('--load_weights', '-l', default=None)
    parser.add_argument('--render', '-r', action='store_true', default=False)
    args = parser.parse_args()
    env_name = args.env
    weights_to_load = args.load_weights
    evaluate = args.evaluate
    render = args.render
    env = gym.make(env_name)
    model = ConvModel(env,
                      learning_rate=2.5e-4,
                      momentum=0.95,
                      gamma=0.99,
                      tau=0.01,
                      soft_updates=True,
                      weights_to_load=weights_to_load,
                      grayscale=False,
                      window_size=8)
    agent = DQNAgent(env, model,
                     linear_epsilon_decay=True,
                     epsilon_decay_steps=3.e6,
                     epsilon=1.0,
                     min_epsilon=0.06,
                     exp_buffer_size=1000000,
                     batch_size=256,
                     render=render,
                     update_freq=1,
                     random_starts=30,
                     max_steps=10000)
    if evaluate:
        agent.evaluate()
    else:
        agent.train()
from GlobalVariables import GlobalVariables
from DQNAgent import DQNAgent
# Assumed imports: the original snippet uses Extract_Features and Environment
# without showing where they come from.
from Extract_Features import Extract_Features
from Environment import Environment
import numpy as np
import matplotlib.pyplot as plt
import pylab
import sys

Extract = Extract_Features
options = GlobalVariables  # To access global variables from GlobalVariables.py
parameter = GlobalVariables  # To access parameters from GlobalVariables.py
samples = Extract_Features  # To access the member functions of the Extract_Features class
grid_size = GlobalVariables  # To access the size of the grid from GlobalVariables.py

env = Environment(grid_size.nRow, grid_size.nCol)
agent = DQNAgent(env)
list = []

for i in range(1, parameter.how_many_times + 1):
    print("************************************************************************************")
    print("Iteration", i)
    Number_of_Iterations = []
    Number_of_Episodes = []
    reward_List = []
    filename = str(grid_size.nRow) + "X" + str(grid_size.nCol) + "_Experiment.txt"
    for episode in range(1, parameter.Number_of_episodes + 1):
        # file = open(filename, 'a')
        # done = False
from Scenario import Scenario
from DQNAgent import DQNAgent
import numpy as np

EPISODES = 2000

if __name__ == "__main__":
    env = Scenario()
    state_size = env.state_size
    action_size = env.action_size
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 32
    for e in range(EPISODES):
        state = env.reset()
        print("Size: " + str(state.shape))
        state = np.reshape(state, [1, state_size])
        time = 0
        done = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                agent.update_target_model()
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, agent.epsilon))
                break
            time = time + 1
    esp = 0.01
else:
    esp = 1

robot = simulated_1D_robot(goalie_pos_start=3,
                           GRID_NUM_HEIGHT=num_grid_y,
                           GRID_NUM_WIDTH=num_grid_x,
                           GRID_SIZE=10,
                           draw_scene=draw_scene,
                           gridworld=True)
agent = DQNAgent(
    # state_size=num_grid_y * num_grid_x,
    state_size=2,
    action_size=3,
    gamma=0.95,
    epsilon=esp,
    epsilon_min=0.01,
    epsilon_decay=0.995,
    learning_rate=0.001,
    model_type='DeepModel')
EPOCHS = 5000
if not draw_scene:
    UPDATE_FREQ = 1000000
else:
    UPDATE_FREQ = 1000000
batch_size = 32
start = time.time()
done = False
        # Get the screens:
        s = []
        while len(s) < 4:
            s.append(self.get_screen(self.driver))
        # Get our images normalized
        s = self.get_normalized_input(s)
        # Return the values
        return s, self.get_distance_ran(self.driver), self.get_game_over(self.driver)


if __name__ == '__main__':
    env = DinoEnvironment()
    agent = DQNAgent((150, 300), action_size=3, memory_size=5000)
    max_score = 0
    for i in range(100):
        state, score, done = env.reset()
        print("state shape: {}".format(state.shape))
        a = 0
        while env.get_game_over() is False:
            action = agent.act(state)
            next_state, score, done = env.act(env.action_space[action])
            reward = score if not done else -10
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            a += 1
from Desk import T9Desk
from DQNAgent import DQNAgent
import numpy as np

env = T9Desk("random_1", "Deep QN_2")
state_size = env.observation_space
action_size = env.action_space_size
agent = DQNAgent(state_size, action_size)
agent.load("T9-dqn.h5")

for i in range(100):
    state = env.reset(False)
    done = False
    # print(i)
    while not done:
        pr, op = env.who_moves_str
        if pr == 'p2':
            action_space = env.action_space[pr]
            action = action_space[np.random.randint(0, action_space.size)]
        else:
            action = agent.act(state) + 1
        next_state, reward, done, _ = env.step(action)
        env.render()

score_sum = env.win_count['p1'] + env.win_count['p2'] + env.win_count['draw']
    elif layer['type'] == 'flatten':
        layerStack = keras.layers.Flatten()(layerStack)

# Model's output
layerStack = keras.layers.Dense(
    env.action_space.n,
    activation='linear',
    kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.0))(layerStack)

# Initialize a new model
agent = DQNAgent(inputs, layerStack,
                 memSize=config['agent']['replayMemorySize'],
                 stackedStateLength=config['agent']['stackedStateLength'],
                 stateScaleFactor=config['agent']['stateScaleFactor'],
                 epsilonPolicy=epsilonPolicy,
                 optimizer=optimizer,
                 loss=config['model']['lossFunction'],
                 batchSize=config['model']['batchSize'],
                 modelName=config['model']['name'])

# If required, load the old model for further learning
if config['paths']['initialModelName'] != False:
    modelToLoad = os.path.join(config['paths']['savesDir'],
                               env.unwrapped.spec.id,
                               'models',
                               config['paths']['initialModelName'])
    agent.loadModel(modelToLoad)

# Load replay memory if needed
if config['paths']['initialReplayMemoryName'] != False:
    replaysToLoad = os.path.join(config['paths']['savesDir'],
                                 env.unwrapped.spec.id,
                                 'replays',