def __init__(self, width, height, rows, window, offx, offy, idx=""):
    """Set up one snake-game instance: grid geometry, snake, snack, and DQN agent.

    width, height: pixel size of this game's board.
    rows: number of grid rows (square cells assumed: sB = width // rows).
    window: shared drawing surface.
    offx, offy: board offset in multiples of width/height (multi-board layouts).
    idx: optional identifier for this instance.
    """
    self.SETTINGS = {}
    self.SETTINGS['w'] = width
    self.SETTINGS['h'] = height
    self.SETTINGS['r'] = rows
    # size of a single grid cell, in pixels
    self.SETTINGS['sB'] = width // rows
    # pixel offsets of this board inside the shared window
    self.SETTINGS['ox'] = offx * width
    self.SETTINGS['oy'] = offy * height
    self.idx = idx
    self.window = window
    # red snake starting at the center of the grid
    self.snake = Snake((255, 0, 0),
                       (self.SETTINGS['r'] // 2, self.SETTINGS['r'] // 2),
                       self.SETTINGS)
    # green snack at a random position
    self.snack = Cube(self.randomSnack(), self.SETTINGS, color=(0, 255, 0))
    self.dist = self.get_snack_distance()
    self.walls = self.get_wall_pos()
    # online and target networks sized from the observation vector; 4 actions
    self.model = Model(len(self.get_observation()), 4)
    self.tgt = Model(len(self.get_observation()), 4)
    self.agent = DQNAgent(self.model, self.tgt)
    # per-episode bookkeeping
    self.reward = 0.0
    self.setp_reward = 0.0  # NOTE(review): likely a typo for "step_reward" — confirm before renaming
    self.rewards = []
    self.finished = False
    self.points = 0
    self.points_ls = []
def main():
    """Replay stored memories for one episode and log the average Q statistics.

    Reads the episode index from ../files/episode.txt, replays that episode's
    memory 50 times (5 replays per round, syncing the target network each
    round), saves the model, and appends the Q averages to ../files/avg_Q.txt.
    If the module-level `forward` flag is truthy, advances the episode
    counter file by one.
    """
    # context managers so the files are closed even if a later step raises
    with open('../files/episode.txt') as epi_file:
        episode = epi_file.readline()
    episode = int(episode) - 1
    qagent = DQNAgent(episode - 1)
    qagent.load_memory_of_episode(episode)
    qys = []
    qds = []
    for k in range(50):
        for j in range(5):
            # for i in range(0,len(qagent.memory),qagent.batch_size):
            qy, qd = qagent.memory_replay()
        # NOTE(review): method name looks like a typo for "update_target_model";
        # confirm against the DQNAgent definition before renaming it here.
        qagent.update_targer_model()
        qys.append(qy)
        qds.append(qd)
    qagent.save_model(episode)
    res = time.strftime('%Y/%m/%d-%H:%M:%S', time.localtime(
        time.time())) + "Average of episode: %d Q_y: %f Q_d: %f" % (
        episode, np.mean(qys), np.mean(qds))
    with open('../files/avg_Q.txt', 'a') as avg_file:
        avg_file.write(res + '\n')
    # NOTE(review): `forward` is not defined in this function — presumably a
    # module-level flag; verify it exists before relying on this branch.
    if forward:
        with open('../files/episode.txt', 'w') as epi_file:
            epi_file.write(str(episode + 2))
def main():
    """Train a DQN on CartPole-v0, then play one rendered episode and
    print its total reward."""
    environment = gym.make('CartPole-v0')
    network = DQN(environment.observation_space.shape[0],
                  environment.action_space.n)
    if torch.cuda.is_available():
        network = network.cuda()
    # RMSprop with the classic DQN hyperparameters
    rmsprop = optim.RMSprop(network.parameters(), lr=0.00025,
                            momentum=0.95, alpha=0.95, eps=0.01)
    schedule = get_epsilon_schedule(start=1.0, end=0.01, endt=1000,
                                    learn_start=50)
    buffer = ReplayBuffer(capacity=1000)
    agent = DQNAgent(environment, network, rmsprop, schedule, buffer,
                     discount_factor=0.99, target_update_rate=10,
                     batch_size=32, learn_start=50)
    agent.train(5000)
    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
def __init__(self, kwargs):
    """Build training state from a single kwargs dict.

    Note: `kwargs` is a plain dict positional argument (not **kwargs); it is
    mutated in place and then every key is copied onto self via
    __dict__.update, so later attribute reads (mem_capacity, batch_sz, ...)
    come from the caller-supplied dict.
    """
    kwargs["env_cls"] = Atari
    # probe a throwaway env to discover observation/action spaces
    env = kwargs["env_cls"](kwargs["env_id"])
    kwargs["state_shape"] = env.observation_space.shape
    kwargs["state_dtype"] = np.uint8
    kwargs["n_actions"] = env.action_space.n
    kwargs["device"] = torch.device(kwargs["device_id"])
    env.close()
    # expose every kwarg as an attribute
    self.__dict__.update(kwargs)
    self.agent = DQNAgent(**kwargs)
    self.writer = SummaryWriter("./log/")
    # dedicated CUDA stream so evaluation work can overlap training
    self.cuda_eval = torch.cuda.Stream(self.device)
    mem_kwargs = dict(
        capacity=self.mem_capacity,
        history_len=self.history_len,
        state_shape=self.state_shape,
        state_dtype=self.state_dtype,
        batch_sz=self.batch_sz,
        alpha=self.mem_alpha,
        # anneal importance-sampling beta to 1.0 over the whole run
        beta=LinearScheduler(self.mem_beta, 1., self.train_steps),
        priority_eps=self.mem_priority_eps,
        priority_upper=self.mem_priority_upper,
        prioritized_replay=self.prioritized_replay,
        device=self.device,
    )
    mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory
    self.mem = mem_cls(**mem_kwargs)
    self.mem_lock = Lock()
    # one-token queue used as a handshake between producer/consumer threads
    self.sync = Queue(maxsize=1)
    self.sync.put(None)
def main(argv):
    """Parse CLI flags and run a DQN agent in train or test mode.

    Flags: -r enable training, -l/--loadckpt load a pretrained checkpoint,
    -s/--saveckpt file to save the trained network, -h usage help.
    Fix: converted Python 2 `print` statements to `print()` calls — the
    originals are syntax errors under Python 3, which the rest of this
    file (f-strings elsewhere) targets.
    """
    inputfile = None   # pretrained network to use
    train = False      # whether to train or to test
    outputfile = None  # trained network destination
    try:
        opts, args = getopt.getopt(argv, "hrl:s:", ["loadckpt=", "saveckpt="])
    except getopt.GetoptError:
        print('Incorrect usage. For more information: test.py -h')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('python test.py -r -l <ckptfile> -s <ckptfile>')
            print('-r for enabling training')
            print('-l for loading pre-existing model')
            print('-s for saving model to file')
            sys.exit()
        elif opt == '-r':
            train = True
        elif opt in ("-l", "--loadckpt"):
            inputfile = arg
        elif opt in ("-s", "--saveckpt"):
            outputfile = arg
    with tf.Session() as sess:
        env = Environment()
        agent = DQNAgent(env, sess, inputfile)
        if train:
            agent.train(6000000, outputfile)
        else:
            agent.test(2000)
def run(novis, env_dir, env_file, n_episodes, seed, prioritized, cpu):
    """Launch a Unity environment and run a DQN training report.

    novis: use the no-visualization build of the environment directory.
    """
    if novis:
        env_dir = "{}_NoVis".format(env_dir)
    env = UnityEnvironment(file_name="environments/{}/{}".format(env_dir, env_file))
    # get default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # number of agents in the environment
    # print('Number of agents:', len(env_info.agents))
    # number of actions
    action_size = brain.vector_action_space_size
    # print('Number of actions:', action_size)
    # examine the state space
    state = env_info.vector_observations[0]
    # print('States look like:', state)
    state_size = len(state)
    # print('States have length:', state_size)
    # NOTE(review): `dqn` below is not defined in this function — presumably a
    # module-level training routine passed to Report.run; confirm it exists.
    report = Report(DQNAgent(state_size=state_size,
                             action_size=action_size,
                             seed=seed,
                             prioritized=prioritized,
                             cpu=cpu)).run(dqn, env=env, brain_name=brain_name,
                                           n_episodes=n_episodes)
    print(report)
def purge_round():
    """Score each saved leader against the ensemble of all the others and
    move the lowest scorers to the graveyard directory.

    Keeps the top NUM_LEADERS checkpoints in LEADER_DIR; the rest are
    renamed into GRAVEYARD_DIR.
    Fix: converted Python 2 `print` statements to `print()` calls — the
    originals are syntax errors under Python 3.
    """
    candidate_leaders_map = {}  # {filename --> agent}
    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6, LinearSchedule(0.05, 0.05, 1), OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        # load weights onto CPU regardless of where they were saved
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidate_leaders_map[leader_checkpoint] = candidate_leader
    candidate_scores = []  # list[(filename, score)]
    filenames, candidate_leaders = zip(*candidate_leaders_map.items())
    for i, (filename, candidate_leader) in enumerate(
            zip(filenames, candidate_leaders)):
        print("EVALUATING {}".format(candidate_leader.name))
        # ensemble of every leader except the candidate being scored
        leaders = EnsembleDQNAgent(candidate_leaders[:i] +
                                   candidate_leaders[i + 1:])
        candidate_scores.append(
            (filename,
             evaluate(candidate_leader, leaders, EPISODES_EVALUATE_PURGE)))
    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)
    print("SCORES: {}".format(sorted_scores))
    # everything past the top NUM_LEADERS is purged
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print("PURGING ({}, {})".format(filename, score))
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)
def td_learning(args):
    """Run prioritized-replay TD learning for args.total_steps environment steps.

    Uses a uniformly random policy for the first args.start_learn steps to
    seed the buffer, then trains every args.train_freq steps on prioritized
    samples, refreshing priorities from the TD errors.
    NOTE(review): `out_fd` and `prioritized_replay_eps` are not defined here —
    presumably module-level names; confirm they exist at call time.
    """
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            # warm-up: uniformly random legal move
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())
        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        #####
        replay_memory.add(board, choice, reward, next_board)
        #####
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()
        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            # first training step uses a larger batch to jump-start learning
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn,
                                                  beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size,
                                                  beta=agent.beta)
            boards, choices, rewards, next_boards, weights, batch_idxes = experience
            td_errors = agent.train(
                (boards, choices, rewards, next_boards, weights))
            # refresh priorities with the new TD errors (plus epsilon floor)
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)
            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()
    eval_game(agent, 500)
    out_fd.close()
def test_target_model():
    """Sanity-check that the policy and target networks agree on one board."""
    agent = DQNAgent()
    agent.load('models/model.h5')
    # single 6x7 board with one piece placed at row 5, column 3
    board = np.zeros([6, 7])
    board[5][3] = 1
    batch = board.reshape(1, 6, 7, 1)
    policy_out = agent.policy_model.predict(batch)
    target_out = agent.target_model.predict(batch)
    print(policy_out)
    print(target_out)
    if not np.array_equal(policy_out, target_out):
        print('FAIL')
def __init__(self, env_creator, device, buffer_size, save_dir,
             timesteps_per_epoch=1, batch_size=32, total_steps=5 * 10 ** 5,
             decay_rate=0.1, init_epsilon=1, final_epsilon=0.02, loss_freq=50,
             refresh_target_network_freq=500, eval_freq=500, max_grad_norm=50):
    """Trainer configuration: build the env, online/target networks, logging.

    env_creator: zero-argument factory returning a fresh environment.
    decay_rate: fraction of total_steps over which epsilon is annealed
        (decay_steps = decay_rate * total_steps).
    loss_freq / refresh_target_network_freq / eval_freq: periods in steps
        for loss logging, target-network sync, and evaluation.
    """
    self.env_creator = env_creator
    self.env = env_creator()
    n_actions = self.env.action_space.n
    state_shape = self.env.observation_space.shape
    self.save_dir = save_dir
    self.buffer_size = buffer_size
    self.timesteps_per_epoch = timesteps_per_epoch
    self.batch_size = batch_size
    self.total_steps = total_steps
    self.decay_steps = decay_rate * total_steps
    self.init_epsilon = init_epsilon
    self.final_epsilon = final_epsilon
    self.loss_freq = loss_freq
    self.refresh_target_network_freq = refresh_target_network_freq
    self.eval_freq = eval_freq
    self.max_grad_norm = max_grad_norm
    self.device = device
    self.writer = SummaryWriter('runs')
    # online and target networks start with identical weights
    self.agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
    self.target_network = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
    self.target_network.load_state_dict(self.agent.state_dict())
def dqn_run(episodes=2500, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
            double_dqn=False, dueling_dqn=False, seed=42):
    """Train a DQN agent in a Unity-style environment with epsilon-greedy decay.

    Returns the list of per-episode scores. Saves checkpoint.pth and stops
    early once the 100-episode rolling average score exceeds 13.0.
    """
    env = start_env()
    env_info = reset_env_info(env)
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)
    print('Seed used:', seed)
    agent = DQNAgent(state_size, action_size, double_dqn, dueling_dqn, seed)
    scores = []
    scores_window = deque(maxlen=100)  # last 100 scores for the rolling mean
    eps = eps_start
    for episode in range(1, episodes + 1):
        env_info = reset_env_info(env)
        score = 0.0
        done = False
        while not done:
            state = env_info.vector_observations[0]
            action = agent.act(state, epsilon=eps)
            env_info = env_step(env, action)
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward
        scores_window.append(score)
        scores.append(score)
        # multiplicative decay, floored at eps_end
        eps = max(eps * eps_decay, eps_end)
        print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
            episode, episodes, np.mean(scores_window), eps), end=' ')
        if episode % 100 == 0:
            print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.
                  format(episode, episodes, np.mean(scores_window), eps))
        if np.mean(scores_window) > 13.0:
            print(
                '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                .format(episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break
    env.close()
    return scores
def test_model(filename):
    """Render one greedy CartPole-v1 episode (at most 1000 steps) using a
    model loaded from `filename`."""
    environment = gym.make("CartPole-v1")
    agent = DQNAgent(4, 2)
    agent.load_model(filename)
    observation = environment.reset()
    steps_left = 1000
    while steps_left > 0:
        environment.render()
        observation, _, finished, _ = environment.step(
            agent.act(observation, explore=False))
        if finished:
            break
        steps_left -= 1
    environment.close()
def __init__(self):
    """Initialize the Racing AI window, clock, car, agent, and a periodic
    user event used to step the agent."""
    pygame.init()
    self.window = pygame.display.set_mode((500, 800))
    pygame.display.set_caption("Racing AI")
    self.clock = pygame.time.Clock()
    self.execute = True
    self.car = Car(250, 650, self.window)
    self.agent = DQNAgent(inputs=4, n_actions=2)
    self.episode_durations = []
    # custom pygame event fired every update_timer milliseconds
    self.update_agent = pygame.USEREVENT + 1
    update_timer = 100
    pygame.time.set_timer(self.update_agent, update_timer)
def eval():
    """Play one Tetris game greedily (epsilon=0) with a loaded checkpoint,
    rendering every move.

    NOTE(review): batch_size and epochs are defined but unused here —
    presumably copied from the training configuration.
    NOTE(review): shadows the `eval` builtin; consider renaming later.
    """
    env = Tetris()
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    replay_start_size = 2000
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']
    # train=False and epsilon=0: pure exploitation, no learning
    agent = DQNAgent(env.get_state_size(), n_neurons=n_neurons,
                     activations=activations, epsilon=0,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size, discount=discount,
                     replay_start_size=replay_start_size, train=False)
    agent.load("ckpts/591_model.ckpt")
    current_state = env.reset()
    done = False
    steps = 0
    # Game
    while not done and (not max_steps or steps < max_steps):
        next_states = env.get_next_states()
        best_state = agent.best_state(next_states.values())
        best_action = None
        # recover which action produces the state the agent preferred
        for action, state in next_states.items():
            if state == best_state:
                best_action = action
                break
        reward, done = env.play(best_action[0], best_action[1], render=True,
                                render_delay=render_delay)
        agent.add_to_memory(current_state, next_states[best_action], reward,
                            done)
        current_state = next_states[best_action]
        steps += 1
def main():
    """Build a CartPole model and train a DQN or DDQN agent on it.

    Relies on module-level names: agent_type, create_model,
    observation_transformation, reward_transformation.
    """
    print("Creating model...")
    model = create_model()
    model.summary()
    print("Creating environment...")
    environment = gym.make("CartPole-v0")
    # lift the default episode-step cap so episodes can run to 500 steps
    environment._max_episode_steps = 500
    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="cartpole-dqn",
                         model=model,
                         environment=environment,
                         observation_frames=1,
                         observation_transformation=observation_transformation,
                         reward_transformation=reward_transformation,
                         gamma=0.95,
                         final_epsilon=0.01,
                         initial_epsilon=1.0,
                         number_of_iterations=1000000,
                         replay_memory_size=2000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="cartpole-ddqn",
            model=model,
            environment=environment,
            observation_frames=1,
            observation_transformation=observation_transformation,
            reward_transformation=reward_transformation,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            number_of_iterations=1000000,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100)
    # enable tracking/saving hooks before training starts
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=10000)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=100000)
    agent.enable_tensorboard_for_tracking()
    print("Training ...")
    agent.fit(verbose=True, headless="render" not in sys.argv)
def main():
    """Create the Tetris-v0 gym environment and train a DQN/DDQN agent.

    Relies on module-level names: agent_type, number_of_actions, utils,
    modelutils.
    """
    print("Creating environment...")
    environment = gym_tetris.make('Tetris-v0')
    print("Creating model...")
    model = modelutils.create_model(number_of_actions)
    model.summary()
    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(
            name="tetris-dqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32
        )
    elif agent_type == "ddqn":
        agent = DDQNAgent(
            name="tetris-ddqn",
            environment=environment,
            model=model,
            observation_transformation=utils.resize_and_bgr2gray,
            observation_frames=4,
            number_of_iterations=1000000,
            gamma=0.95,
            final_epsilon=0.01,
            initial_epsilon=1.0,
            replay_memory_size=2000,
            minibatch_size=32,
            model_copy_interval=100
        )
    # enable tracking/saving hooks before training starts
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=100)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)
    print("Training ...")
    agent.fit(verbose=True, headless="headless" in sys.argv, render_states=True)
def train(model_path='models/model.h5', opponent_policy=random_choice,
          num_episodes=1000, agent_params={}, **kwargs):
    """Train a DQN agent against a fixed opponent policy, plotting live stats.

    Saves the model every 100 episodes and once more at the end, then dumps
    the collected statistics to a timestamped JSON file.
    NOTE(review): agent_params={} is a mutable default argument — harmless
    if never mutated, but worth replacing with None in a future change.
    """
    stats = statistics.default_stats()
    plt_data = statistics.plot_stats(stats, data=None)
    agent = DQNAgent(**agent_params)
    for episode in range(num_episodes):
        print('Episode {}/{}'.format(episode, num_episodes))
        env = Environment(opponent_policy=opponent_policy,
                          agent_color=board.RED, agent_first_turn=True)
        done = False
        episode_length = 0
        while not done:
            state = env.get_state()
            action = agent.act_epsilon_greedy(state)
            next_state, reward, event = env.step(action)
            # any event other than "still in game" ends the episode
            done = event != board.EVENT_IN_GAME
            agent.remember(state, action, reward, next_state, done)
            agent.replay(stats=stats)
            if event == board.EVENT_WIN:
                print('Won Game!')
            episode_length += 1
        stats['episode_results'].append(event)
        stats['episode_lengths'].append(episode_length)
        plt_data = statistics.plot_stats(stats, data=plt_data)
        plt.pause(0.0001)
        # periodic checkpoint
        if episode % 100 == 0:
            agent.save(model_path)
    agent.save(model_path)
    saved_params = {'agent_params': agent_params, 'num_episodes': num_episodes}
    statistics.save_stats(
        stats, saved_params,
        "stats/stats-{}.json".format(time.strftime("%Y%m%d-%H%M%S")))
    statistics.plot_stats(stats, data=plt_data)
    plt.show()
def main():
    """Build the Doom basic-scenario game and train a DQN/DDQN agent on it.

    Relies on module-level names: agent_type, modelutils, train.
    """
    print("Creating model...")
    model = modelutils.create_model(number_of_actions=4)
    model.summary()
    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="doom-dqn",
                         model=model,
                         number_of_actions=4,
                         gamma=0.99,
                         final_epsilon=0.0001,
                         initial_epsilon=0.1,
                         number_of_iterations=200000,
                         replay_memory_size=10000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(name="doom-ddqn",
                          model=model,
                          number_of_actions=4,
                          gamma=0.99,
                          final_epsilon=0.0001,
                          initial_epsilon=0.1,
                          number_of_iterations=200000,
                          replay_memory_size=10000,
                          minibatch_size=32,
                          model_copy_interval=100)
    # enable tracking/saving hooks before training starts
    agent.enable_rewards_tracking(rewards_running_means_length=1000)
    agent.enable_episodes_tracking(episodes_running_means_length=1000)
    agent.enable_maxq_tracking(maxq_running_means_length=1000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)
    print("Creating game...")
    #environment = Environment(headless=("headless" in sys.argv))
    # Create an instance of the Doom game.
    environment = DoomGame()
    environment.load_config("scenarios/basic.cfg")
    environment.set_screen_format(ScreenFormat.GRAY8)
    environment.set_window_visible("headless" not in sys.argv)
    environment.init()
    print("Training ...")
    train(agent, environment, verbose="verbose" in sys.argv)
def DQN(episodes, epsilon, epsilonDeca):
    """Run epsilon-greedy DQN training for `episodes` episodes.

    NOTE(review): the epsilonDeca parameter is never used — decay relies on
    the module-level EPSILON_DECAY / MIN_EPSILON constants instead; confirm
    which is intended.
    """
    env = Env()
    agent = DQNAgent()
    #window=pygame.display.set_mode((windowWidth,windowHeight))
    episodeRewards = []
    for episode in range(episodes):
        episode_reward = 0
        step = 1
        current_state = env.reset()
        done = False
        while not done:
            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table
                action = np.argmax(agent.getQs(np.array(current_state)))
            else:
                # Get random action
                action = np.random.randint(0, env.ACTION_SPACE_SIZE)
            new_state, reward, done = env.step(action)
            episode_reward += reward
            #drawWindow(window,[env.blob,env.enemyBlob],[env.ball],env.wall)
            # Every step we update replay memory and train main network
            agent.updateReplyMemory(
                (current_state, action, reward, new_state, done))
            agent.train(done, step)
            current_state = new_state
            step += 1
        episodeRewards.append(episode_reward)
        # periodic progress report every 10 episodes
        if episode % 10 == 0:
            averageReward = sum(episodeRewards) / len(episodeRewards)
            minReward = min(episodeRewards)
            maxReward = max(episodeRewards)
            print(
                f"replayMemo:{len(agent.replayMemory)} avg:{averageReward} \n min:{minReward} \n max:{maxReward} "
            )
        # decay exploration until the floor is reached
        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)
    pygame.quit()
def advise():
    """Flask endpoint: recommend buy/hold/sell for three holdings.

    Reads share counts n1..n3 and cash from the POSTed form, feeds the
    scaled state to the loaded DQN, and maps the raw per-asset action to a
    sanity-checked recommendation. Relies on module-level names:
    state_size, action_size, env, action_combo, action_map, get_scaler.
    """
    n1 = float(request.form['n1'])
    n2 = float(request.form['n2'])
    n3 = float(request.form['n3'])
    cash = float(request.form['cash'])
    print(n1)
    print(cash)
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    agent.load("202005011635-dqn.h5")
    state = env.reset()
    state[0] = n1
    state[1] = n2
    state[2] = n3
    state[-1] = cash
    state = scaler.transform([state])
    action = agent.act(state)
    # action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    action_vec = action_combo[action]
    # action_map = {0: "sell", 1: "hold", 2: "buy"}
    ans = []
    # Same sanity rule per holding (was three copy-pasted blocks):
    # - can't sell (0) a stock you hold none of -> hold (1)
    # - can't buy (2) with no cash -> hold (1)
    for shares, raw_action in zip((n1, n2, n3), action_vec):
        tmp = 1 if raw_action == 0 and shares == 0 else raw_action
        if cash == 0 and tmp == 2:
            tmp = 1
        ans.append(action_map[tmp])
    print(ans)
    return render_template('index.html', ans=ans, n1=n1, n2=n2, n3=n3,
                           cash=cash)
def td_learning(args):
    """Uniform-replay TD learning variant (plain deque buffer, no priorities).

    Mirrors the prioritized version elsewhere in this file: random policy
    for the first args.start_learn steps, then trains every
    args.train_freq steps.
    """
    agent = DQNAgent(args)
    replay_memory = deque(maxlen=args.MAX_REPLAY_MEMORY_SIZE)
    #eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            # warm-up: uniformly random legal move
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(
                board, game.gameboard.get_available_choices())
        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)
        replay_memory.append((board, choice, reward, next_board))
        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()
        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(
                    ave_score / count)
                #out_fd.write("{} {}\n".format(step, ave_score/count))
                outer.write(message)
                ave_score = 0
                count = 0
            # NOTE(review): nesting reconstructed from flattened source —
            # first training step trains on the whole buffer, later ones on
            # a random sample; confirm against the prioritized variant.
            if step == args.start_learn:
                if len(replay_memory) > 0:
                    agent.train(replay_memory)
            else:
                agent.train(random.sample(replay_memory, args.train_data_size))
            agent.update_target(args.soft_tau)
            agent.update_epsilon()
    eval_game(agent, 500)
def main():
    """Create the SuperMarioBros gym environment and train a DQN/DDQN agent.

    Relies on module-level names: agent_type, number_of_actions, actions,
    modelutils, train.
    """
    print("Creating model...")
    model = modelutils.create_model(number_of_actions)
    model.summary()
    print("Creating agent...")
    if agent_type == "dqn":
        agent = DQNAgent(name="supermario-dqn",
                         model=model,
                         number_of_actions=number_of_actions,
                         gamma=0.95,
                         final_epsilon=0.01,
                         initial_epsilon=1.0,
                         number_of_iterations=1000000,
                         replay_memory_size=2000,
                         minibatch_size=32)
    elif agent_type == "ddqn":
        agent = DDQNAgent(name="supermario-ddqn",
                          model=model,
                          number_of_actions=number_of_actions,
                          gamma=0.95,
                          final_epsilon=0.01,
                          initial_epsilon=1.0,
                          number_of_iterations=1000000,
                          replay_memory_size=2000,
                          minibatch_size=32,
                          model_copy_interval=100)
    # enable tracking/saving hooks before training starts
    agent.enable_rewards_tracking(rewards_running_means_length=10000)
    agent.enable_episodes_tracking(episodes_running_means_length=100)
    agent.enable_maxq_tracking(maxq_running_means_length=10000)
    agent.enable_model_saving(model_save_frequency=10000)
    agent.enable_plots_saving(plots_save_frequency=10000)
    print("Creating game...")
    environment = gym_super_mario_bros.make("SuperMarioBros-v0")
    # restrict the joypad to the discrete `actions` set
    environment = BinarySpaceToDiscreteSpaceEnv(environment, actions)
    print("Training ...")
    train(agent, environment, verbose="verbose" in sys.argv,
          headless="headless" in sys.argv)
def main(cfg: omegaconf.DictConfig):
    """Train a DQN agent on an Atari environment, recording video and
    TensorBoard stats.

    Saves a checkpoint whenever the mean reward over the last 40 games
    improves on the best seen so far.
    """
    # create the environment
    env = atari_wrappers.make_env(cfg.exp.env)
    env = gym.wrappers.Monitor(env, "recording/", force=True)
    obs = env.reset()
    # TensorBoard
    writer = SummaryWriter()
    writer.add_hparams(flatten_dict(cfg), {})
    logger.info('Hyperparams:', cfg)
    # create the agent
    agent = DQNAgent(env, device=cfg.train.device, summary_writer=writer,
                     cfg=cfg)
    n_games = 0
    max_mean_40_reward = -sys.maxsize
    # Play MAX_N_GAMES games
    while n_games < cfg.train.max_episodes:
        # act eps-greedily
        action = agent.act_eps_greedy(obs)
        # one step on the environment
        new_obs, reward, done, _ = env.step(action)
        # add the environment feedback to the agent
        agent.add_env_feedback(obs, action, new_obs, reward, done)
        # sample and optimize NB: the agent could wait to have enough memories
        agent.sample_and_optimize(cfg.train.batch_size)
        obs = new_obs
        if done:
            n_games += 1
            agent.print_info()
            agent.reset_stats()
            obs = env.reset()
            if agent.rewards:
                current_mean_40_reward = np.mean(agent.rewards[-40:])
                if current_mean_40_reward > max_mean_40_reward:
                    # BUG FIX: remember the new best mean. Previously the
                    # running max was never updated, so every episode beat
                    # the initial -sys.maxsize and the checkpoint was
                    # rewritten unconditionally.
                    max_mean_40_reward = current_mean_40_reward
                    agent.save_model(cfg.train.best_checkpoint)
    writer.close()
def main():
    """Train a DQN agent in the Carla driving environment.

    NOTE(review): steps with |reward| < 0.5 are skipped entirely via
    `continue` (no memory entry, no replay, no score update) — confirm this
    reward filtering is intentional.
    """
    env = gym.make('carla-v0')
    state_size = env.image_size_net_chans
    action_size = len(env.action_space)
    agent = DQNAgent(state_size, action_size)
    done = False
    batch_size = 10
    try:
        for episode in range(EPISODES):
            state = env.reset(render=True)
            score = 0.0
            for time in range(10000):
                env.render()
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                if done:
                    # strong penalty on episode termination
                    reward = -15
                else:
                    if abs(reward) < 0.5:
                        continue
                score += reward
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    agent.update_target_model()
                    print('episode: {}/{}, score: {:.5}, e: {}'.format(
                        episode, EPISODES, score, agent.epsilon))
                    break
                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)
            # periodic checkpoint every 10 episodes
            if episode % 10 == 0:
                agent.save(os.path.join('..', 'models', 'carla-ddqn.h5'))
    finally:
        # always tear down the Carla world, even on error/interrupt
        env.world.destroy()
def main():
    """CLI entry point: parse DQN hyperparameters, seed everything, and run."""
    parser = argparse.ArgumentParser(description='DQN')
    parser.add_argument('--env', type=str, default='MsPacman-v0')  # 'Breakout-v0'
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--eps', type=float, default=1.0)
    parser.add_argument('--exploration_decay_speed', type=int, default=1000000)
    parser.add_argument('--eps_min', type=float, default=0.1)
    parser.add_argument('--log_size', type=int, default=100)
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--buffer_init_size', type=int, default=50000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--sync_period', type=int, default=10000)
    parser.add_argument('--learn_freq', type=int, default=4)
    parser.add_argument('--save_freq', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--exp-dir', type=str, default='exp')
    args = parser.parse_args()
    # fall back to CPU when CUDA was requested but is unavailable
    args.device = torch.device(args.device if torch.cuda.is_available() \
        and args.device.startswith('cuda') else 'cpu')
    work_dir = mkdir(args.exp_dir, args.env)  # save models
    # logging infos
    logging.basicConfig(filename=args.env + '.log', filemode='w',
                        level=logging.INFO)
    env = gym.make(args.env)
    # set seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    # NOTE(review): this forces CUDA tensors as the default even when
    # args.device resolved to CPU — it will raise on CUDA-less machines;
    # confirm whether it should be guarded by torch.cuda.is_available().
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    agent = DQNAgent(env, args, work_dir)
    agent.run()
def main():
    """Run one greedy rollout episode against the remote game server.

    Reads the episode index from ../files/episode.txt, performs a reset
    handshake (action code 9), then for t_steps steps sends the agent's
    chosen action and re-reads the observation.
    Fix: use a context manager so the episode file is closed even if
    parsing fails. NOTE(review): the `host` local is assigned but unused —
    presumably intended for a module-level connection setting; confirm.
    """
    if len(sys.argv) > 1:
        host = sys.argv[1]
    with open('../files/episode.txt') as epi_file:
        episode = epi_file.readline()
    episode = int(episode)
    qagent = DQNAgent(14)
    data = 'x'
    # reset handshake: keep sending action 9 until the server echoes it
    while (data != '9'):
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)
    for step in range(1, t_steps + 1):
        action = qagent.get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        state = n_state
def main():
    """Run one exploration episode against the remote game server and log the
    per-step actions and rewards.

    Reads the current episode number from ../files/episode.txt and writes
    the incremented number back when the episode finishes.
    """
    if len(sys.argv) > 1:
        host = sys.argv[1]
    ep_reward_file = memory_path + 'ep_reward.dat'
    # BUG FIX: read the whole first line, not a single character —
    # epi_file.read(1) silently truncated episode numbers >= 10.
    with open('../files/episode.txt') as epi_file:
        episode = epi_file.readline().strip()
    qagent = DQNAgent()
    data = 'x'
    # reset handshake: keep sending action 9 until the server echoes it
    while (data != '9'):
        data = send_action(9)
    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)
    actions = []
    rewards = []
    for step in range(1, t_steps + 1):
        action = qagent.e_get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        actions.append(action)
        rewards.append(reward)
        state = n_state
    # save actions and rewards as comma-separated rows
    actions = map(str, actions)
    rewards = map(str, rewards)
    r_str = ','.join(rewards)
    a_str = ','.join(actions)
    with open(reward_file, 'a') as r_file:
        r_file.write(r_str + '\n')
    with open(action_file, 'a') as a_file:
        a_file.write(a_str + '\n')
    print("episode : ", episode, " finished.")
    # advance the persistent episode counter
    episode = str(int(episode) + 1)
    with open('../files/episode.txt', 'w') as epi_file:
        epi_file.write(episode)
def main():
    """Interactive DonkeyCar DQN showcase: load a model by name, then drive
    `episode_count` episodes on the generated track, showing the camera feed.

    Frame stacking: each step acts on the last frame_n preprocessed frames.
    Reward is shaped from cross-track error (cte): -1 when |cte| exceeds
    max_cte (episode ends), otherwise 1 - |cte|/max_cte.
    """
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    # model
    model_name = input("Model name -> ")
    model_file = input("Model file -> ")
    my_model = "models/{}/{}.h5".format(model_name, model_file)
    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))
    print("Loading", my_model, "with epsilon", epsilon)
    agent = DQNAgent(my_model, float(epsilon))
    # information
    resizeScale = (40, 30)  # (width, height) frames are resized to
    frame_n = 3             # number of stacked frames per observation
    max_cte = 4.35          # cross-track error threshold that ends an episode
    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None        # optional per-episode step cap (None = unlimited)
    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75
    # steering: evenly spaced table across [-max_steering, +max_steering]
    max_steering = 0.75
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]
    # setup donkey environment
    conf = {
        # "exe_path":"remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9094,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }
    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte
    cv2.namedWindow("camera")
    start = time.time()
    first_start = start
    for e in range(episode_count):
        # at each episode, reset environment to starting position
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1
        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()
        while not done and (score[-1] < max_score if max_score else True):
            # warm-up: fill the frame stack with random-steer, low-throttle steps
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)],
                    0.15
                ])
                states[frame_n - need_frames] = preprocessImage(
                    next_state, resizeScale)
                need_frames -= 1
                last_velocity.append(info["speed"])
                continue
            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity,
                                         max_acceleration)
            next_state, reward, done, info = env.step([steering, throttle])
            img = cv2.resize(next_state, (320, 240),
                             interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)
            last_velocity.append(round(info["speed"], 4))
            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0
                # for track
            else:
                reward = (1.0 - (abs(info["cte"]) / max_cte))
            # for roads
            # if not done:
            #     reward = (1.0 - (abs(info["cte"]) / max_cte));
            if info["lap_finished"]:
                laps += 1
            score[-1] += 1
            rewards[-1] += reward
            # for roads
            # if self.score[-1] > 1500:
            #     laps = max_laps
            # slide the frame stack one step and append the newest frame
            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)
            states = next_states
            cv2.waitKey(1)
        # gentle brake command after the episode ends
        env.step([0.0, -0.03])
        # keep only the 20 most recent episode statistics
        if len(score) > 20:
            score = score[-20:]
        if len(rewards) > 20:
            rewards = rewards[-20:]
        if score[-1] >= highest_score:
            highest_score = score[-1]
        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]
        print(
            "episode: {}/{}, score: {}, reward: {}, laps: {}, e: {:.2}".format(
                e + 1, episode_count, score[-1], round(rewards[-1], 2), laps,
                round(agent.epsilon, 2)))
        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()
    print("Showcase time:", round((time.time() - first_start) / 60, 2),
          "minutes")
from utils import get_args # Take argument arg = get_args() # Build env (first level, right only) env = gym_super_mario_bros.make(arg.env) env = JoypadSpace(env, RIGHT_ONLY) env = wrapper(env) # Parameters states = (84, 84, 4) actions = env.action_space.n # Agent agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True) # Episodes # episodes = 100001 episodes = 101 rewards = [] # Timing start = time.time() step = 0 # Main loop for e in range(episodes): # Reset env state = env.reset()
world = World(args.config_file, thread_num=args.thread) # create agents agents = [] #parameters['buffer_size'] = parameters['buffer_size']*len(world.intersections) #parameters['batch_size'] = parameters['batch_size']*len(world.intersections) for i in world.intersections: action_space = gym.spaces.Discrete(len(i.phases)) agents.append( DQNAgent( action_space, LaneVehicleGenerator(world, i, ["lane_count"], in_only=True, average=None, scale=.025), PressureRewardGenerator(world, i, scale=0.005, negative=True), i.id, parameters, world)) if args.load_model: agents[-1].load_model(args.save_dir) # Create metric metric = [ TravelTimeMetric(world), ThroughputMetric(world), SpeedScoreMetric(world), MaxWaitingTimeMetric(world) ]