def __init__(self, kwargs):
    kwargs["env_cls"] = Atari
    env = kwargs["env_cls"](kwargs["env_id"])
    kwargs["state_shape"] = env.observation_space.shape
    kwargs["state_dtype"] = np.uint8
    kwargs["n_actions"] = env.action_space.n
    kwargs["device"] = torch.device(kwargs["device_id"])
    env.close()

    self.__dict__.update(kwargs)
    self.agent = DQNAgent(**kwargs)
    self.writer = SummaryWriter("./log/")
    self.cuda_eval = torch.cuda.Stream(self.device)

    mem_kwargs = dict(
        capacity=self.mem_capacity,
        history_len=self.history_len,
        state_shape=self.state_shape,
        state_dtype=self.state_dtype,
        batch_sz=self.batch_sz,
        alpha=self.mem_alpha,
        beta=LinearScheduler(self.mem_beta, 1., self.train_steps),
        priority_eps=self.mem_priority_eps,
        priority_upper=self.mem_priority_upper,
        prioritized_replay=self.prioritized_replay,
        device=self.device,
    )
    mem_cls = PrioritizedReplayMemory if self.prioritized_replay else UniformReplayMemory
    self.mem = mem_cls(**mem_kwargs)
    self.mem_lock = Lock()
    self.sync = Queue(maxsize=1)
    self.sync.put(None)
def __init__(self, width, height, rows, window, offx, offy, idx=""):
    self.SETTINGS = {}
    self.SETTINGS['w'] = width
    self.SETTINGS['h'] = height
    self.SETTINGS['r'] = rows
    self.SETTINGS['sB'] = width // rows
    self.SETTINGS['ox'] = offx * width
    self.SETTINGS['oy'] = offy * height
    self.idx = idx
    self.window = window

    self.snake = Snake((255, 0, 0), (self.SETTINGS['r'] // 2, self.SETTINGS['r'] // 2), self.SETTINGS)
    self.snack = Cube(self.randomSnack(), self.SETTINGS, color=(0, 255, 0))
    self.dist = self.get_snack_distance()
    self.walls = self.get_wall_pos()

    self.model = Model(len(self.get_observation()), 4)
    self.tgt = Model(len(self.get_observation()), 4)
    self.agent = DQNAgent(self.model, self.tgt)

    self.reward = 0.0
    self.setp_reward = 0.0
    self.rewards = []
    self.finished = False
    self.points = 0
    self.points_ls = []
def main():
    USE_CUDA = torch.cuda.is_available()

    env = gym.make('CartPole-v0')
    dqn = DQN(env.observation_space.shape[0], env.action_space.n)
    if USE_CUDA:
        dqn = dqn.cuda()

    optimizer = optim.RMSprop(dqn.parameters(), lr=0.00025, momentum=0.95, alpha=0.95, eps=0.01)
    epsilon_schedule = get_epsilon_schedule(start=1.0, end=0.01, endt=1000, learn_start=50)
    replay_buffer = ReplayBuffer(capacity=1000)

    agent = DQNAgent(env, dqn, optimizer, epsilon_schedule, replay_buffer,
                     discount_factor=0.99, target_update_rate=10,
                     batch_size=32, learn_start=50)
    agent.train(5000)

    total_reward = agent.play(render=True)
    agent.env.close()
    print('Total Reward: ', total_reward)
def td_learning(args):
    agent = DQNAgent(args)
    replay_memory = PrioritizedReplayBuffer(1000000, args.alpha)
    # eval_game(agent, 500)
    outer = tqdm(range(args.total_steps), desc='Total steps', position=0)
    game = init_game()
    ave_score = 0
    count = 0
    for step in outer:
        board = copy.deepcopy(game.gameboard.board)
        if step < args.start_learn:
            avail_choices = game.gameboard.get_available_choices()
            index = np.random.randint(len(avail_choices))
            choice = avail_choices[index]
        else:
            choice = agent.greedy_policy(board, game.gameboard.get_available_choices())
        next_board, reward = game.input_pos(choice[0], choice[1])
        next_board = copy.deepcopy(next_board)

        #####
        replay_memory.add(board, choice, reward, next_board)
        #####

        if game.termination():
            ave_score += game.gameboard.score
            count += 1
            game = init_game()

        if step >= args.start_learn and step % args.train_freq == 0:
            if count > 0:
                message = "ave score of " + str(count) + " game: " + str(ave_score / count)
                out_fd.write("{} {}\n".format(step, ave_score / count))
                outer.write(message)
                ave_score = 0
                count = 0
            if step == args.start_learn:
                experience = replay_memory.sample(args.start_learn, beta=agent.beta)
            else:
                experience = replay_memory.sample(args.train_data_size, beta=agent.beta)
            boards, choices, rewards, next_boards, weights, batch_idxes = experience
            td_errors = agent.train((boards, choices, rewards, next_boards, weights))
            new_priorities = np.abs(td_errors) + prioritized_replay_eps
            replay_memory.update_priorities(batch_idxes, new_priorities)
            agent.update_target(args.soft_tau)
            agent.update_epsilon()
            agent.update_beta()

    eval_game(agent, 500)
    out_fd.close()
def main():
    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode) - 1

    qagent = DQNAgent(episode - 1)
    qagent.load_memory_of_episode(episode)

    qys = []
    qds = []
    for k in range(50):
        for j in range(5):
            # for i in range(0,len(qagent.memory),qagent.batch_size):
            qy, qd = qagent.memory_replay()
            qagent.update_targer_model()
            qys.append(qy)
            qds.append(qd)
    qagent.save_model(episode)

    res = time.strftime('%Y/%m/%d-%H:%M:%S', time.localtime(time.time())) + \
        "Average of episode: %d Q_y: %f Q_d: %f" % (episode, np.mean(qys), np.mean(qds))
    epi_file = open('../files/avg_Q.txt', 'a')
    epi_file.write(res + '\n')
    epi_file.close()

    if forward:
        epi_file = open('../files/episode.txt', 'w')
        epi_file.write(str(episode + 2))
        epi_file.close()
def test_target_model():
    agent = DQNAgent()
    agent.load('models/model.h5')

    state = np.zeros([6, 7])
    state[5][3] = 1
    state = state.reshape(1, 6, 7, 1)

    p1 = agent.policy_model.predict(state)
    p2 = agent.target_model.predict(state)
    print(p1)
    print(p2)
    if not np.array_equal(p1, p2):
        print('FAIL')
def dqn_run(episodes=2500, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
            double_dqn=False, dueling_dqn=False, seed=42):
    env = start_env()
    env_info = reset_env_info(env)
    state_size = get_state_size(env_info)
    action_size = get_action_size(env)

    print('Seed used:', seed)
    agent = DQNAgent(state_size, action_size, double_dqn, dueling_dqn, seed)

    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start

    for episode in range(1, episodes + 1):
        env_info = reset_env_info(env)
        score = 0.0
        done = False
        while not done:
            state = env_info.vector_observations[0]
            action = agent.act(state, epsilon=eps)
            env_info = env_step(env, action)
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            score += reward

        scores_window.append(score)
        scores.append(score)
        eps = max(eps * eps_decay, eps_end)

        print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
            episode, episodes, np.mean(scores_window), eps), end=' ')
        if episode % 100 == 0:
            print('\rEpisode {}/{}\tAverage Score: {:.2f}, epsilon: {:.3f}'.format(
                episode, episodes, np.mean(scores_window), eps))
        if np.mean(scores_window) > 13.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break

    env.close()
    return scores
def test_model(filename):
    env = gym.make("CartPole-v1")
    agent = DQNAgent(4, 2)
    agent.load_model(filename)

    state = env.reset()
    for _ in range(1000):
        env.render()
        state, _, done, _ = env.step(agent.act(state, explore=False))
        if done:
            break
    env.close()
def __init__(self):
    pygame.init()
    self.window = pygame.display.set_mode((500, 800))
    pygame.display.set_caption("Racing AI")
    self.clock = pygame.time.Clock()
    self.execute = True

    self.car = Car(250, 650, self.window)
    self.agent = DQNAgent(inputs=4, n_actions=2)
    self.episode_durations = []

    self.update_agent = pygame.USEREVENT + 1
    update_timer = 100
    pygame.time.set_timer(self.update_agent, update_timer)
def test(agent: DQNAgent, test_eps):
    env = gym.make(ENV_NAME)
    ep_rewards = []
    for test_ep in range(test_eps):
        obs = env.reset()
        done = False
        ep_reward = 0
        ep_step = 0
        while not done:
            action = agent.act(np.array(obs), evaluate=True)
            next_obs, reward, done, _ = env.step(action)
            env.render()
            obs = next_obs
            ep_reward += reward
            ep_step += 1
        ep_rewards.append(ep_reward)
        time.sleep(0.2)

    print('\n')
    print('=== Test performance ===')
    print(f'Mean: {np.mean(ep_rewards):.1f} / '
          f'Min: {np.min(ep_rewards):.1f} / '
          f'Max: {np.max(ep_rewards):.1f}')
    env.close()
    return ep_rewards
def play(**kwargs):
    env = BananaEnvironment(file_name=kwargs['env_file'],
                            num_stacked_frames=kwargs['num_stacked_frames'])

    agent_name = kwargs['agent_fname']
    is_per = 'PER' in agent_name
    if 'ddqn' in agent_name:
        agent = DDQNAgentPER.load(agent_name) if is_per else DDQNAgent.load(agent_name)
    elif 'dqn' in agent_name:
        agent = DQNAgentPER.load(agent_name) if is_per else DQNAgent.load(agent_name)
    else:
        raise KeyError('Unknown agent type')

    for i in range(kwargs['num_plays']):
        done = False
        score = 0
        state = env.reset(train_mode=False)
        while not done:
            action = agent.act(state, eps=0.)
            state, reward, done = env.step(action)  # roll out transition
            score += reward
            print("\r play #{}, reward: {} | score: {}".format(i + 1, reward, score), end='')
        print()
def run(novis, env_dir, env_file, n_episodes, seed, prioritized, cpu):
    if novis:
        env_dir = "{}_NoVis".format(env_dir)
    env = UnityEnvironment(file_name="environments/{}/{}".format(env_dir, env_file))

    # get default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents in the environment
    # print('Number of agents:', len(env_info.agents))

    # number of actions
    action_size = brain.vector_action_space_size
    # print('Number of actions:', action_size)

    # examine the state space
    state = env_info.vector_observations[0]
    # print('States look like:', state)
    state_size = len(state)
    # print('States have length:', state_size)

    report = Report(DQNAgent(state_size=state_size,
                             action_size=action_size,
                             seed=seed,
                             prioritized=prioritized,
                             cpu=cpu)).run(dqn, env=env, brain_name=brain_name,
                                           n_episodes=n_episodes)
    print(report)
def purge_round():
    candidate_leaders_map = {}  # {filename --> agent}

    # Load in all of the leaders
    for leader_checkpoint in os.listdir(LEADER_DIR):
        path = os.path.join(LEADER_DIR, leader_checkpoint)
        candidate_leader = try_gpu(
            DQNAgent(6, LinearSchedule(0.05, 0.05, 1), OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                     name=leader_checkpoint))
        candidate_leader.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        candidate_leaders_map[leader_checkpoint] = candidate_leader

    candidate_scores = []  # list[(filename, score)]
    filenames, candidate_leaders = zip(*candidate_leaders_map.items())
    for i, (filename, candidate_leader) in enumerate(zip(filenames, candidate_leaders)):
        print("EVALUATING {}".format(candidate_leader.name))
        # Evaluate each candidate against the ensemble of all other leaders
        leaders = EnsembleDQNAgent(candidate_leaders[:i] + candidate_leaders[i + 1:])
        candidate_scores.append(
            (filename, evaluate(candidate_leader, leaders, EPISODES_EVALUATE_PURGE)))

    sorted_scores = sorted(candidate_scores, key=lambda x: x[1], reverse=True)
    print("SCORES: {}".format(sorted_scores))
    for filename, score in sorted_scores[NUM_LEADERS:]:
        print("PURGING ({}, {})".format(filename, score))
        leader_path = os.path.join(LEADER_DIR, filename)
        graveyard_path = os.path.join(GRAVEYARD_DIR, filename)
        os.rename(leader_path, graveyard_path)
def advise():
    n1 = float(request.form['n1'])
    n2 = float(request.form['n2'])
    n3 = float(request.form['n3'])
    cash = float(request.form['cash'])
    print(n1)
    print(cash)

    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)
    agent.load("202005011635-dqn.h5")

    state = env.reset()
    state[0] = n1
    state[1] = n2
    state[2] = n3
    state[-1] = cash
    state = scaler.transform([state])
    action = agent.act(state)

    # action_combo = list(map(list, itertools.product([0, 1, 2], repeat=3)))
    action_vec = action_combo[action]
    # action_map = {0: "sell", 1: "hold", 2: "buy"}
    # print(action_map[action_vec[0]], action_map[action_vec[1]], action_map[action_vec[2]])

    ans = []
    tmp = 1 if action_vec[0] == 0 and n1 == 0 else action_vec[0]
    if cash == 0 and tmp == 2:
        tmp = 1
    ans.append(action_map[tmp])

    tmp = 1 if action_vec[1] == 0 and n2 == 0 else action_vec[1]
    if cash == 0 and tmp == 2:
        tmp = 1
    ans.append(action_map[tmp])

    tmp = 1 if action_vec[2] == 0 and n3 == 0 else action_vec[2]
    if cash == 0 and tmp == 2:
        tmp = 1
    ans.append(action_map[tmp])

    print(ans)
    return render_template('index.html', ans=ans, n1=n1, n2=n2, n3=n3, cash=cash)
def main():
    parser = argparse.ArgumentParser(description='DQN')
    parser.add_argument('--env', type=str, default='MsPacman-v0')  # 'Breakout-v0'
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--eps', type=float, default=1.0)
    parser.add_argument('--exploration_decay_speed', type=int, default=1000000)
    parser.add_argument('--eps_min', type=float, default=0.1)
    parser.add_argument('--log_size', type=int, default=100)
    parser.add_argument('--buffer_size', type=int, default=100000)
    parser.add_argument('--buffer_init_size', type=int, default=50000)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--sync_period', type=int, default=10000)
    parser.add_argument('--learn_freq', type=int, default=4)
    parser.add_argument('--save_freq', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--exp-dir', type=str, default='exp')
    args = parser.parse_args()

    args.device = torch.device(args.device if torch.cuda.is_available()
                               and args.device.startswith('cuda') else 'cpu')

    work_dir = mkdir(args.exp_dir, args.env)  # save models

    # logging infos
    logging.basicConfig(filename=args.env + '.log', filemode='w', level=logging.INFO)

    env = gym.make(args.env)

    # set seed
    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

    agent = DQNAgent(env, args, work_dir)
    agent.run()
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]

    epi_file = open('../files/episode.txt')
    episode = epi_file.readline()
    epi_file.close()
    episode = int(episode)

    qagent = DQNAgent(14)
    data = 'x'
    while data != '9':
        data = send_action(9)

    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)

    for step in range(1, t_steps + 1):
        action = qagent.get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        state = n_state
def main(argv):
    # Pretrained network to use
    inputfile = None
    # Whether to train or to test
    train = False
    # Trained network
    outputfile = None

    try:
        opts, args = getopt.getopt(argv, "hrl:s:", ["loadckpt=", "saveckpt="])
    except getopt.GetoptError:
        print('Incorrect usage. For more information: test.py -h')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('python test.py -r -l <ckptfile> -s <ckptfile>')
            print('-r for enabling training')
            print('-l for loading pre-existing model')
            print('-s for saving model to file')
            sys.exit()
        elif opt == '-r':
            train = True
        elif opt in ("-l", "--loadckpt"):
            inputfile = arg
        elif opt in ("-s", "--saveckpt"):
            outputfile = arg

    with tf.Session() as sess:
        env = Environment()
        agent = DQNAgent(env, sess, inputfile)
        if train:
            agent.train(6000000, outputfile)
        else:
            agent.test(2000)
def __init__(self, env_creator, device, buffer_size, save_dir,
             timesteps_per_epoch=1, batch_size=32, total_steps=5 * 10 ** 5,
             decay_rate=0.1, init_epsilon=1, final_epsilon=0.02, loss_freq=50,
             refresh_target_network_freq=500, eval_freq=500, max_grad_norm=50):
    self.env_creator = env_creator
    self.env = env_creator()
    n_actions = self.env.action_space.n
    state_shape = self.env.observation_space.shape

    self.save_dir = save_dir
    self.buffer_size = buffer_size
    self.timesteps_per_epoch = timesteps_per_epoch
    self.batch_size = batch_size
    self.total_steps = total_steps
    self.decay_steps = decay_rate * total_steps
    self.init_epsilon = init_epsilon
    self.final_epsilon = final_epsilon
    self.loss_freq = loss_freq
    self.refresh_target_network_freq = refresh_target_network_freq
    self.eval_freq = eval_freq
    self.max_grad_norm = max_grad_norm
    self.device = device
    self.writer = SummaryWriter('runs')

    self.agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
    self.target_network = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)
    self.target_network.load_state_dict(self.agent.state_dict())
def main():
    if len(sys.argv) > 1:
        host = sys.argv[1]

    ep_reward_file = memory_path + 'ep_reward.dat'

    epi_file = open('../files/episode.txt')
    episode = epi_file.read(1)
    epi_file.close()

    qagent = DQNAgent()
    data = 'x'
    while data != '9':
        data = send_action(9)

    ys, ds = qagent.get_data(episode, 0)
    state = np.concatenate((ys, ds), axis=0)

    actions = []
    rewards = []
    for step in range(1, t_steps + 1):
        action = qagent.e_get_action(state)
        # action = qagent.get_action(state)
        reward = send_action(action)
        ys, ds = qagent.get_data(episode, step)
        n_state = np.concatenate((ys, ds), axis=0)
        actions.append(action)
        rewards.append(reward)
        state = n_state

    # save actions, rewards
    actions = map(str, actions)
    rewards = map(str, rewards)
    r_file = open(reward_file, 'a')
    a_file = open(action_file, 'a')
    r_str = ','.join(rewards)
    a_str = ','.join(actions)
    r_file.write(r_str + '\n')
    a_file.write(a_str + '\n')
    r_file.close()
    a_file.close()

    print("episode : ", episode, " finished.")
    episode = str(int(episode) + 1)
    epi_file = open('../files/episode.txt', 'w')
    epi_file.write(episode)
    epi_file.close()
def eval():
    env = Tetris()
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    replay_start_size = 2000
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons, activations=activations,
                     epsilon=0, epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size, discount=discount,
                     replay_start_size=replay_start_size, train=False)
    agent.load("ckpts/591_model.ckpt")

    current_state = env.reset()
    done = False
    steps = 0

    # Game
    while not done and (not max_steps or steps < max_steps):
        next_states = env.get_next_states()
        best_state = agent.best_state(next_states.values())

        best_action = None
        for action, state in next_states.items():
            if state == best_state:
                best_action = action
                break

        reward, done = env.play(best_action[0], best_action[1],
                                render=True, render_delay=render_delay)

        agent.add_to_memory(current_state, next_states[best_action], reward, done)
        current_state = next_states[best_action]
        steps += 1
def DQN(episodes, epsilon, epsilonDeca):
    env = Env()
    agent = DQNAgent()
    # window = pygame.display.set_mode((windowWidth, windowHeight))
    episodeRewards = []

    for episode in range(episodes):
        episode_reward = 0
        step = 1
        current_state = env.reset()
        done = False

        while not done:
            # This part stays mostly the same, the change is to query a model for Q values
            if np.random.random() > epsilon:
                # Get action from Q table
                action = np.argmax(agent.getQs(np.array(current_state)))
            else:
                # Get random action
                action = np.random.randint(0, env.ACTION_SPACE_SIZE)

            new_state, reward, done = env.step(action)
            episode_reward += reward
            # drawWindow(window, [env.blob, env.enemyBlob], [env.ball], env.wall)

            # Every step we update replay memory and train main network
            agent.updateReplyMemory((current_state, action, reward, new_state, done))
            agent.train(done, step)

            current_state = new_state
            step += 1

        episodeRewards.append(episode_reward)
        if episode % 10 == 0:
            averageReward = sum(episodeRewards) / len(episodeRewards)
            minReward = min(episodeRewards)
            maxReward = max(episodeRewards)
            print(f"replayMemo:{len(agent.replayMemory)} avg:{averageReward} \n min:{minReward} \n max:{maxReward} ")

        if epsilon > MIN_EPSILON:
            epsilon *= EPSILON_DECAY
            epsilon = max(MIN_EPSILON, epsilon)

    pygame.quit()
def main():
    # enable GPU memory growth
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

    # model
    model_name = input("Model name -> ")
    model_file = input("Model file -> ")
    my_model = "models/{}/{}.h5".format(model_name, model_file)
    epsilon = float(input("Epsilon -> "))
    episode_count = int(input("Episode count -> "))
    print("Loading", my_model, "with epsilon", epsilon)
    agent = DQNAgent(my_model, float(epsilon))

    # information
    resizeScale = (40, 30)
    frame_n = 3
    max_cte = 4.35

    # statistics
    score = []
    rewards = []
    highest_score = 0
    highest_reward = 0
    max_score = None

    # velocity
    max_velocity = 10.0
    max_acceleration = 0.75

    # steering
    max_steering = 0.75
    steering_step = 2 * max_steering / (agent.action_space - 1)
    steering_table = [
        i * steering_step - max_steering for i in range(agent.action_space)
    ]

    # setup donkey environment
    conf = {
        # "exe_path": "remote",
        "exe_path": "D:/sdsandbox/build2/donkey_sim.exe",
        "host": "127.0.0.1",
        "port": 9094,
        "body_style": "donkey",
        "body_rgb": (128, 128, 128),
        "car_name": "rl",
        "font_size": 100
    }
    # env = gym.make("donkey-generated-roads-v0", conf=conf)
    env = gym.make("donkey-generated-track-v0", conf=conf)
    env.viewer.handler.max_cte = max_cte

    cv2.namedWindow("camera")

    start = time.time()
    first_start = start
    for e in range(episode_count):
        # at each episode, reset environment to starting position
        state = env.reset()
        states = np.empty((frame_n, resizeScale[1], resizeScale[0], 3))
        states[0] = preprocessImage(state, resizeScale)
        need_frames = frame_n - 1

        done = False
        score.append(0)
        rewards.append(0.0)
        last_velocity = [0.0]
        laps = 0
        start = time.time()

        while not done and (score[-1] < max_score if max_score else True):
            if need_frames > 0:
                next_state, reward, done, info = env.step([
                    steering_table[random.randint(0, agent.action_space - 1)], 0.15
                ])
                states[frame_n - need_frames] = preprocessImage(next_state, resizeScale)
                need_frames -= 1
                last_velocity.append(info["speed"])
                continue

            # select action, observe environment, calculate reward
            action, Q = agent.act(np.asarray([states]))
            steering = steering_table[action]
            throttle = calculateThrottle(last_velocity[-1], max_velocity, max_acceleration)
            next_state, reward, done, info = env.step([steering, throttle])

            img = cv2.resize(next_state, (320, 240), interpolation=cv2.INTER_AREA)
            cv2.imshow("camera", img)

            last_velocity.append(round(info["speed"], 4))

            if abs(info["cte"]) >= max_cte:
                done = True
                reward = -1.0  # for track
            else:
                reward = (1.0 - (abs(info["cte"]) / max_cte))
            # for roads
            # if not done:
            #     reward = (1.0 - (abs(info["cte"]) / max_cte));

            if info["lap_finished"]:
                laps += 1

            score[-1] += 1
            rewards[-1] += reward

            # for roads
            # if self.score[-1] > 1500:
            #     laps = max_laps

            next_states = np.roll(states, -1, axis=0)
            next_states[-1] = preprocessImage(next_state, resizeScale)
            states = next_states

            cv2.waitKey(1)

        env.step([0.0, -0.03])

        if len(score) > 20:
            score = score[-20:]
        if len(rewards) > 20:
            rewards = rewards[-20:]
        if score[-1] >= highest_score:
            highest_score = score[-1]
        if rewards[-1] >= highest_reward:
            highest_reward = rewards[-1]

        print("episode: {}/{}, score: {}, reward: {}, laps: {}, e: {:.2}".format(
            e + 1, episode_count, score[-1], round(rewards[-1], 2), laps,
            round(agent.epsilon, 2)))
        if (e + 1) % 5 == 0:
            print("Took", round((time.time() - start) / 60, 2), "minutes\n")
            start = time.time()

    print("Showcase time:", round((time.time() - first_start) / 60, 2), "minutes")
from utils import get_args

# Take argument
arg = get_args()

# Build env (first level, right only)
env = gym_super_mario_bros.make(arg.env)
env = JoypadSpace(env, RIGHT_ONLY)
env = wrapper(env)

# Parameters
states = (84, 84, 4)
actions = env.action_space.n

# Agent
agent = DQNAgent(states=states, actions=actions, max_memory=100000, double_q=True)

# Episodes
# episodes = 100001
episodes = 101
rewards = []

# Timing
start = time.time()
step = 0

# Main loop
for e in range(episodes):

    # Reset env
    state = env.reset()
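    # --- hedged sketch, not part of the original snippet ----------------------
    # The snippet above is truncated inside the episode loop. This is a minimal
    # sketch of how such a loop typically continues; the method names `act`,
    # `remember` and `learn` are assumptions, not the original agent API.
    total_reward = 0
    done = False
    while not done:
        action = agent.act(state)                                 # assumed epsilon-greedy action selection
        next_state, reward, done, info = env.step(action)
        agent.remember(state, next_state, action, reward, done)   # assumed replay-buffer insert
        agent.learn()                                              # assumed batched Q-update
        total_reward += reward
        state = next_state
        step += 1
    rewards.append(total_reward)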
def main(): print("Creating model...") model = modelutils.create_model(number_of_actions=4) model.summary() print("Creating agent...") if agent_type == "dqn": agent = DQNAgent(name="doom-dqn", model=model, number_of_actions=4, gamma=0.99, final_epsilon=0.0001, initial_epsilon=0.1, number_of_iterations=200000, replay_memory_size=10000, minibatch_size=32) elif agent_type == "ddqn": agent = DDQNAgent(name="doom-ddqn", model=model, number_of_actions=4, gamma=0.99, final_epsilon=0.0001, initial_epsilon=0.1, number_of_iterations=200000, replay_memory_size=10000, minibatch_size=32, model_copy_interval=100) agent.enable_rewards_tracking(rewards_running_means_length=1000) agent.enable_episodes_tracking(episodes_running_means_length=1000) agent.enable_maxq_tracking(maxq_running_means_length=1000) agent.enable_model_saving(model_save_frequency=10000) agent.enable_plots_saving(plots_save_frequency=10000) print("Creating game...") #environment = Environment(headless=("headless" in sys.argv)) # Create an instance of the Doom game. environment = DoomGame() environment.load_config("scenarios/basic.cfg") environment.set_screen_format(ScreenFormat.GRAY8) environment.set_window_visible("headless" not in sys.argv) environment.init() print("Training ...") train(agent, environment, verbose="verbose" in sys.argv)
world = World(args.config_file, thread_num=args.thread)

# create agents
agents = []
# parameters['buffer_size'] = parameters['buffer_size'] * len(world.intersections)
# parameters['batch_size'] = parameters['batch_size'] * len(world.intersections)
for i in world.intersections:
    action_space = gym.spaces.Discrete(len(i.phases))
    agents.append(
        DQNAgent(
            action_space,
            LaneVehicleGenerator(world, i, ["lane_count"], in_only=True, average=None, scale=.025),
            PressureRewardGenerator(world, i, scale=0.005, negative=True),
            i.id,
            parameters,
            world))
    if args.load_model:
        agents[-1].load_model(args.save_dir)

# Create metric
metric = [
    TravelTimeMetric(world),
    ThroughputMetric(world),
    SpeedScoreMetric(world),
    MaxWaitingTimeMetric(world)
]
import numpy as np
from agent import DQNAgent
from utils import make_env

if __name__ == "__main__":
    env = make_env('PongNoFrameskip-v4')
    best_score = -np.inf
    n_games = 200
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                     n_actions=env.action_space.n,
                     input_dims=(env.observation_space.shape),
                     mem_size=50000, batch_size=32,
                     eps_min=0.1, eps_dec=1e-5,
                     tau=1000, env_name='PongNoFrameskip-v4',
                     chkpt_dir='models/')

    n_steps = 0
    scores, eps_history = [], []

    for i in range(n_games):
        done = False
        state = env.reset()
        score = 0
        while not done:
            action = agent.choose_action(state)
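            # --- hedged sketch, not part of the original snippet --------------
            # The snippet above is cut off after choose_action(). A minimal
            # sketch of the usual continuation; `store_transition`, `learn`,
            # `save_models` and the `epsilon` attribute are assumptions here.
            next_state, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(state, action, reward, next_state, int(done))  # assumed buffer insert
            agent.learn()                                                          # assumed batched update
            state = next_state
            n_steps += 1
        scores.append(score)
        eps_history.append(agent.epsilon)
        avg_score = np.mean(scores[-100:])
        if avg_score > best_score:
            agent.save_models()  # assumed checkpoint method
            best_score = avg_score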
def main(config, max_num_of_steps, max_num_of_episodes, load_model, save_model,
         load_memory, save_memory, log_path):
    agent = DQNAgent(config)

    with agent.graph.as_default():
        if load_model:
            step = agent.load_model(load_model)
            screen_log.info("Load model: {}".format(load_model))
            screen_log.info("Start from step {}".format(step))
        else:
            step = 0
        if load_memory:
            agent.load_memory(load_memory)
            n_frames = len(agent.memory)
            screen_log.info("Load memory: {}".format(load_memory))
            screen_log.info("Memory size: {}".format(n_frames))

        log_name = ('{:02}{:02}{:02}{:02}{:02}'.format(*time.localtime()[1:6]))
        summary_writer = tf.summary.FileWriter(
            logdir=os.path.join(log_path, '{}'.format(log_name)),
            graph=agent.graph)

        episode = 0
        rewards_per_episode = []
        sum_Qs = .0
        sum_losses = .0
        try:
            while (step < max_num_of_steps and episode < max_num_of_episodes):
                episode += 1
                episode_done = False
                next_observation = reset_random_env()
                next_observation = preprocess_observation(next_observation)
                rewards_per_episode.append(0)

                while not episode_done:
                    observation = next_observation
                    if len(agent.memory) < config['replay_start_size']:
                        # init replay memory
                        action = env.action_space.sample()
                        next_observation, reward, episode_done, info = env.step(action)
                        next_observation = preprocess_observation(next_observation)
                        agent.memory.append(
                            MemoryItem(observation, action, reward, episode_done, info))
                        continue

                    state = agent.get_recent_state(observation)
                    Qs = agent.get_Q_values(state)
                    Qs = Qs[0]

                    # epsilon-greedy action selection
                    epsilon = get_epsilon(config, step)
                    if np.random.RandomState().rand() < epsilon:
                        action = env.action_space.sample()
                    else:
                        action = agent.get_action_from_Q(Qs)

                    next_observation, reward, episode_done, info = env.step(action)
                    next_observation = preprocess_observation(next_observation)
                    agent.memory.append(
                        MemoryItem(observation, action, reward, episode_done, info))

                    step += 1
                    rewards_per_episode[-1] += reward
                    sum_Qs += Qs[action]

                    # train step
                    loss, loss_summary_str = agent.optimize_Q()
                    summary_writer.add_summary(loss_summary_str, step)
                    sum_losses += loss

                    if step % 1000 == 0:
                        ave_loss = sum_losses / step
                        ave_reward = np.mean(rewards_per_episode)
                        ave_Q = sum_Qs / step
                        [Q_summary_str, reward_summary_str] = agent.evaluate(ave_reward, ave_Q)
                        summary_writer.add_summary(Q_summary_str, step)
                        summary_writer.add_summary(reward_summary_str, step)
                        screen_log.info(
                            'step: {}, ave. loss: {:g}, '
                            'ave. reward: {:g}, ave. Q: {:g}'.format(
                                step, ave_loss, ave_reward, ave_Q,
                            ))
                    if step % 10000 == 0:
                        agent.save_model(save_model, step)
                    if step % 1000000 == 0:
                        agent.save_memory(save_memory, step)
        except KeyboardInterrupt:
            print("\nUser interrupted training...")
        finally:
            summary_writer.close()
            agent.save_model(save_model, step)
            agent.save_memory(save_memory, step)
            screen_log.info(
                'Finished: the number of steps {}, the number of episodes {}.'.format(
                    step, episode))
maybe_make_dir('weights')
maybe_make_dir('portfolio_val')

timestamp = time.strftime('%Y%m%d%H%M')

data = np.around(get_data())
data_size = data.shape[1]
data_cut_point = int(0.75 * data_size)
train_data = data[:, :data_cut_point]
test_data = data[:, data_cut_point:]

env = TradingEnv(train_data, args.initial_invest)
state_size = env.observation_space.shape
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
scaler = get_scaler(env)

portfolio_value = []

if args.mode == 'test':
    # remake the env with test data
    env = TradingEnv(test_data, args.initial_invest)
    # load trained weights
    agent.load(args.weights)
    # when test, the timestamp is same as time when weights was trained
    timestamp = re.findall(r'\d{12}', args.weights)[0]

for e in range(args.episode):
    state = env.reset()
    state = scaler.transform([state])
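    # --- hedged sketch, not part of the original snippet ----------------------
    # The episode loop is truncated above. A minimal sketch of a typical
    # continuation; `act`, `remember`, `replay` and the info key 'cur_val'
    # are assumptions, not the original API.
    done = False
    while not done:
        action = agent.act(state)                                     # assumed epsilon-greedy act()
        next_state, reward, done, info = env.step(action)
        next_state = scaler.transform([next_state])
        if args.mode == 'train':
            agent.remember(state, action, reward, next_state, done)   # assumed buffer insert
            agent.replay()                                            # assumed experience-replay update
        state = next_state
    portfolio_value.append(info.get('cur_val'))                       # assumed end-of-episode portfolio value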
def main():  # noqa: D103
    parser = argparse.ArgumentParser(description="Run DQN on iLOCuS")
    parser.add_argument("--network_name", default="deep_q_network", type=str,
                        help="Type of model to use")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size")
    parser.add_argument("--map_shape", default=(15, 15), type=tuple, help="map size")
    parser.add_argument("--num_actions", default=4, type=int, help="level of pricing")
    parser.add_argument("--gamma", default=0.8, type=float, help="Discount factor")
    parser.add_argument("--alpha", default=0.0001, type=float, help="Learning rate")
    parser.add_argument("--epsilon", default=0.5, type=float,
                        help="Exploration probability for epsilon-greedy")
    parser.add_argument("--target_update_freq", default=10000, type=int,
                        help="Frequency for copying weights to target network")
    parser.add_argument("--num_iterations", default=5000000, type=int,
                        help="Number of overall interactions with the environment")
    parser.add_argument("--max_episode_length", default=200000, type=int,
                        help="Terminate earlier for one episode")
    parser.add_argument("--train_freq", default=4, type=int,
                        help="Frequency for training")
    parser.add_argument("--num-burn-in", default=10000, type=int,
                        help="number of memory before train")
    parser.add_argument("-o", "--output", default="ilocus-v0", type=str,
                        help="Directory to save data to")
    parser.add_argument("--seed", default=0, type=int, help="Random seed")
    parser.add_argument("--train", default=True, type=bool,
                        help="Train/Evaluate, set True if train the model")
    parser.add_argument("--model_path", default="atari-v0", type=str,
                        help="specify model path to evaluation")
    parser.add_argument("--max_grad", default=1.0, type=float,
                        help="Parameter for huber loss")
    parser.add_argument("--log_dir", default="log", type=str,
                        help="specify log folder to save evaluate result")
    parser.add_argument("--flip_coin", default=False, type=str,
                        help="specify whether or not choosing double q learning")
    parser.add_argument("--eval_num", default=100, type=int,
                        help="number of evaluation to run")
    parser.add_argument("--save_freq", default=100000, type=int,
                        help="model save frequency")
    # memory related args
    parser.add_argument("--buffer_size", default=100000, type=int,
                        help="replay memory buffer size")
    parser.add_argument("--look_back_steps", default=4, type=int,
                        help="how many previous pricing tables will be fed into RL")

    args = parser.parse_args()
    print("\nParameters:")
    for arg in vars(args):
        print(arg, getattr(args, arg))

    # Initiating policy for both tasks (training and evaluating)
    policy = LinearDecayGreedyEpsilonPolicy(args.epsilon, 0.1, 1000000, args.num_actions)

    if not args.train:
        '''Evaluate the model'''
        # check model path
        if args.model_path == '':
            print("Model path must be set when evaluate")
            exit(1)

        # specific log file to save result
        log_file = os.path.join(args.log_dir, args.network_name, str(args.model_num))
        model_dir = os.path.join(args.model_path, args.network_name, str(args.model_num))

        with tf.Session() as sess:
            # load model
            # with open(model_dir + ".json", 'r') as json_file:
            #     loaded_model_json = json_file.read()
            # q_network_online = model_from_json(loaded_model_json)
            # q_network_target = model_from_json(loaded_model_json)
            #
            # sess.run(tf.global_variables_initializer())
            #
            # # load weights into model
            # q_network_online.load_weights(model_dir + ".h5")
            # q_network_target.load_weights(model_dir + ".h5")

            driver_sim = DriverSim()
            env = Environment(driver_sim=driver_sim)

            memory = ReplayMemory(args.buffer_size, args.look_back_steps)
            q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
            dqn_agent = DQNAgent(q_network=q_network,
                                 memory=memory,
                                 policy=policy,
                                 gamma=args.gamma,
                                 target_update_freq=args.target_update_freq,
                                 num_burn_in=args.num_burn_in,
                                 train_freq=args.train_freq,
                                 batch_size=args.batch_size)
        exit(0)

    '''Train the model'''
    with tf.Session() as sess:
        # with tf.device('/cpu:0'):
        print("created model")
        driver_sim = DriverSim()
        env = Environment(driver_sim=driver_sim)
        print("set up environment")

        # # create output dir, meant to pop up error when dir exist to avoid over written
        # os.mkdir(args.output + "/" + args.network_name)

        memory = ReplayMemory(args.buffer_size, args.look_back_steps)
        q_network = create_model(args.look_back_steps, args.map_shape, args.num_actions)
        dqn_agent = DQNAgent(q_network=q_network,
                             memory=memory,
                             policy=policy,
                             gamma=args.gamma,
                             target_update_freq=args.target_update_freq,
                             num_burn_in=args.num_burn_in,
                             train_freq=args.train_freq,
                             batch_size=args.batch_size)
        print("defined dqn agent")

        optimizer = Adam(learning_rate=args.alpha)
        q_network.compile(optimizer, mean_huber_loss)
        sess.run(tf.global_variables_initializer())

        print("initializing environment")
        env.reset()

        print("in fit")
        if os.path.exists(args.output):
            shutil.rmtree(args.output)
        os.mkdir(args.output)
        dqn_agent.fit(env=env,
                      num_iterations=args.num_iterations,
                      output_dir=os.path.join(args.output),
                      max_episode_length=args.max_episode_length)
# Shift slowly from exploration to exploitation
eps_decay = 0.0005
# Never move to full exploitation, leave some time for exploration
eps_end = 0.998

# Define some variables to keep track of training progress
# Empty dict for all agents
action_dict = dict()
# Score for all rewards
score = 0

# ------------------------------------------------------
# 4. Load the agent
# ------------------------------------------------------

# Load the agent
agent = DQNAgent(state_size=state_size, action_size=action_size)

# Load the weights (if pretrained agent)
# agent.load("run-003.ckpt")
# agent.q_act.set_weights(agent.q_learn.get_weights())

# ------------------------------------------------------
# 5. Main training loop
# ------------------------------------------------------
for trial in range(1, n_trials + 1):

    # Reset the environment
    obs = env.reset()
    obs = obs[0]
    env_renderer.reset()
from flask_socketio import SocketIO, emit

app = Flask(__name__)
app.config["SECRET_KEY"] = "secret!"
socketio = SocketIO(app)

this_dir = os.path.abspath(os.path.dirname(__file__))
deepy_dir = os.path.abspath(this_dir + os.sep + ".." + os.sep + "..")
model_path = this_dir + os.sep + "models" + os.sep + "puckworld_model1.gz"

import sys
sys.path.append(deepy_dir)
from agent import DQNAgent

agent = DQNAgent(8, 5)
if os.path.exists(model_path):
    print("Load model:", model_path)
    agent.load(model_path)


@app.route("/")
def index():
    return render_template_string(open(this_dir + os.sep + "test.html").read())


@socketio.on("act", namespace="/test")
def test_action(message):
    action = agent.action(message["state"])
    emit("act", {"action": action})