def test_model(model_path, max_steps):
    dqn = DQN()
    env = gym.make("MsPacman-v0")
    X_state = tf.placeholder(
        tf.float32, shape=[None, input_height, input_width, input_channels])
    online_q_values, online_vars = dqn.create_model(X_state, "qnetwork_online")
    saver = tf.train.Saver()

    with tf.Session() as sess:
        saver.restore(sess, model_path)
        obs = env.reset()
        for step in range(max_steps):
            state = preprocess_observation(obs)

            # evaluate what to do
            q_values = online_q_values.eval(feed_dict={X_state: [state]})
            action = np.argmax(q_values)

            # play the game
            obs, reward, done, info = env.step(action)
            env.render()
            time.sleep(0.05)
            if done:
                break
    env.close()
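# preprocess_observation is used above but not defined in this snippet. Below is a
# minimal, hypothetical sketch, assuming input_height=88, input_width=80 and
# input_channels=1 for the 210x160x3 MsPacman frames; the exact cropping and scaling
# are assumptions, not the original implementation.
import numpy as np

def preprocess_observation(obs):
    img = obs[1:176:2, ::2]         # crop the playfield and downsample to 88x80
    img = img.mean(axis=2)          # RGB -> grayscale
    img = (img - 128) / 128         # center and scale to roughly [-1, 1]
    return img.reshape(88, 80, 1)   # add the channel dimension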
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('Start to eval!\n')
    device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
    env = gym.make('CartPole-v0').unwrapped  # unwrapped gym env; usually not needed here
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device="cpu",
                gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay,
                policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity,
                batch_size=cfg.batch_size)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.choose_action(state, train=False)  # choose action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            state = next_state  # move to the next state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    # save the rewards and related results
    save_results(rewards, moving_average_rewards, ep_steps, tag='eval',
                 result_path=RESULT_PATH)
    print('Complete evaluation!')
def run_dqn(experiment_name):
    current_dir = pathlib.Path().absolute()
    directories = Save_paths(data_dir=f'{current_dir}/data',
                             experiment_name=experiment_name)

    game = Winter_is_coming(setup=PARAMS['setup'])
    environment = wrappers.SinglePrecisionWrapper(game)
    spec = specs.make_environment_spec(environment)

    # Build the network.
    def _make_network(spec) -> snt.Module:
        network = snt.Sequential([
            snt.Flatten(),
            snt.nets.MLP([50, 50, spec.actions.num_values]),
        ])
        tf2_utils.create_variables(network, [spec.observations])
        return network

    network = _make_network(spec)

    # Set up the logger
    if neptune_enabled:
        agent_logger = NeptuneLogger(label='DQN agent', time_delta=0.1)
        loop_logger = NeptuneLogger(label='Environment loop', time_delta=0.1)
        PARAMS['network'] = f'{network}'
        neptune.init('cvasquez/sandbox')
        neptune.create_experiment(name=experiment_name, params=PARAMS)
    else:
        agent_logger = loggers.TerminalLogger('DQN agent', time_delta=1.)
        loop_logger = loggers.TerminalLogger('Environment loop', time_delta=1.)

    # Build the agent
    agent = DQN(
        environment_spec=spec,
        network=network,
        params=PARAMS,
        checkpoint=True,
        paths=directories,
        logger=agent_logger
    )

    # Try running the environment loop. We have no assertions here because all
    # we care about is that the agent runs without raising any errors.
    loop = acme.EnvironmentLoop(environment, agent, logger=loop_logger)
    loop.run(num_episodes=PARAMS['num_episodes'])

    last_checkpoint_path = agent.save()

    # Upload the last checkpoint
    if neptune_upload_checkpoint and last_checkpoint_path:
        files = os.listdir(last_checkpoint_path)
        for f in files:
            neptune.log_artifact(os.path.join(last_checkpoint_path, f))

    if neptune_enabled:
        neptune.stop()

    do_example_run(game, agent)
def display(env):
    if env == "MountainCar-v0":
        agent = DQN(2, 3, eps_max=0, load_path="models/" + env + "_model.h5")
    else:
        agent = DQN(state_dim=(210, 160, 3), n_actions=14, eps_max=0,
                    load_path="models/" + env + "_model.h5")
    play(env, agent, train=False, render=True, episodes=1)
def play(game):
    agent = DQN(game, use_saved=True)
    for i in tqdm(range(PLAY_GAMES)):
        game.new_episode()
        done = False
        while not done:
            state = game.get_state()
            img = state.screen_buffer
            action = agent.act(img)
            print(action)
            game.make_action(action)
            done = game.is_episode_finished()
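# The `game` passed into play() above is a ViZDoom DoomGame instance. A minimal setup
# sketch, assuming a scenario config file such as "basic.cfg" (the config path is an
# assumption, not part of the original snippet):
from vizdoom import DoomGame

def make_game(config_path="basic.cfg"):
    game = DoomGame()
    game.load_config(config_path)  # scenario, screen format, available buttons, rewards
    game.init()                    # must be called before new_episode()/get_state()
    return game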
def train(cfg):
    print('Start to train!\n')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # detect GPU
    env = gym.make('CartPole-v0')
    env.seed(1)  # set the env random seed
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    agent = DQN(n_states=n_states, n_actions=n_actions, device=device,
                gamma=cfg.gamma, epsilon_start=cfg.epsilon_start,
                epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay,
                policy_lr=cfg.policy_lr, memory_capacity=cfg.memory_capacity,
                batch_size=cfg.batch_size)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()  # reset the environment state
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.choose_action(state)  # choose action for the current state
            next_state, reward, done, _ = env.step(action)  # step the environment
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)  # store the transition in replay memory
            state = next_state  # move to the next state
            agent.update()  # update the network every step
            if done:
                break
        # update the target network: copy all weights and biases from the policy net
        if i_episode % cfg.target_update == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done, ' Explore: %.2f' % agent.epsilon)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(
                0.9 * moving_average_rewards[-1] + 0.1 * ep_reward)
        writer.add_scalars('rewards', {'raw': rewards[-1],
                                       'moving_average': moving_average_rewards[-1]}, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    # save the model
    save_model(agent, model_path=SAVED_MODEL_PATH)
    # save the rewards and related results
    save_results(rewards, moving_average_rewards, ep_steps, tag='train',
                 result_path=RESULT_PATH)
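# save_model and save_results, as called in the two snippets above, are not defined
# here. A minimal, hypothetical sketch, assuming the agent exposes policy_net and that
# results are written as .npy files; the file names are assumptions, except
# 'checkpoint.pth', which the eval code above loads.
import os
import numpy as np
import torch

def save_model(agent, model_path):
    os.makedirs(model_path, exist_ok=True)
    torch.save(agent.policy_net.state_dict(), model_path + 'checkpoint.pth')

def save_results(rewards, moving_average_rewards, ep_steps, tag='train', result_path='./results/'):
    os.makedirs(result_path, exist_ok=True)
    np.save(result_path + f'{tag}_rewards.npy', rewards)
    np.save(result_path + f'{tag}_moving_average_rewards.npy', moving_average_rewards)
    np.save(result_path + f'{tag}_steps.npy', ep_steps)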
def train_dqn(episodes, env, render_frequency=0):
    now = datetime.datetime.now()
    id = f'{now.hour}{now.minute}'
    episode_rewards = []
    agent = DQN(env, params)
    best_score = 0

    for episode in range(episodes):
        rendering = render_frequency and episode % render_frequency == 0 and isinstance(
            env, HeadlessSnake)
        state = env.reset()  # Reset the environment before each episode to start fresh
        if rendering:
            renderer = Renderer(env, episode + 1)
            env.update_episode(episode + 1)
        # state = np.reshape(state, (1, env.state_space))
        total_reward = 0
        max_steps = 10000
        for step in range(max_steps):
            # 1. Find the next action using the epsilon-greedy exploration strategy
            action = agent.get_action(state)
            # 2. Perform the action in the environment
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            # next_state = np.reshape(next_state, (1, env.state_space))
            if rendering:
                renderer.update()
            # 3. Update the Q-function (train the model)
            agent.remember(state, action, reward, next_state, done)
            agent.train_with_experience_replay()
            # 4. Adjust the exploration vs. exploitation probability
            agent.update_exploration_strategy(episode)
            state = next_state
            if done:
                print(
                    f'episode: {episode+1}/{episodes}, score: {total_reward}, steps: {step}, '
                    f'epsilon: {agent.epsilon}, highscore: {env.maximum}')
                save_model(id, agent, best_score, total_reward)
                break
        if rendering:
            renderer.bye()
        save_model(id, agent, best_score, total_reward)
        episode_rewards.append(total_reward)
    return episode_rewards
def run(ep, train=False):
    pygame.init()
    loss = []
    agent = DQN(3, 5)
    env = pongGame()
    weights_filepath = 'PongGame.h5'
    if not train:
        agent.model.load_weights(weights_filepath)
        print("weights loaded")
    for e in range(ep):
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()
        state = env.reset()
        state = np.reshape(state, (1, 5))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = env.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, 5))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if train:
                agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, ep, score))
                break
        loss.append(score)
    if train:
        agent.model.save_weights("PongGame.h5")
    return loss
def main(args):
    # load env
    env = gym.make('CartPole-v0')
    # load agent
    agent = DQN(env)
    agent.construct_model(args.gpu)

    # load a saved model or init a new one
    saver = tf.train.Saver()
    if args.model_path is not None:
        # reuse saved model
        saver.restore(agent.sess, args.model_path)
    else:
        # build a new model
        agent.init_var()

    # training loop
    for ep in range(args.ep):
        # reset env
        total_rewards = 0
        state = env.reset()

        while True:
            env.render()
            # sample actions
            action = agent.sample_action(state, policy='greedy')
            # act!
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            # state shift
            state = next_state
            if done:
                break
        print('Ep%s Reward: %s ' % (ep + 1, total_rewards))
def train(game):
    agent = DQN(game)
    for i in tqdm(range(TRAIN_GAMES)):
        game.new_episode()
        previous_variables = None
        previous_img = None
        done = False
        local_history = []
        total_reward = 0
        while not done:
            state = game.get_state()
            img = state.screen_buffer
            variables = state.game_variables
            if previous_variables is None:
                previous_variables = variables
            if previous_img is None:
                previous_img = img
            action = agent.act(img)
            reward = game.make_action(action)
            done = game.is_episode_finished()
            reward = (reward + calculate_additional_reward(previous_variables, variables)) / 100
            total_reward += reward
            local_history.append([previous_img, img, reward, action, done])
            previous_variables = variables
            previous_img = img
        if total_reward >= 0:
            for previous_state, state, reward, action, done in local_history:
                agent.remember(previous_state, state, reward, action, done)
            agent.train()
def main():
    get_env_version()
    cfg = DQNConfig(env="CartPole-v0", train_eps=200)
    # cfg = DQNConfig(env="MountainCar-v0", train_eps=500)
    get_env_information(env_name=cfg.env)
    env = gym.make(cfg.env)
    env.seed(0)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = DQN(state_dim, action_dim, cfg)
    rewards, smooth_rewards = train(cfg, env, agent)
    os.makedirs(cfg.result_path)
    agent.save(path=cfg.result_path)
    save_results(rewards, smooth_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, smooth_rewards, tag='train', env=cfg.env,
                 algo=cfg.algo, path=cfg.result_path)
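# The train(cfg, env, agent) call above is not defined in this snippet. A minimal,
# hypothetical sketch of such a loop, assuming the agent exposes choose_action(),
# memory.push() and update() as in the other snippets here, and that cfg carries
# train_eps:
def train(cfg, env, agent):
    rewards, smooth_rewards = [], []
    for i_ep in range(cfg.train_eps):
        state = env.reset()
        done = False
        ep_reward = 0
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.memory.push(state, action, reward, next_state, done)  # store transition
            agent.update()                                              # one learning step
            state = next_state
            ep_reward += reward
        rewards.append(ep_reward)
        # exponentially smoothed reward curve
        smooth_rewards.append(ep_reward if not smooth_rewards
                              else 0.9 * smooth_rewards[-1] + 0.1 * ep_reward)
    return rewards, smooth_rewards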
def test_dqn(env):
    agent = DQN(env, params)
    agent.load_model(sys.argv[1], sys.argv[2])
    state = env.reset()  # Reset the environment before each episode to start fresh
    state = np.reshape(state, (1, env.state_space))
    max_steps = 10000
    total_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        state = np.reshape(next_state, (1, env.state_space))
        total_reward += reward
        time.sleep(0.1)
        if done:
            print(f'Score: {total_reward}, steps: {step}')
            break
    return
def train(env):
    if env == "MountainCar-v0":
        agent = DQN(2, 3)
        play(env, agent, until=195, ckpt=True)
    else:
        agent = DQN(state_dim=(210, 160, 3), n_actions=14)
        play(env, agent)
    agent.save_model(env)
def main():
    sess = tf.Session(config=cf.tf_config)
    dqn = DQN(cf, sess)
    sess.run(tf.global_variables_initializer())
    if bool(args.e):
        dqn.evaluate(load_model=True)
    else:
        dqn.learn()
    sess.close()
def _build_layer(self):
    # All neurons are binary except the output layer
    num_outputs = self.args['num_outputs'] if self.ID == self.args['num_layers'] else 2
    for ID in range(self.out_shape):
        if self.args['neuron_type'] == 'PG':
            neuron = PG(args=self.args, in_shape=self.in_shape, ID=ID,
                        num_outputs=num_outputs)
        elif self.args['neuron_type'] == 'DQN':
            neuron = DQN(args=self.args, in_shape=self.in_shape, ID=ID,
                         num_outputs=num_outputs)
        else:
            neuron = Random(args=self.args, in_shape=self.in_shape, ID=ID,
                            num_outputs=num_outputs)
        self.neurons.append(neuron)
def train_model():
    iteration = 0
    loss_val = np.infty
    game_length = 0
    total_max_q = 0
    mean_max_q = 0.0
    done = True
    state = []

    dqn = DQN()
    env = gym.make("MsPacman-v0")
    X_state = tf.placeholder(
        tf.float32, shape=[None, input_height, input_width, input_channels])
    online_q_values, online_vars = dqn.create_model(X_state, "qnetwork_online")
    target_q_values, target_vars = dqn.create_model(X_state, "qnetwork_target")
    copy_ops = [
        target_var.assign(online_vars[var_name])
        for var_name, target_var in target_vars.items()
    ]
    copy_online_to_target = tf.group(*copy_ops)
    X_action, global_step, loss, training_op, y = define_train_variables(
        online_q_values)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        restore_session(copy_online_to_target, init, saver, sess)
        while True:
            step = global_step.eval()
            if step >= n_steps:
                break
            iteration += 1
            print(
                "\rIteration {}\tTraining step {}/{} ({:.1f})%\tLoss {:5f}\tMean Max-Q {:5f} "
                .format(iteration, step, n_steps, step * 100 / n_steps,
                        loss_val, mean_max_q), end="")
            state = skip_some_steps(done, env, state)
            done, q_values, next_state = evaluate_and_play_online_dqn(
                X_state, env, online_q_values, state, step)
            state = next_state
            mean_max_q = compute_statistics(done, game_length, mean_max_q,
                                            q_values, total_max_q)
            if iteration < training_start or iteration % training_interval != 0:
                continue

            loss_val = train_online_dqn(X_action, X_state, loss, sess,
                                        target_q_values, training_op, y)

            # Copy the online DQN to the target DQN
            if step % copy_steps == 0:
                copy_online_to_target.run()

            # Save the model
            if step % save_steps == 0:
                saver.save(sess, checkpoint_path)
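# define_train_variables is not shown here. A hypothetical sketch following the usual
# TF1 DQN pattern: gather Q(s, a) for the taken action with a one-hot mask and regress
# it toward the target y (n_outputs and learning_rate are assumptions):
def define_train_variables(online_q_values, n_outputs=9, learning_rate=0.001):
    X_action = tf.placeholder(tf.int32, shape=[None])
    y = tf.placeholder(tf.float32, shape=[None, 1])
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs),
                            axis=1, keepdims=True)
    loss = tf.reduce_mean(tf.square(y - q_value))
    global_step = tf.Variable(0, trainable=False, name='global_step')
    training_op = tf.train.AdamOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    return X_action, global_step, loss, training_op, y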
def run(
    agent_type="dqn",
    hidden_layer_size=32,
    gamma=1.0,
    min_epsilon=0.001,
    learning_rate=2.5e-4,
    env_name="CartPole-v0",
    num_episodes=3000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
):
    env = gym.make(env_name)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "dense",
            "hidden_layers": (hidden_layer_size, hidden_layer_size),
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon
    }
    agent = DQN(
        cfg,
        env.observation_space.shape,
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity,
            alpha=0.6,
            anneal_alpha_rate=1e-5,
            anneal_beta_rate=1e-5
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [
        AverageObserver(log_interval),
        MaximumObserver(log_interval)
    ]

    train(
        env,
        agent,
        buffer,
        num_episodes=num_episodes,
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer
    )
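# Example invocation of run() (the values shown are simply the defaults above):
if __name__ == "__main__":
    run(agent_type="dqn", env_name="CartPole-v0", num_episodes=3000,
        use_prioritized_experience_buffer=False)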
def main():
    env = retro.make(game='Frogger-Genesis',
                     use_restricted_actions=retro.Actions.DISCRETE)
    gamma = 0.99
    copy_step = 25
    num_actions = env.action_space.n
    num_states = len(env.observation_space.sample())
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    # For stable targets, train one net and copy its weights to TargetNet every copy_step steps
    TrainNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                   min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                   hidden_units=hidden_units, num_states=num_states)
    TargetNet = DQN(num_actions=num_actions, gamma=gamma, max_experiences=max_experiences,
                    min_experiences=min_experiences, batch_size=batch_size, lr=lr,
                    hidden_units=hidden_units, num_states=num_states)

    # Loading check
    while True:
        if os.path.exists(save_dir):
            if input("\n\nWould you like to load the previous network weights? (y/n) ") == 'y':
                # load weights and copy them to the train net
                TargetNet.load_model(save_path)
                TrainNet.copy_weights(TargetNet)
                print("Loaded model weights...")
                break
            elif input("\nWould you like to delete the old checkpoints and start again? (y/n)") == 'y':
                shutil.rmtree(save_dir)
                print("Removed old checkpoint...")
                break
        else:
            break

    N = 50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1

    # play N games
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        with summary_writer.as_default():
            tf.summary.scalar("episode reward", total_reward, step=n)
            tf.summary.scalar("running avg reward(100)", avg_rewards, step=n)
        if n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward, "eps:", epsilon,
                  "avg reward (last 100):", avg_rewards)
            # save the model weights
            TargetNet.save_model(save_path)
    print("avg reward for last 100 episodes:", avg_rewards)
    if create_video:
        make_video(env, TrainNet)
    env.close()
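# play_game is not defined in this snippet. A hypothetical sketch of one episode;
# get_action, add_experience and train are assumed method names of the DQN class above
# (only copy_weights, load_model and save_model actually appear in the snippet):
def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    observations = env.reset()
    done = False
    total_reward = 0
    step = 0
    while not done:
        action = TrainNet.get_action(observations, epsilon)   # epsilon-greedy action
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        total_reward += reward
        TrainNet.add_experience({'s': prev_observations, 'a': action, 'r': reward,
                                 's2': observations, 'done': done})
        TrainNet.train(TargetNet)                             # one gradient step
        step += 1
        if step % copy_step == 0:
            TrainNet.copy_weights(TargetNet)                  # sync the target network
    return total_reward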
import matplotlib.pyplot as plt
from collections import deque

import gym
from gym.wrappers import Monitor
from agent import DQN, preprocess
import numpy as np
import gym_ple

if __name__ == '__main__':
    N_EP = 10000
    N_SAVE = 500
    env = gym.make('FlappyBird-v0')
    agent = DQN(env)
    scores = deque(maxlen=100)
    for i in range(N_EP):
        score = 0
        ob = env.reset()
        # Stack observations
        pre_ob = preprocess(ob)
        pre_ob = pre_ob.reshape(1, 100, 100)
        ob_stack = np.stack((pre_ob, ) * 4, -1)
        pre_ob = ob_stack
        while True:
            action = agent.act(pre_ob, step=i)
            ob, reward, done, _ = env.step(action)
def main(args):
    set_random_seed(args.seed)
    env = gym.make('CartPole-v0')
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load pretrained model or init a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and update net parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(
                rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # Decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if the current one outperforms the old one
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
def main(args):
    # Hyperparameters
    MAX_EPISODE = 10000     # number of training episodes
    INITIAL_EPSILON = 0.5   # starting value of epsilon
    FINAL_EPSILON = 0.01    # final value of epsilon
    TEST_EPISODE = 100

    env = gym.make('CartPole-v0')
    agent = DQN(env, double_q=args.double)
    agent.construct_model(args.gpu)

    saver = tf.train.Saver(max_to_keep=2)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        mean_rewards = None

    # Training
    for ep in range(MAX_EPISODE):
        state = env.reset()
        for step in range(env.spec.timestep_limit):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and update net parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # Update epsilon
        if agent.epsilon > FINAL_EPSILON:
            agent.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / MAX_EPISODE

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(TEST_EPISODE):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            mean_rewards = total_reward / float(TEST_EPISODE)
            print('Episode:', ep + 1, ' Average Reward:', mean_rewards)
            print('Global steps:', agent.global_step)
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            save_name = args.save_path + str(round(mean_rewards, 2)) + '_' \
                + str(ep_base + ep + 1)
            saver.save(agent.sess, save_name)
            step += 1
        # end of game
        # print("game over")
        # print(observation, '\n')
        result.append(np.max(observation))


if __name__ == "__main__":
    GAME = chessboard("GAME")
    result = []
    RL = DQN(
        GAME.n_actions, GAME.n_features,
        learning_rate=0.01,
        reward_decay=0.93,
        e_greedy=0.93,
        replace_target_iter=100,
        memory_size=20000,
        # output_graph=True
    )
    run_game()
    RL.show_cost()

    import matplotlib.pyplot as plt
    plt.plot(np.arange(len(result)), result)
    plt.ylabel('max')
    plt.xlabel('games')
    plt.show()
logger.addHandler(fh)

# Check whether cuda is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialise the game
env = gym.make('ChromeDino-v0')
# env = gym.make('ChromeDinoNoBrowser-v0')
env = make_dino(env, timer=True, frame_stack=True)

# Get the number of actions and the dimension of input
n_actions = env.action_space.n

# ----------- Nature DQN ---------------
dqn = DQN(n_actions, device)
dqn.train(env, logger)
# dqn.load("./trained/dqn.pkl")
# dqn.test(env)

# ----------- Prioritized DQN ---------------
# dqn_p = DQNPrioritized(n_actions, device)
# dqn_p.train(env, logger)
# dqn_p.load("./trained/dqn_p.pkl")
# dqn_p.test(env)

# ----------- Double DQN ----------------
# double_dqn = DoubleDQN(n_actions, device)
# double_dqn.train(env, logger)
# double_dqn.load("./trained/double-dqn.pkl")
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get the screen size so that we can initialize layers correctly based on the shape
# returned from the AI gym. Typical dimensions at this point are close to 3x40x90,
# which is the result of a clamped and down-scaled render buffer in get_screen().
init_screen = get_screen(env).to(device)
_, screen_ch, screen_height, screen_width = init_screen.shape
n_action = env.action_space.n

policy_net = DQN(screen_ch * args.num_frames, screen_height, screen_width,
                 n_action).to(device)
target_net = DQN(screen_ch * args.num_frames, screen_height, screen_width,
                 n_action).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
}

# MountainCar hyperparameters
MountainCar_HYPER_PARAMETERS = {
    'MEM_REPLAY_SIZE': 150000,
    'BATCH_SIZE': 512,
    'GAMMA': 0.999,
    'EPS_START': 1,
    'EPS_END': 0.1,
    'EPS_DECAY': 1000,
    'EVALUATE_FREQUENCY': 1,
    'ALTER_TARGET_UPDATE_RATE': 0.999,
    'MAX_EPISODES': 1000
}

# Acrobot hyperparameters
Acrobot_HYPER_PARAMETERS = {
    'MEM_REPLAY_SIZE': 150000,
    'BATCH_SIZE': 128,
    'GAMMA': 0.999,
    'EPS_START': 1,
    'EPS_END': 0.1,
    'EPS_DECAY': 1000,
    'EVALUATE_FREQUENCY': 20,
    'ALTER_TARGET_UPDATE_RATE': 0.995,
    'MAX_EPISODES': 1000
}

DQN.train_model(MountainCar_HYPER_PARAMETERS, envs[2])
DDQN.train_model(MountainCar_HYPER_PARAMETERS, envs[2])
SAVED_MODEL_PATH = 'D:/unity2017/water/ai/saved_model/'

# Create the socket, bind the address/port and set the listen backlog
listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
listener.bind(('127.0.0.1', 50213))
listener.listen(5)
print('Waiting for connect...')
while True:
    client_executor, addr = listener.accept()
    if addr is not None:
        break
print('Accept new connection from %s:%s...' % addr)

agent = DQN(pretrained=True)
state = torch.zeros((150, 6), device=device, dtype=torch.float)
state[0][5] = 0.26
state[0][1] = 4.75
state = state.unsqueeze(0)
reward = 0
for i in range(6005):
    if i == 0:
        action = 50
        # action = torch.zeros((1), device=device, dtype=torch.float, requires_grad=False)
    else:
        action = agent.choose_action(state)
    msg = client_executor.recv(16384).decode('utf-8')
    client_executor.send(bytes(str(action / 10 - 5).encode('utf-8')))
# main body
import gym

from agent import DQN, train
from wrapper import AtariWrapper
from replay import UniformReplayBuffer, PrioritizedReplayBuffer
from observer import AverageObserver, MaximumObserver

env = gym.make(config["env"]["name"])
if config["env"]["is_atari"]:
    env = AtariWrapper(env, **config["env"]["wrapper"])

agent = DQN(
    config["agent"],
    env.observation_space.shape,
    env.action_space.n,
)

if config["buffer"]["use_per"]:
    buffer = PrioritizedReplayBuffer(
        size=config["buffer"]["size"],
        alpha=config["buffer"]["alpha"],
        beta=config["buffer"]["beta"],
        anneal_alpha_rate=config["buffer"]["anneal_alpha_rate"],
        anneal_beta_rate=config["buffer"]["anneal_beta_rate"]
    )
else:
    buffer = UniformReplayBuffer(config["buffer"]["size"])

observer = []
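# The `config` dict consumed above is not shown. A hypothetical example, using only
# the keys referenced in this snippet (and the agent keys used by run() earlier);
# all values are placeholders:
config = {
    "env": {
        "name": "CartPole-v0",
        "is_atari": False,
        "wrapper": {},                 # kwargs forwarded to AtariWrapper when is_atari
    },
    "agent": {
        "type": "dqn",
        "network": {"type": "dense", "hidden_layers": (32, 32)},
        "gamma": 1.0,
        "min_epsilon": 0.001,
    },
    "buffer": {
        "use_per": False,
        "size": 10**5,
        "alpha": 0.6,
        "beta": 0.4,
        "anneal_alpha_rate": 1e-5,
        "anneal_beta_rate": 1e-5,
    },
}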
def main(args):
    env = gym.make("CartPole-v0")
    if args.seed >= 0:
        random_seed(args.seed)
        env.seed(args.seed)

    agent = DQN(env, args)
    model = get_model(out_dim=env.action_space.n, lr=args.lr)
    agent.set_model(model)

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.timestep_limit):
            # sample action
            action = agent.sample_action(state, policy="egreedy")
            # apply action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # train
            agent.train(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # Decay epsilon
        if agent.epsilon > args.final_epsilon:
            decay = (args.init_epsilon - args.final_epsilon) / args.max_ep
            agent.epsilon -= decay

        # Evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.timestep_limit):
                    if args.render:
                        env.render()
                    action = agent.sample_action(state, policy="greedy")
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print("Episode: %d Average Reward: %.2f" % (ep + 1, current_mean_rewards))

    # plot training rewards
    plt.plot(steps_history, rewards_history)
    plt.xlabel("steps")
    plt.ylabel("running avg rewards")
    plt.show()
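# get_model is not defined in this snippet. A hypothetical sketch, assuming a small
# Keras MLP Q-network for CartPole (the layer sizes and 4-dimensional input are
# assumptions):
import tensorflow as tf

def get_model(out_dim, lr, in_dim=4):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(in_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(out_dim, activation='linear'),  # one Q-value per action
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='mse')
    return model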
_process_data = my_process_data

x = input('''To train model: train,
To test a trained model: test,
To train on different dataset: d: ''')

if x == 'd':
    dataset = input('Enter name of dataset as "example_dataset.csv": ')
    try:
        raw = preprocess(dataset)
    except:
        print('Invalid dataset')
        raw = preprocess(dataset)
    actions = 2
    states = 7
    env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
    agent = DQN(actions, states, 100)
    all_rewards = agent.train(env, 1000)
elif x == 'test':
    raw = preprocess()
    env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
    all_rewards = trained_test('dqn_model.h5', env)
else:
    raw = preprocess()
    actions = 2
    states = 7
    env = MyStocksEnv(raw, window_size=1, frame_bound=(1, 300))
    agent = DQN(actions, states, 100)
    all_rewards = agent.train(env, 1000)

if all_rewards != 0:
    print(all_rewards)
def main(args):
    set_random_seed(args.seed)
    env = gym.make("CartPole-v0")
    agent = DQN(env, args)
    agent.construct_model(args.gpu)

    # load pre-trained model or init a new one
    saver = tf.train.Saver(max_to_keep=1)
    if args.model_path is not None:
        saver.restore(agent.sess, args.model_path)
        ep_base = int(args.model_path.split('_')[-1])
        best_mean_rewards = float(args.model_path.split('/')[-1].split('_')[0])
    else:
        agent.sess.run(tf.global_variables_initializer())
        ep_base = 0
        best_mean_rewards = None

    rewards_history, steps_history = [], []
    train_steps = 0
    # Training
    for ep in range(args.max_ep):
        state = env.reset()
        ep_rewards = 0
        for step in range(env.spec.max_episode_steps):
            # pick action
            action = agent.sample_action(state, policy='egreedy')
            # execute action
            next_state, reward, done, debug = env.step(action)
            train_steps += 1
            ep_rewards += reward
            # modified reward to speed up learning
            reward = 0.1 if not done else -1
            # learn and update net parameters
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        steps_history.append(train_steps)
        if not rewards_history:
            rewards_history.append(ep_rewards)
        else:
            rewards_history.append(rewards_history[-1] * 0.9 + ep_rewards * 0.1)

        # decay epsilon
        if agent.epsilon > args.final_epsilon:
            agent.epsilon -= (args.init_epsilon - args.final_epsilon) / args.max_ep

        # evaluate during training
        if ep % args.log_every == args.log_every - 1:
            total_reward = 0
            for i in range(args.test_ep):
                state = env.reset()
                for j in range(env.spec.max_episode_steps):
                    action = agent.sample_action(state, policy='greedy')
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            current_mean_rewards = total_reward / args.test_ep
            print('Episode: %d Average Reward: %.2f' %
                  (ep + 1, current_mean_rewards))
            # save the model if the current one outperforms the old one
            if best_mean_rewards is None or (current_mean_rewards >= best_mean_rewards):
                best_mean_rewards = current_mean_rewards
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)
                save_name = args.save_path + str(round(best_mean_rewards, 2)) \
                    + '_' + str(ep_base + ep + 1)
                saver.save(agent.sess, save_name)
                print('Model saved %s' % save_name)

    plt.plot(steps_history, rewards_history)
    plt.xlabel('steps')
    plt.ylabel('running avg rewards')
    plt.show()
print('Number of agents:', len(env_info.agents))

# number of actions
action_size = brain.vector_action_space_size
print('Number of actions:', action_size)

# examine the state space
state = env_info.vector_observations[0]
print('States look like:', state)
state_size = len(state)
print('States have length:', state_size)

# Train Agent ##################################################################
from agent import DQN, Double_DQN
from training import train_agent

agent = DQN(state_size=state_size, action_size=action_size, seed=0)


def train(n_episodes=100, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores