import copy

import numpy as np


def run(env, batch_size, agent, memory, discount, steps=300, episode_i=0,
        eps=.9, render=False, normalize=False):
    state = env.reset()
    done = False
    acc_reward = 0.0
    loss = 0.0
    for i in range(steps):
        if done:
            break
        # eps should decay over time; use the value passed in by the caller
        action = agent.move(state, eps=eps)
        # print("state:", state.shape, state)
        if normalize:
            state = featurize_state(state)
        next_state, reward, done, _ = env.step(action)
        acc_reward += reward
        memory.add((state, action, next_state, reward, done))
        if render:
            env.render()
        if len(memory.memory) > batch_size:
            state_m, action_m, next_state_m, reward_m, done_m = zip(
                *memory.sample(batch_size))
            state_m = np.array(state_m)
            action_m = np.array(action_m)
            next_state_m = np.array(next_state_m)
            reward_m = np.array(reward_m)
            done_m = np.array(done_m)
            q_m = agent.predict(next_state_m)
            actual_target_m = reward_m + (1. - done_m) * discount * np.amax(
                q_m, axis=1)
            targets = agent.predict(state_m)
            # Assign the bootstrapped target to the action that was taken.
            # Use a separate index so the episode step counter `i` is not clobbered.
            for j, a in enumerate(action_m):
                targets[j, a] = actual_target_m[j]
            loss = agent.train(states=state_m, targets=targets)
        state = copy.copy(next_state)
        # print("acc_reward:", acc_reward)
    return acc_reward, i, loss

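# --- Usage sketch (not part of the original file) ---
# The comment above notes that eps should decay over time. This is a minimal
# sketch of an outer loop that decays epsilon per episode and feeds it into
# run(); `env`, `agent` and `memory` are assumed to be constructed elsewhere,
# and the decay schedule is illustrative only.
eps_start, eps_min, eps_decay = 1.0, 0.05, 0.995
eps = eps_start
for episode in range(500):
    acc_reward, steps_taken, loss = run(
        env, batch_size=32, agent=agent, memory=memory, discount=0.99,
        episode_i=episode, eps=eps)
    eps = max(eps_min, eps * eps_decay)  # multiplicative decay toward eps_min
    if episode % 10 == 0:
        print("episode %d: reward=%.1f steps=%d loss=%.4f eps=%.3f"
              % (episode, acc_reward, steps_taken, loss, eps))
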
parser.add_argument('--model')
parser.add_argument('--eps')  # number of training episodes
parser.add_argument('--render')
args = parser.parse_args()

writer = None
write_proc = None
if args.tensorboard:
    writer = SummaryWriter()
    write_proc = subprocess.Popen(
        ['tensorboard', '--logdir', '{}'.format(args.tensorboard)])

env = env.Environment(args.env)
if args.alg == 'DQN':
    agent = agent.DQNAgent(env, args.mode, args.model, writer)

try:
    if args.mode == 'train':
        agent.train(int(args.eps), args.render)
    elif args.mode == 'play':
        agent.play(int(args.eps))
except KeyboardInterrupt:
    print('PROCESS KILLED BY USER')
finally:
    env.close()
    if args.tensorboard:
        write_proc.terminate()

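# --- Assumed parser setup (reconstruction, not shown in the original snippet) ---
# The snippet above reads args.env, args.alg, args.mode and args.tensorboard
# but only defines --model, --eps and --render, so the parser was presumably
# built earlier in the file. A minimal sketch of what that setup might look like:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env')          # gym environment id
parser.add_argument('--alg')          # e.g. 'DQN'
parser.add_argument('--mode')         # 'train' or 'play'
parser.add_argument('--tensorboard')  # log dir; enables the TensorBoard subprocess
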
import agent
import environment
import replay

env = environment.Environment('Breakout-v0')
replay = replay.ExperienceReplay(env)
agent = agent.Agent(env, replay)
# agent.restore()
agent.train()

def run(
    agent_type="dqn",
    hidden_layer_size=32,
    gamma=1.0,
    min_epsilon=0.001,
    learning_rate=2.5e-4,
    env_name="CartPole-v0",
    num_episodes=3000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
):
    env = gym.make(env_name)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "dense",
            "hidden_layers": (hidden_layer_size, hidden_layer_size),
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon,
    }
    agent = DQN(
        cfg,
        env.observation_space.shape,
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity,
            alpha=0.6,
            anneal_alpha_rate=1e-5,
            anneal_beta_rate=1e-5,
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [AverageObserver(log_interval), MaximumObserver(log_interval)]

    train(
        env,
        agent,
        buffer,
        num_episodes=num_episodes,
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer,
    )

def main(args):
    if args.seed is not None:
        print("Setting random seed: %d" % args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    job_dir = args.job_dir if args.job_dir.startswith('gs') else os.path.join(
        args.job_dir, datetime.now().strftime('%Y%m%d%H%M%S'))
    if not tf.io.gfile.exists(job_dir):
        tf.io.gfile.makedirs(job_dir)
    print('Job dir: %s' % job_dir)

    board = Board()
    agent = Agent(board.size,
                  hidden_size=args.agent_net_size,
                  num_conv=args.agent_net_conv)

    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        args.lr,
        int((args.epoch_games * 60 * args.lr_decay_epochs) / args.batch_size),
        args.lr_decay)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    metrics_step = tf.Variable(1, dtype=tf.int64)
    checkpoint = tf.train.Checkpoint(step=tf.Variable(1, dtype=tf.int64),
                                     optimizer=optimizer,
                                     net=agent)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, os.path.join(job_dir, 'checkpoints'), max_to_keep=None)
    if args.contest_to_update:
        temp_checkpoint_manager = tf.train.CheckpointManager(
            checkpoint, os.path.join(job_dir, 'temp_checkpoint'), max_to_keep=1)

    metrics_writer = tf.summary.create_file_writer(
        os.path.join(job_dir, 'metrics'))

    try:
        ray.init(num_cpus=args.num_cpus)
        checkpoint_manager.save()
        with metrics_writer.as_default():
            for e in range(args.epochs):
                if args.contest_to_update:
                    # Restore the last accepted agent parameters
                    checkpoint.restore(checkpoint_manager.latest_checkpoint)

                # Benchmark
                if e % 5 == 0:
                    t = time.time()
                    for name, (wins, losses) in benchmark_agent(
                            checkpoint_manager.latest_checkpoint, board.size,
                            args).items():
                        tf.summary.scalar('benchmark/%s/wins' % name,
                                          wins / args.benchmark_games,
                                          step=metrics_step)
                        tf.summary.scalar('benchmark/%s/losses' % name,
                                          losses / args.benchmark_games,
                                          step=metrics_step)
                        tf.summary.scalar(
                            'benchmark/%s/draws' % name,
                            (args.benchmark_games - wins - losses) /
                            args.benchmark_games,
                            step=metrics_step)
                    ttrb = float(time.time() - t)
                    tf.summary.scalar('perf/time_to_run_benchmarks',
                                      ttrb,
                                      step=metrics_step)
                    print('Time to run benchmarks: %.4f' % ttrb)

                # Collect epoch samples
                print('Epoch: %d' % e)
                t = time.time()
                samples, stats = collect_samples(
                    checkpoint_manager.checkpoints, board.size, args)
                ttcs = float(time.time() - t)
                for key, val in stats.items():
                    tf.summary.scalar('game_metrics/%s' % key,
                                      val,
                                      step=metrics_step)
                tf.summary.scalar('perf/time_to_collect_samples',
                                  ttcs,
                                  step=metrics_step)
                print('Time to collect samples: %.4f' % ttcs)

                for (states, action_probabilities, action_indices,
                     state_values, rewards) in batches(samples,
                                                       args.batch_size):
                    if np.any(np.isnan(action_probabilities)):
                        raise ValueError('NaN Action P')
                    loss = train(
                        agent,
                        optimizer,
                        tf.convert_to_tensor(states, dtype=tf.float32),
                        tf.convert_to_tensor(action_probabilities,
                                             dtype=tf.float32),
                        tf.convert_to_tensor(action_indices, dtype=tf.int32),
                        tf.convert_to_tensor(state_values, dtype=tf.float32),
                        tf.convert_to_tensor(rewards, dtype=tf.float32),
                    )
                    tf.summary.scalar('train/loss', loss, step=metrics_step)
                    tf.summary.scalar(
                        'train/mean_advantage',
                        tf.reduce_mean(
                            tf.convert_to_tensor(rewards, dtype=tf.float32) -
                            tf.convert_to_tensor(state_values,
                                                 dtype=tf.float32)),
                        step=metrics_step)
                    metrics_step.assign_add(1)

                checkpoint.step.assign_add(1)

                if args.contest_to_update:
                    # Update parameters only if the new agent beats the old one.
                    temp_checkpoint_manager.save()
                    t = time.time()
                    new_wins, old_wins = compare_agents(
                        checkpoint_manager.latest_checkpoint,
                        temp_checkpoint_manager.latest_checkpoint, args)
                    tf.summary.scalar('perf/time_to_compare_agents',
                                      float(time.time() - t),
                                      step=metrics_step)
                    total_games = new_wins + old_wins
                    if total_games > 0:
                        tf.summary.scalar('train/new_agent_win_rate',
                                          new_wins / total_games,
                                          step=metrics_step)
                        if new_wins / total_games >= args.win_rate_threshold:
                            checkpoint_manager.save()
                else:
                    checkpoint_manager.save()
    finally:
        ray.shutdown()

import agent
from environment import GymEnvironment
import tensorflow as tf

env_agent = GymEnvironment()
agent = agent.DQNAgent(environment=env_agent)

with tf.Session() as sess:
    agent.build_dqn(sess)
    sess.run(tf.global_variables_initializer())
    agent.train(episodes=50000)

import gym
import model
import agent

env = gym.make('CartPole-v0')
model = model.Model(num_actions=env.action_space.n)
obs = env.reset()

# fetch agent class
agent = agent.Agent(model)

# train agent
rewards_history = agent.train(env)
print("Finished training, testing...")

# test fully trained agent
print("%d out of 200" % agent.test(env))  # score out of 200

env = AtariWrapper(env, **config["env"]["wrapper"])
agent = DQN(
    config["agent"],
    env.observation_space.shape,
    env.action_space.n,
)

if config["buffer"]["use_per"]:
    buffer = PrioritizedReplayBuffer(
        size=config["buffer"]["size"],
        alpha=config["buffer"]["alpha"],
        beta=config["buffer"]["beta"],
        anneal_alpha_rate=config["buffer"]["anneal_alpha_rate"],
        anneal_beta_rate=config["buffer"]["anneal_beta_rate"],
    )
else:
    buffer = UniformReplayBuffer(config["buffer"]["size"])

observer = []
if config["train"]["display_average_reward"]:
    observer.append(AverageObserver(config["train"]["log_interval"]))
if config["train"]["display_max_reward"]:
    observer.append(MaximumObserver(config["train"]["log_interval"]))

c = config["train"]
c.update(config["misc"])
c["observer"] = observer
history = train(env, agent, buffer, **c)
logging.info(history)

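# --- Example config (illustrative only, not from the original file) ---
# A minimal dict containing just the keys the snippet above reads; the real
# project presumably loads this from a config file, and the values here are
# placeholders.
config = {
    "env": {"wrapper": {}},                   # kwargs forwarded to AtariWrapper
    "agent": {"type": "dqn", "gamma": 0.99},  # passed straight through to DQN(...)
    "buffer": {
        "use_per": True,
        "size": 10**5,
        "alpha": 0.6,
        "beta": 0.4,
        "anneal_alpha_rate": 1e-5,
        "anneal_beta_rate": 1e-5,
    },
    "train": {
        "log_interval": 100,
        "display_average_reward": True,
        "display_max_reward": True,
        # ...plus whatever keyword arguments train() expects
    },
    "misc": {},                                # merged into the train kwargs
}
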
print("result", result) if __name__ == '__main__': train( Checkers, 'checkers-4', model_width=64, #alpha_steps={0: 0.1, 5: 0.01, 10: 0.001, 20: 0.0001}, alpha_steps={0: 0.001}, #discount_steps={0:0.7, 30: 0.9, 60: 0.99}, discount_steps={0: 0.99}, epsilon_steps={ 0: 1, 5: 0.7, 20: 0.5, 50: 0.3 }, #epsilon_steps={0: 1}, num_models=2, epoch_size=1000, num_epochs=1000, sample_size=1000, num_samples=100, play_at_end=False, saveDir='/home/sam/scratch/tflow', loadModels=False) """ train(TicTacToe, 'tictactoe-small', model_width=64, alpha_steps={0: 0.01, 10: 0.001, 30: 0.0001},
def run(
    agent_type="dqn",
    gamma=1.0,
    min_epsilon=0.1,
    learning_rate=2.5e-4,
    env_name="MsPacman-v0",
    use_wrapper=True,
    num_episodes=1000,
    log_interval=100,
    replay_buffer_capacity=10**5,
    use_prioritized_experience_buffer=False,
    max_steps_per_episode=10000,
    batch_size=32,
    use_soft_update=False,
    online_update_period=1,
    target_update_tau=1,
    target_sync_period=100,
    decay_rate=1e-5,
    num_saves=0,
    saved_model_dir=None,
    warm_up=10000,
):
    env = gym.make(env_name)
    if use_wrapper:
        # convert (210, 160, 3) frames to (84, 84, 1)
        env = AtariWrapper(env)

    cfg = {
        "type": agent_type,
        "network": {
            "type": "conv2d",
            "structure": None,
        },
        "gamma": gamma,
        "min_epsilon": min_epsilon,
    }
    agent = DQN(
        cfg,
        env.observation_space.shape,
        env.action_space.n,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss_function=tf.keras.losses.MeanSquaredError(),
    )

    if use_prioritized_experience_buffer:
        buffer = PrioritizedReplayBuffer(
            size=replay_buffer_capacity,
            alpha=0.6,
            anneal_alpha_rate=1e-5,
            anneal_beta_rate=1e-5,
        )
    else:
        buffer = UniformReplayBuffer(size=replay_buffer_capacity)

    observer = [AverageObserver(log_interval), MaximumObserver(log_interval)]

    train(
        env,
        agent,
        buffer,
        num_episodes=num_episodes,
        max_steps_per_episode=max_steps_per_episode,
        batch_size=batch_size,
        online_update_period=online_update_period,
        target_sync_period=target_sync_period,
        log_interval=log_interval,
        use_soft_update=use_soft_update,
        target_update_tau=target_update_tau,
        observer=observer,
        decay_rate=decay_rate,
        num_saves=num_saves,
        saved_model_dir=saved_model_dir,
        warm_up=warm_up,  # pass the parameter instead of a hard-coded 10000
    )