def __init__(self, config):
    self.config = config

    # Create session
    self.session = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True)))

    # Create networks
    self.prior_network = PolicyNetwork(
        scope=config.prior_network,
        temperature=config.prior_temperature,
        use_symmetry=config.use_symmetry)
    self.rollout_network = PolicyNetwork(
        scope=config.rollout_network,
        temperature=config.rollout_temperature,
        reuse=config.prior_network == config.rollout_network,
        use_symmetry=config.use_symmetry)
    self.value_network = ValueNetwork(
        scope=config.value_network,
        use_symmetry=config.use_symmetry)

    # Load networks from checkpoints
    run_dir = util.run_directory(config)
    util.restore_network_or_fail(self.session, run_dir, self.prior_network)
    util.restore_network_or_fail(self.session, run_dir, self.rollout_network)
    util.restore_network_or_fail(self.session, run_dir, self.value_network)

    # Create queues
    self.prior_queue = AllQueue()
    self.rollout_queue = AllQueue(maxsize=16)
    self.value_queue = AllQueue(maxsize=16)

    self.new_game()
def selfplay_with_noise(sess, trained_model_dir, uniform_sample=False):
    print("-----------Load Trained Model------------")
    policy_estimator = PolicyNetwork(input_size=len(env.state),
                                     output_size=env.action_space.n)
    policy_estimator.restore(sess=sess,
                             checkpoint_file=os.path.join(
                                 trained_model_dir, 'episodes_99999_model.ckpt'))
    reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1)
    reward_estimator.restore(sess=sess,
                             checkpoint_file=os.path.join(
                                 trained_model_dir, 'episodes_99999_model.ckpt'))

    print("-----------Simulation with Different Noises------------")
    noise_test_dir = os.path.join(trained_model_dir, "noise-test")
    if not os.path.exists(noise_test_dir):
        os.makedirs(noise_test_dir)
    tag_str = "uniform.winrate.txt" if uniform_sample else "weighted.winrate.txt"
    filename = os.path.join(noise_test_dir, tag_str)

    fp = open(filename, "w")
    fp.write("noise\twinrate\n")
    noises = []
    accuracies = []
    for i in range(15):
        noise = i * 0.01
        win_rate = self_play(env, policy_estimator, reward_estimator,
                             uniform_sample=uniform_sample,
                             num_episodes=1000, noise=noise)
        # Keep statistics for the noise-accuracy curve
        noises.append(noise)
        accuracies.append(win_rate)
        fp.write(str(noise) + "\t" + str(win_rate) + "\n")
        fp.flush()
    fp.close()
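A minimal sketch of turning the noise/win-rate statistics collected above into a plot, assuming matplotlib is available; the helper name plot_noise_curve and the output filename are hypothetical.

import matplotlib.pyplot as plt

def plot_noise_curve(noises, win_rates, out_file="noise_winrate.png"):
    # Plot the win rate measured at each noise level and save it to disk.
    plt.figure()
    plt.plot(noises, win_rates, marker="o")
    plt.xlabel("noise")
    plt.ylabel("win rate")
    plt.savefig(out_file)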
def train():
    tf.reset_default_graph()

    # Create a global step variable
    global_step = tf.Variable(0, name="global_step", trainable=False)

    policy_estimator = PolicyNetwork(input_size=len(env.state),
                                     output_size=env.action_space.n,
                                     summaries_dir=experiment_dir)
    value_estimator = ValueNetwork(input_size=len(env.state), output_size=1)

    # Object-aware Reward Network
    reward_estimator = ObjectAwareRewardNetwork(input_size=len(env.state),
                                                output_size=1,
                                                action_num=env.action_space.n)
    # # Reward Network
    # reward_estimator = RewardNetwork(input_size=len(env.state), output_size=1,
    #                                  action_num=env.action_space.n)

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        reinforce(env, policy_estimator, value_estimator, reward_estimator,
                  max_num_episodes, sess, discount_factor=0.99,
                  uniform_sample=False, saver=saver, model_dir=model_dir,
                  figure_dir=figure_dir)
def __init__(self, config):
    self.config = config
    session = tf.Session(config=tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True)))

    self.random_player = RandomPlayer()

    self.exploratory_network = PolicyNetwork(config.exploratory_network)
    self.exploratory_player = PolicyPlayer(self.exploratory_network, session)

    self.playout_network = PolicyNetwork(
        config.playout_network,
        reuse=config.exploratory_network == config.playout_network)
    self.playout_player = PolicyPlayer(self.playout_network, session)

    self.run_dir = util.run_directory(config)
    util.restore_network_or_fail(session, self.run_dir, self.exploratory_network)
    util.restore_network_or_fail(session, self.run_dir, self.playout_network)
def restore_networks(self, session, run_dir):
    opponents_file = os.path.join(run_dir, 'opponents')
    if os.path.exists(opponents_file):
        with open(opponents_file) as f:
            for line in f.readlines():
                opponent_name, win_rate_string = line.strip().split()
                win_rate = float(win_rate_string)
                if opponent_name[:8] == 'network-':
                    print('Restoring %s' % opponent_name)
                    network = PolicyNetwork(opponent_name)
                    util.restore_network_or_fail(session, run_dir, network)
                    opponent = PolicyPlayer(network, session)
                    self.win_rates[opponent] = win_rate
                else:
                    for opponent in self.win_rates.keys():
                        if opponent_name == opponent.name:
                            self.win_rates[opponent] = win_rate
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4), GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000): """ Soft Q-learning training routine. Retuns rewards and durations logs. Plot environment screen """ num_actions = list_of_envs[0].action_space.n num_envs = len(list_of_envs) policy = PolicyNetwork(num_actions) models = [DQN(num_actions) for _ in range(0, num_envs)] ### Add torch.nn.ModuleList (?) memories = [ ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs) ] use_cuda = torch.cuda.is_available() if use_cuda: policy.cuda() for model in models: model.cuda() optimizers = [ optim.Adam(model.parameters(), lr=learning_rate) for model in models ] policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) episode_durations = [[] for _ in range(num_envs)] episode_rewards = [[] for _ in range(num_envs)] steps_done = np.zeros(num_envs) episodes_done = np.zeros(num_envs) current_time = np.zeros(num_envs) # Initialize environments for env in list_of_envs: env.reset() while np.min(episodes_done) < num_episodes: # TODO: add max_num_steps_per_episode # Optimization is given by alterating minimization scheme: # 1. do the step for each env # 2. do one optimization step for each env using "soft-q-learning". # 3. do one optimization step for the policy for i_env, env in enumerate(list_of_envs): # print("Cur episode:", i_episode, "steps done:", steps_done, # "exploration factor:", eps_end + (eps_start - eps_end) * \ # math.exp(-1. * steps_done / eps_decay)) # last_screen = env.current_grid_map current_screen = get_screen(env) state = current_screen # - last_screen # Select and perform an action action = select_action(state, policy, models[i_env], num_actions, eps_start, eps_end, eps_decay, episodes_done[i_env], alpha, beta) steps_done[i_env] += 1 current_time[i_env] += 1 _, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state last_screen = current_screen current_screen = get_screen(env) if not done: next_state = current_screen # - last_screen else: next_state = None # Store the transition in memory time = Tensor([current_time[i_env]]) memories[i_env].push(state, action, next_state, reward, time) # Perform one step of the optimization (on the target network) optimize_model(policy, models[i_env], optimizers[i_env], memories[i_env], batch_size, alpha, beta, gamma) if done: print( "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:", env.episode_total_reward, "\tit:", current_time[i_env], "\texp_factor:", eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay)) env.reset() episodes_done[i_env] += 1 episode_durations[i_env].append(current_time[i_env]) current_time[i_env] = 0 episode_rewards[i_env].append(env.episode_total_reward) if is_plot: plot_rewards(episode_rewards, i_env) optimize_policy(policy, policy_optimizer, memories, batch_size, num_envs, gamma) print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-distral-2col-rewards', episode_rewards) np.save(file_name + '-distral-2col-durations', episode_durations) return models, policy, episode_rewards, episode_durations
def main():
    # ENVIRONMENT
    env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 200
    batch_size = 32
    max_iterations = 200
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()
    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        # Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    for ite in tqdm(range(max_iterations), ascii=True):
        if ite % 5 == 0:
            torch.save({
                'policy_model': policy_model.state_dict(),
                'policy_optimizer': policy_optimizer.state_dict(),
                'value_model': value_model.state_dict(),
                'value_optimizer': value_optimizer.state_dict(),
                'running_reward': running_reward
            }, model_path)

        episode_ite, running_reward = collect(
            episode_ite, running_reward, env, max_episodes, max_timesteps,
            state_scale, reward_scale, writer, history, policy_model,
            value_model, gamma, gae_lambda, device)

        # Here we have collected N trajectories.
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size,
                                 shuffle=True, drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)
        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if running_reward > env.spec.reward_threshold:
            print("\nSolved!")
            break
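A minimal standalone sketch of the checkpoint format saved every five iterations above, assuming the same dictionary keys; the helper name resume is hypothetical.

import torch

def resume(model_path, policy_model, policy_optimizer, value_model, value_optimizer):
    # Restore both networks, both optimizers, and the running reward from one .tar file.
    checkpoint = torch.load(model_path)
    policy_model.load_state_dict(checkpoint['policy_model'])
    policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
    value_model.load_state_dict(checkpoint['value_model'])
    value_optimizer.load_state_dict(checkpoint['value_optimizer'])
    return checkpoint['running_reward']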
        self.board.push(chess.Move.from_uci(self.move))

    def render(self):
        print(self.board)
        print("Move: ", self.move)

    def print_game(self):
        game = chess.pgn.Game()
        node = game
        for move in self.board.move_stack:
            node = node.add_variation(move)
        print(game)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
policy_model = PolicyNetwork().to(device)

# env = ChessEnv()
# env.update_rival(policy_model, device)
# state = env.reset(True)
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# state, reward, done = env.step(env.legal_actions()[0])
# print(reward)
# env.render()
# env.print_game()

# env = ChessEnv()
# observation = env.reset(True)
# env.update_rival(policy_model, device)
there's a good chance it could slow things down, because we have to move
data back and forth between the CPU and the GPU. Regardless, I'm leaving
this in here. For those of you with GPUs, this means you will need to move
your tensors to the GPU.
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

numrun = 1
for run in range(numrun):
    env = make_env()
    in_size = env.observation_space.shape[0]
    num_actions = env.action_space.n
    network = network_factory(in_size, num_actions, env)
    network.to(device)
    pe = PolicyNetwork(network)

    # Load policy to test
    # pe.network.load_state_dict(torch.load('saved_network_50000_baseline.pkl'))

    ve = ValueNetwork(in_size)
    ep_returns = reinforce(env, pe, ve, episodes)  # , ve, loss_policy, loss_value

    # fwrite = open('runs_data/' + str(run) + '.pkl', 'wb')
    # fwrite = open('runs_data/0.pkl', 'wb')
    # pickle.dump(ep_returns, fwrite)
    # fwrite.close()

    window = 100
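A minimal sketch of the CPU-to-GPU transfer pattern the note above refers to, assuming PyTorch; the tensor and shape used here are purely illustrative.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
states = torch.zeros(32, 8)   # example batch created on the CPU
states = states.to(device)    # moved to the GPU when one is available
# Model parameters move the same way: network.to(device)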
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIRONMENT
    env_name = env_name
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    state_scale = state_scale
    reward_scale = reward_scale
    clip = clip
    n_epoch = train_epoch
    max_episodes = max_episodes
    max_timesteps = max_timesteps
    batch_size = batch_size
    max_iterations = max_iterations
    gamma = gamma
    gae_lambda = gae_lambda
    entropy_coefficient = entropy_coefficient

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()
    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        # Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create SavedEnvs queue
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):

        # Load the current model into the rival every update_rate iterations
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()
        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # for saved_env in env_list:
        #     if ite % 20 == 0:
        #         update_policy = True
        #     else:
        #         update_policy = False
        #     collect(q, env_name, saved_env,
        #             SavedEnv, max_timesteps, state_scale, reward_scale,
        #             policy_model, value_model, gamma,
        #             gae_lambda, device, update_policy)

        avg_episode_reward = []

        # Write all episodes from the queue to the history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation - start_simulation:.2f}")

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Average Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare the dataset
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size,
                                 shuffle=True, drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training - end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)
        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if running_reward > 0:
            print("\nSolved!")
            break
def main():
    # ENVIRONMENT
    # env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    # env_name = "Acrobot-v1"
    env_name = "MountainCar-v0"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 100
    batch_size = 32
    max_iterations = 1000
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01
    env_threshold = env.spec.reward_threshold

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()
    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")
    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
        str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
        "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)
    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()
    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        # Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    EnvQueue = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = gym.make(env_name)
        observation = env.reset()
        EnvQueue.put((env, observation, 0))

    for ite in tqdm(range(max_iterations), ascii=True):
        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        q = queue.SimpleQueue()
        env_list = []
        while not EnvQueue.empty():
            env_list.append(EnvQueue.get())

        threads = []
        for env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, env, EnvQueue, max_timesteps,
                                     state_scale, reward_scale, policy_model,
                                     value_model, gamma, gae_lambda, device
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        avg_episode_reward = []

        # Write all episodes from the queue to the history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Running Reward", running_reward, episode_ite)
                writer.add_scalar("Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare the dataset
        history.build_dataset()
        data_loader = DataLoader(history, batch_size=batch_size,
                                 shuffle=True, drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)
        if running_reward > env_threshold:
            print("\nSolved!")
            break
class SAC_Agent:
    def __init__(self, load_from=None, will_train=True):
        self.env = TorcsEnv(
            path='/usr/local/share/games/torcs/config/raceman/quickrace.xml')
        self.args = SAC_args()
        self.buffer = ReplayBuffer(self.args.buffer_size)

        action_dim = self.env.action_space.shape[0]
        state_dim = self.env.observation_space.shape[0]
        hidden_dim = 256

        self.action_size = action_dim
        self.state_size = state_dim

        self.value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
        self.target_value_net = ValueNetwork(state_dim, hidden_dim).to(self.args.device)
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)
        self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(self.args.device)

        self.target_value_net.load_state_dict(self.value_net.state_dict())

        self.value_criterion = nn.MSELoss()
        self.soft_q_loss1 = nn.MSELoss()
        self.soft_q_loss2 = nn.MSELoss()

        self.value_opt = optim.Adam(self.value_net.parameters(), lr=self.args.lr)
        self.soft_q_opt1 = optim.Adam(self.soft_q_net1.parameters(), lr=self.args.lr)
        self.soft_q_opt2 = optim.Adam(self.soft_q_net2.parameters(), lr=self.args.lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=self.args.lr)

        if will_train:
            current_time = time.strftime('%d-%b-%y-%H.%M.%S', time.localtime())
            self.plot_folder = f'plots/{current_time}'
            self.model_save_folder = f'model/{current_time}'
            make_sure_dir_exists(self.plot_folder)
            make_sure_dir_exists(self.model_save_folder)
            self.cp = Checkpoint(self.model_save_folder)

        if load_from is not None:
            try:
                self.load_checkpoint(load_from)
            except FileNotFoundError:
                print(f'{load_from} not found. Running default.')
        else:
            print('Starting from scratch.')

    def train(self):
        remove_log_file()
        clear_action_logs()

        eps_n = 0
        rewards = []
        test_rewards = []
        best_reward = -np.inf
        info = None

        for eps_n in range(1, self.args.max_eps + 1):
            # Train loop
            self.set_mode('train')
            relaunch = (eps_n - 1) % (20 / self.args.test_rate) == 0
            state = self.env.reset(relaunch=relaunch, render=False, sampletrack=False)
            eps_r = 0

            sigma = (self.args.start_sigma - self.args.end_sigma) * (max(
                0, 1 - (eps_n - 1) / self.args.max_eps)) + self.args.end_sigma
            randomprocess = OrnsteinUhlenbeckProcess(self.args.theta, sigma,
                                                     self.action_size)

            for step in range(self.args.max_eps_time):
                # Episode
                action = self.policy_net.get_train_action(state, randomprocess)
                next_state, reward, done, info = self.env.step(action)
                self.buffer.push(state, action, reward, next_state, done)
                state = next_state
                eps_r += reward

                if len(self.buffer) > self.args.batch_size:
                    self.update()

                if done:
                    break

            rewards.append(eps_r)
            test_reward = self.test(eps_n)
            test_rewards.append(test_reward)

            if test_reward > best_reward:
                best_reward = test_reward
                self.save_checkpoint(eps_n, best_reward)

            info_str = ', '.join([key for key in info.keys() if key != 'place'])
            info_str += f", {info['place']}. place"

            log(f'Episode {eps_n:<4} Reward: {eps_r:>7.2f} Test Reward: {test_reward:>7.2f} Info: {info_str}')

            if eps_n % self.args.plot_per == 0:
                self.plot(rewards, test_rewards, eps_n)

    def update(self):
        state, action, reward, next_state, done = self.buffer.sample(
            self.args.batch_size)

        state = FloatTensor(state).to(self.args.device)
        next_state = FloatTensor(next_state).to(self.args.device)
        action = FloatTensor(action).to(self.args.device)
        reward = FloatTensor(reward).unsqueeze(1).to(self.args.device)
        done = FloatTensor(np.float32(done)).unsqueeze(1).to(self.args.device)

        predicted_q_value1 = self.soft_q_net1(state, action)
        predicted_q_value2 = self.soft_q_net2(state, action)
        predicted_value = self.value_net(state)
        new_action, log_prob, epsilon, mean, log_std = self.policy_net.evaluate(state)

        # Training Q function
        target_value = self.target_value_net(next_state)
        target_q_value = reward + (1 - done) * self.args.gamma * target_value
        q_value_loss1 = self.soft_q_loss1(predicted_q_value1, target_q_value.detach())
        q_value_loss2 = self.soft_q_loss2(predicted_q_value2, target_q_value.detach())

        self.soft_q_opt1.zero_grad()
        q_value_loss1.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net1.parameters())
        self.soft_q_opt1.step()

        self.soft_q_opt2.zero_grad()
        q_value_loss2.backward()
        if self.args.clipgrad:
            self.clip_grad(self.soft_q_net2.parameters())
        self.soft_q_opt2.step()

        # Training Value function
        predicted_new_q_value = torch.min(self.soft_q_net1(state, new_action),
                                          self.soft_q_net2(state, new_action))
        target_value_func = predicted_new_q_value - self.args.alpha * log_prob.sum()
        value_loss = self.value_criterion(predicted_value, target_value_func.detach())

        self.value_opt.zero_grad()
        value_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.value_net.parameters())
        self.value_opt.step()

        # Training Policy function
        policy_loss = (log_prob - predicted_new_q_value).mean()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        if self.args.clipgrad:
            self.clip_grad(self.policy_net.parameters())
        self.policy_opt.step()

        # Updating target value network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.args.soft_tau) +
                                    param.data * self.args.soft_tau)

    def test(self, eps_n):
        self.set_mode('eval')
        rewards = []
        for step in range(self.args.test_rate):
            render = (eps_n % 30 == 0) and (step == 0)
            relaunch = render or ((eps_n % 30 == 0) and (step == 1))
            state = self.env.reset(relaunch=relaunch, render=render, sampletrack=False)
            running_reward = 0
            for t in range(self.args.max_eps_time):
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                store(action, eps_n, reward, info, t == 0)
                running_reward += reward
                if done:
                    break
            rewards.append(running_reward)
        avg_reward = sum(rewards) / self.args.test_rate
        return avg_reward

    def plot(self, rewards, test_rewards, eps_n):
        torch.save({
            'train_rewards': rewards,
            'test_rewards': test_rewards
        }, f'{self.plot_folder}/{eps_n}.pth')

        figure = plt.figure()
        plt.plot(rewards, label='Train Rewards')
        plt.plot(test_rewards, label='Test Rewards')
        plt.xlabel('Episode')
        plt.legend()
        plt.savefig(f'{self.plot_folder}/{eps_n}.png')

        try:
            send_mail(f'Improved Torcs SAC | Episode {eps_n}',
                      f'{self.plot_folder}/{eps_n}.png')
            log('Mail has been sent.')
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit')
            raise
        except Exception as e:
            print('Mail Exception occurred:', e)
            emsg = e.args[-1]
            emsg = emsg[:1].lower() + emsg[1:]
            log('Couldn\'t send mail because', emsg)

    def clip_grad(self, parameters):
        for param in parameters:
            param.grad.data.clamp_(-1, 1)

    def set_mode(self, mode):
        if mode == 'train':
            self.value_net.train()
            self.target_value_net.train()
            self.soft_q_net1.train()
            self.soft_q_net2.train()
            self.policy_net.train()
        elif mode == 'eval':
            self.value_net.eval()
            self.target_value_net.eval()
            self.soft_q_net1.eval()
            self.soft_q_net2.eval()
            self.policy_net.eval()
        else:
            raise ValueError('mode should be either train or eval')

    def save_checkpoint(self, eps_n, test_reward):
        self.cp.update(self.value_net, self.soft_q_net1, self.soft_q_net2,
                       self.policy_net)
        self.cp.save(f'e{eps_n}-r{test_reward:.4f}.pth')
        log(f'Saved checkpoint at episode {eps_n}.')

    def load_checkpoint(self, load_from):
        state_dicts = torch.load(load_from)
        self.value_net.load_state_dict(state_dicts['best_value'])
        self.soft_q_net1.load_state_dict(state_dicts['best_q1'])
        self.soft_q_net2.load_state_dict(state_dicts['best_q2'])
        self.policy_net.load_state_dict(state_dicts['best_policy'])
        print(f'Loaded from {load_from}.')

    def race(self, sampletrack=True):
        with torch.no_grad():
            state = self.env.reset(relaunch=True, render=True, sampletrack=sampletrack)
            running_reward = 0
            done = False
            while not done:
                action = self.policy_net.get_test_action(state)
                state, reward, done, info = self.env.step(action)
                running_reward += reward
            print('Reward:', running_reward)
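A minimal standalone sketch of the soft (Polyak) target update performed at the end of update() above, assuming PyTorch modules; the helper name soft_update is hypothetical.

import torch

def soft_update(target_net, source_net, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter.
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)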
class PolicyTraining(object):
    def __init__(self, config):
        self.config = config
        self.run_dir = util.run_directory(config)

        self.session = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True)))

        self.policy_network = PolicyNetwork('policy')
        self.policy_player = PolicyPlayer(self.policy_network, self.session)
        util.restore_or_initialize_network(self.session, self.run_dir,
                                           self.policy_network)

        # Train ops
        self.create_train_op(self.policy_network)
        self.writer = tf.summary.FileWriter(self.run_dir)
        util.restore_or_initialize_scope(self.session, self.run_dir,
                                         self.training_scope.name)

        self.opponents = Opponents(
            [RandomPlayer(), RandomThreatPlayer(), MaxThreatPlayer()])
        self.opponents.restore_networks(self.session, self.run_dir)

    def create_train_op(self, policy_network):
        with tf.variable_scope('policy_training') as self.training_scope:
            self.move = tf.placeholder(tf.int32, shape=[None], name='move')
            self.result = tf.placeholder(tf.float32, shape=[None], name='result')

            policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH])
            move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1)
            turn = util.turn_win(policy_network.turn)
            move_probability = tf.reduce_sum(policy * move, axis=[1, 2])

            result_loss = -tf.reduce_mean(
                tf.log(move_probability) * turn * self.result)
            entropy_regularisation = (
                -config.entropy * tf.reduce_mean(policy_network.entropy))
            loss = result_loss + entropy_regularisation

            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            self.global_step = tf.contrib.framework.get_or_create_global_step()
            self.train_op = optimizer.minimize(loss, self.global_step)

            # Summary
            tf.summary.scalar('loss', loss)
            for var in policy_network.variables + policy_network.policy_layers:
                tf.summary.histogram(var.name, var)
            self.summary = tf.summary.merge_all()

    def train(self):
        for _ in range(self.config.batches):
            opponent = self.opponents.choose_opponent()
            games = self.play_games(opponent)
            step, summary = self.train_games(opponent, games)
            self.process_results(opponent, games, step, summary)

            if self.opponents.all_beaten():
                name = self.opponents.next_network_name()
                print('All opponents beaten. Creating %s' % name)
                self.create_new_opponent(name)

            if step % 100 == 0:
                self.save()
        self.save()

    def save(self):
        util.save_network(self.session, self.run_dir, self.policy_network)
        util.save_scope(self.session, self.run_dir, self.training_scope.name)
        self.opponents.save_opponent_stats(self.run_dir)

    def play_games(self, opponent):
        # Create games
        games = incomplete_games = [Game() for _ in range(self.config.batch_size)]

        # Let opponent play first in half of the games
        self.play_move(games[0:len(games) // 2], opponent)

        player = self.policy_player
        while incomplete_games:
            self.play_move(incomplete_games, player)
            player = self.policy_player if player != self.policy_player else opponent
            incomplete_games = [
                game for game in incomplete_games if not game.position.gameover()
            ]
        return games

    def play_move(self, games, player):
        positions = [game.position for game in games]
        moves = player.play(positions)
        for game, move in zip(games, moves):
            game.move(move, player == self.policy_player)

    def train_games(self, opponent, games):
        turn, disks, empty, legal_moves, threats, moves, results = (
            [], [], [], [], [], [], [])
        for game in games:
            for position, move in game.policy_player_moves:
                turn.append(position.turn)
                disks.append(position.disks)
                empty.append(position.empty)
                legal_moves.append(position.legal_moves)
                threats.append(position.threats)
                moves.append(move)
                results.append(game.result)

        _, step, summary = self.session.run(
            [self.train_op, self.global_step, self.summary], {
                self.policy_network.turn: turn,
                self.policy_network.disks: disks,
                self.policy_network.empty: empty,
                self.policy_network.legal_moves: legal_moves,
                self.policy_network.threats: threats,
                self.move: moves,
                self.result: results
            })
        return step, summary

    def process_results(self, opponent, games, step, summary):
        win_rate = np.mean([game.policy_player_score for game in games])
        average_moves = sum([len(game.moves) for game in games]) / self.config.batch_size

        opponent_summary = tf.Summary()
        opponent_summary.value.add(
            tag=self.training_scope.name + '/' + opponent.name + '/win_rate',
            simple_value=win_rate)
        opponent_summary.value.add(
            tag=self.training_scope.name + '/' + opponent.name + '/moves',
            simple_value=average_moves)

        self.writer.add_summary(summary, step)
        self.writer.add_summary(opponent_summary, step)

        self.opponents.update_win_rate(opponent, win_rate)
        print('Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves' %
              (step, opponent.name, win_rate,
               self.opponents.win_rates[opponent], average_moves))

    def create_new_opponent(self, name):
        # Create clone of policy_player
        clone = PolicyNetwork(name)
        self.session.run(self.policy_network.assign(clone))
        util.save_network(self.session, self.run_dir, clone)
        new_opponent = PolicyPlayer(clone, self.session)

        self.opponents.decrease_win_rates()
        self.opponents.add_opponent(new_opponent)
def trainD(file_name="Distral_1col", list_of_envs=[GridworldEnv(4), GridworldEnv(5)], batch_size=128, gamma=0.999, alpha=0.9, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000): """ Soft Q-learning training routine. Retuns rewards and durations logs. Plot environment screen """ # action dimension num_actions = list_of_envs[0].action_space.n # total envs num_envs = len(list_of_envs) # pi_0 policy = PolicyNetwork(num_actions) # Q value, every environment has one, used to calculate A_i, models = [DQN(num_actions) for _ in range(0, num_envs)] ### Add torch.nn.ModuleList (?) # replay buffer for env ??? memories = [ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs)] use_cuda = torch.cuda.is_available() device = torch.device("cuda" if use_cuda else "cpu") # device = "cpu" print(device) # model policy = policy.to(device) for i in range(len(models)): models[i] = models[i].to(device) # optimizer for every Q model optimizers = [optim.Adam(model.parameters(), lr=learning_rate) for model in models] # optimizer for policy policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate) # optimizer = optim.RMSprop(model.parameters(), ) # info list for each environment episode_durations = [[] for _ in range(num_envs)] # list of local steps episode_rewards = [[] for _ in range(num_envs)] # list of list of episode reward episodes_done = np.zeros(num_envs) # episode num steps_done = np.zeros(num_envs) # global timesteps for each env current_time = np.zeros(num_envs) # local timesteps for each env # Initialize environments for env in list_of_envs: env.reset() while np.min(episodes_done) < num_episodes: policy.train() for model in models: model.train() # TODO: add max_num_steps_per_episode # Optimization is given by alterating minimization scheme: # 1. do the step for each env # 2. do one optimization step for each env using "soft-q-learning". # 3. do one optimization step for the policy # 1. do the step for each env for i_env, env in enumerate(list_of_envs): # print("Cur episode:", i_episode, "steps done:", steps_done, # "exploration factor:", eps_end + (eps_start - eps_end) * \ # math.exp(-1. * steps_done / eps_decay)) # last_screen = env.current_grid_map # ===========update step info begin======================== current_screen = get_screen(env) # state state = current_screen # - last_screen # action chosen by pi_1~pi_i action = select_action(state, policy, models[i_env], num_actions, eps_start, eps_end, eps_decay, episodes_done[i_env], alpha, beta, device) # global_steps steps_done[i_env] += 1 # local steps current_time[i_env] += 1 # reward _, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # next state last_screen = current_screen current_screen = get_screen(env) if not done: next_state = current_screen # - last_screen else: next_state = None # add to buffer time = Tensor([current_time[i_env]]) memories[i_env].push(state, action, next_state, reward, time) # 2. do one optimization step for each env using "soft-q-learning". 
# Perform one step of the optimization (on the target network) optimize_model(policy, models[i_env], optimizers[i_env], memories[i_env], batch_size, alpha, beta, gamma, device) # ===========update step info end ======================== # ===========update episode info begin ==================== if done: print("ENV:", i_env, "iter:", episodes_done[i_env], "\treward:", env.episode_total_reward, "\tit:", current_time[i_env], "\texp_factor:", eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay)) # reset env env.reset() # episode steps episodes_done[i_env] += 1 # append each episode local timesteps list for every env episode_durations[i_env].append(current_time[i_env]) # reset local timesteps current_time[i_env] = 0 # append total episode_reward to list episode_rewards[i_env].append(env.episode_total_reward) if is_plot: plot_rewards(episode_rewards, i_env) # ===========update episode info end ==================== # 3. do one optimization step for the policy # after all envs has performed one step, optimize policy optimize_policy(policy, policy_optimizer, memories, batch_size, num_envs, gamma, device) print('Complete') env.render(close=True) env.close() if is_plot: plt.ioff() plt.show() ## Store Results np.save(file_name + '-distral-2col-rewards', episode_rewards) np.save(file_name + '-distral-2col-durations', episode_durations) return models, policy, episode_rewards, episode_durations
def trainD(file_name="Distral_2col_SQL", list_of_envs=[GridworldEnv(5), GridworldEnv(4), GridworldEnv(6)], batch_size=128, gamma=0.999, alpha=0.8, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000): """ Soft Q-learning training routine. Returns rewards and durations logs. """ num_actions = list_of_envs[0].action_space.n input_size = list_of_envs[0].observation_space.shape[0] num_envs = len(list_of_envs) policy = PolicyNetwork(input_size, num_actions) models = [DQN(input_size, num_actions) for _ in range(0, num_envs)] memories = [ ReplayMemory(memory_replay_size, memory_policy_size) for _ in range(0, num_envs) ] optimizers = [ optim.Adam(model.parameters(), lr=learning_rate) for model in models ] policy_optimizer = optim.Adam(policy.parameters(), lr=learning_rate) episode_durations = [[] for _ in range(num_envs)] episode_rewards = [[] for _ in range(num_envs)] steps_done = np.zeros(num_envs) episodes_done = np.zeros(num_envs) current_time = np.zeros(num_envs) # Initialize environments states = [] for env in list_of_envs: states.append( torch.from_numpy(env.reset()).type(torch.FloatTensor).view( -1, input_size)) while np.min(episodes_done) < num_episodes: # TODO: add max_num_steps_per_episode # Optimization is given by alternating minimization scheme: # 1. do the step for each env # 2. do one optimization step for each env using "soft-q-learning". # 3. do one optimization step for the policy for i_env, env in enumerate(list_of_envs): # select an action action = select_action(states[i_env], policy, models[i_env], num_actions, eps_start, eps_end, eps_decay, episodes_done[i_env], alpha, beta) steps_done[i_env] += 1 current_time[i_env] += 1 next_state_tmp, reward, done, _ = env.step(action[0, 0]) reward = Tensor([reward]) # Observe new state next_state = torch.from_numpy(next_state_tmp).type( torch.FloatTensor).view(-1, input_size) if done: next_state = None # Store the transition in memory time = Tensor([current_time[i_env]]) memories[i_env].push(states[i_env], action, next_state, reward, time) # Perform one step of the optimization (on the target network) optimize_model(policy, models[i_env], optimizers[i_env], memories[i_env], batch_size, alpha, beta, gamma) # Update state states[i_env] = next_state # Check if agent reached target if done or current_time[i_env] >= max_num_steps_per_episode: if episodes_done[i_env] <= num_episodes: print( "ENV:", i_env, "iter:", episodes_done[i_env], "\treward:{0:.2f}".format(env.episode_total_reward), "\tit:", current_time[i_env], "\texp_factor:", eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i_env] / eps_decay)) episode_rewards[i_env].append(env.episode_total_reward) episodes_done[i_env] += 1 episode_durations[i_env].append(current_time[i_env]) current_time[i_env] = 0 states[i_env] = torch.from_numpy(env.reset()).type( torch.FloatTensor).view(-1, input_size) if is_plot: plot_rewards(episode_rewards, i_env) # Perform one step of the optimization on the Distilled policy optimize_policy(policy, policy_optimizer, memories, batch_size, num_envs, gamma, alpha, beta) print('Complete') env.render(close=True) env.close() ## Store Results np.save(file_name + '-rewards', episode_rewards) np.save(file_name + '-durations', episode_durations) return models, policy, episode_rewards, episode_durations
            acc, len(test_loader.dataset), test_acc,
        ))
    return test_loss, test_acc


tr_losses = []
te_losses = []
te_accs = []

batch_size = 128
num_epochs = 25
log_interval = 100

model = PolicyNetwork()
train_dataset = TrainDataset()
test_dataset = TestDataset()
train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size * 2,
                         shuffle=True, drop_last=True)

model.to('cuda')
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = F.cross_entropy
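A minimal sketch of a training loop for the setup above, assuming the supervised pattern implied by tr_losses/te_losses/te_accs and log_interval, and assuming the datasets yield (input, label) pairs; the helper name train_epoch is hypothetical.

def train_epoch(model, loader, optimizer, criterion, epoch, log_interval):
    # One pass over the training data; returns the average batch loss.
    model.train()
    running_loss = 0.0
    for batch_idx, (x, y) in enumerate(loader):
        x, y = x.to('cuda'), y.to('cuda')
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if batch_idx % log_interval == 0:
            print(f"epoch {epoch} batch {batch_idx} loss {loss.item():.4f}")
    return running_loss / len(loader)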