def main():
    learning_rate = 0.001
    discount = 0.995
    beta = 0.4
    eps = 0.05
    K_epoch = 3
    num_steps = 128

    envs = [make_env() for _ in range(num_envs)]
    envs = SubprocVecEnv(envs)

    model = CNNTradingAgent(num_features=envs.reset().shape[-1],
                            n_actions=2 * n_action_intervals + 1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print_interval = 10
    scores_list = []
    loss_list = []

    for n_epi in range(1, 10001):  # run 10,000 episodes
        loss = 0.0
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)
        # raise Exception("True" if torch.any(torch.isnan(torch.stack(states))) else "False")

        # decay beta toward 0.01
        if beta > 0.01:
            beta *= discount

        for _ in range(K_epoch):
            L = -clipped_surrogate(envs, model, log_probs, states, actions,
                                   rewards, discount, eps, beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            loss += L.item()
            del L

        score = np.asarray(rewards).sum(axis=0).mean()
        scores_list.append(score)
        loss_list.append(loss)

        if n_epi % print_interval == 0:
            # average over the most recent print_interval episodes
            print("# of episode :{}, avg score : {:.4f}, loss : {:.6f}".format(
                n_epi, np.mean(scores_list[-print_interval:]),
                np.mean(loss_list[-print_interval:])))
            print("actions : ", torch.cat(actions))

        if n_epi % save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_location, f'TradingGym_{n_epi}.pth'))
            torch.save(scores_list, os.path.join(save_location, f"{n_epi}_scores.pth"))
            # plt.plot(scores_list)
            # plt.title("Reward")
            # plt.grid(True)
            # plt.savefig(os.path.join(save_location, f'{n_epi}_ppo.png'))
            # plt.close()

    del envs
def run(num_envs=16, hidden_dim=256, batch_size=1024, iterations=1000,
        log_interval=10, runs=1):
    envs = [tl.make_nh_waypoint_3d() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    t_env = tenv.WaypointEnv3D()
    state_dim = t_env.observation_space.shape[0]
    action_dim = t_env.action_space.shape[0]
    path = os.getcwd() + "/nh_waypoint_3d/"
    for i in range(runs):
        agent = ag.Agent(state_dim, hidden_dim, action_dim, dim=3)
        opt = torch.optim.Adam(agent.parameters(), lr=1e-4)
        ep, rew, agent = tl.train_mp(envs, t_env, agent, opt, batch_size,
                                     iterations, log_interval, render=False,
                                     fname=path + "gaussian_" + str(2))
        if i == 0:
            csv_input = pd.DataFrame()
            csv_input["timesteps"] = ep
        csv_input["run" + str(i)] = rew
        csv_input.to_csv(path + "data.csv", index=False)
def make_envs(num_envs=16, env_name="CartPole-v0"):
    '''Create multiple subprocess environments.'''
    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    return envs
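# Usage sketch (an assumption, not from the source): SubprocVecEnv exposes
# batched reset()/step() over all subprocess environments.
if __name__ == '__main__':
    envs = make_envs(num_envs=8, env_name="CartPole-v0")
    states = envs.reset()  # batched observations, shape (8, obs_dim)
    actions = [envs.action_space.sample() for _ in range(8)]
    next_states, rewards, dones, infos = envs.step(actions)
    envs.close()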
def run(num_envs=16, hidden_dim=256, batch_size=1024, iterations=1000,
        log_interval=10, runs=3):
    envs = [tl.make_term_3d() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    t_env = tenv.TrajectoryEnvTerm()
    t_env.num_fut_wp = int(cfg.waypoints - 1)
    state_size = 5 + 15 * (t_env.num_fut_wp + 1)
    t_env.observation_space = gym.spaces.Box(-1, 1, shape=(state_size,))
    state_dim = t_env.observation_space.shape[0]
    action_dim = t_env.action_space.shape[0]
    path = os.getcwd() + "/_3d/term_3d/"
    for i in range(runs):
        agent = ag.Agent(state_dim, hidden_dim, action_dim, dim=3,
                         lookahead=lookahead)
        opt = torch.optim.Adam(agent.parameters(), lr=cfg.lr)
        ep, rew, term_rew, agent = tl.train_term_mp(envs, t_env, agent, opt,
                                                    batch_size, iterations,
                                                    log_interval, render=False,
                                                    fname=path + wps + "-wps")
        if i == 0:
            csv_input = pd.DataFrame()
            csv_input["iterations"] = ep
            term_csv_input = pd.DataFrame()
            term_csv_input["iterations"] = ep
        csv_input["run" + str(i)] = rew
        term_csv_input["run" + str(i)] = term_rew
        csv_input.to_csv(path + "data_wp-" + wps + ".csv", index=False)
        term_csv_input.to_csv(path + "term_data_wp-" + wps + ".csv", index=False)
    board[2] = observation['bomb_blast_strength']
    return board


def makeTrainingObservation():
    env = Pomme(**config["env_kwargs"])
    agents = {}
    for agent_id in range(num_players):
        agent = TrainingAgent(config["agent"](agent_id, config["game_type"]))
        agents[agent_id] = agent
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)
    return env


if __name__ == '__main__':
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = (3, 11, 11)
    num_actions = envs.action_space.n

    # a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e6)

    # rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size, hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        # collect an n-step rollout from the vectorized environments
        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        # bootstrap the return from the value of the last state
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        print(f'\rframe: {frame_idx}\t loss: {loss}', end='')
        if frame_idx % 100 == 0:
            rewards, scores = map(
                list, zip(*(test_env(model, False) for _ in range(10))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t '
                  f'avg_scores: {avg_scores:.2f}\t loss: {loss}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # render ten evaluation episodes (a bare generator expression here would never run)
    for _ in range(10):
        test_env(model, True)
    envs.close()
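# compute_returns is called above but not defined in this snippet. A minimal
# sketch of the usual n-step discounted-return helper (an assumption, not
# necessarily the author's version): bootstrap from next_value and walk the
# rollout backwards, zeroing the return across episode boundaries via masks.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns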
import matplotlib.pyplot as plt
import matplotlib.animation as animation

from a2c import ActorCritic
from policy import *


def env_fn():
    env = gym.make('cube-x3-v0')
    env.unwrapped._refreshScrambleParameters(1, 2, scramble_easy=True)
    return env


actions = env_fn().unwrapped.action_list

envs = SubprocVecEnv([env_fn])
obs = envs.reset()
envs.render(0)

action_list = []

fig = plt.figure()
ims = []
im = plt.imshow(cube_gym.onehotToRGB(obs[0]))
ims.append([im])

with tf.Session() as sess:
    actor_critic = ActorCritic(sess, CnnPolicy, envs.observation_space.shape,
import sys
sys.path.append('./common')

from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


# Neural Network
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)


class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()

        self.critic = nn.Sequential(
def train(policy, save_name, load_count=0, summarize=True, load_path=None,
          log_path='./logs'):
    # Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"

    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                                        ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)
                # print(obs[0:3, :, :, 0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            # batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)  # np.bool is removed in newer NumPy
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    update, summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss,
                                                 value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
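# discount_with_dones is used above but imported from elsewhere; the same name
# appears in the OpenAI baselines a2c utilities. A minimal sketch under that
# assumption: accumulate the discounted return right-to-left and reset it at
# episode boundaries.
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]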
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"


def make_env():
    def make():
        env = gym.make(env_name)
        return env
    return make


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)

STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.shape[0]
ACTION_MAX = env.action_space.high[0]

SAMPLE_NUMS = 100
TARGET_UPDATE_STEP = 10
CLIP_PARAM = 0.3

FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor
Tensor = FloatTensor
logger = Logger()


def make_cuda(input):
    if USE_CUDA:
        return input.cuda()
    return input


def make_env():
    def _thunk():
        env = Key_Collect()
        return env
    return _thunk


if __name__ == '__main__':  # important for windows systems if subprocesses are run
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    # a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 10
    num_frames = int(1e6)

    # Init a2c and rmsprop
    actor_critic = ActorCritic(state_shape, num_actions)
class Ppo:
    def __init__(self, numOfEnvs):
        self.testRewards = []
        # self.num_envs = 16
        # self.num_envs = numOfEnvs
        self.num_envs = 6
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)

        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        self.num_inputs = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        # Hyper params:
        self.hidden_size = 256
        self.lr = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs,
                                 self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def make_env(self):
        def _thunk():
            env = gym.make(self.env_name)
            return env
        return _thunk

    # def compute_gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    def compute_gae(self, next_value, rewards, masks, values, g, t):
        gamma = float(g)
        tau = float(t)
        values = values + [next_value]
        gae = 0
        returns = []
        # Generalized Advantage Estimation: walk backwards, accumulating
        # exponentially-weighted TD residuals.
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], actions[rand_ids, :],
                   log_probs[rand_ids, :], returns[rand_ids, :],
                   advantage[rand_ids, :])

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions,
                   log_probs, returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                # clipped surrogate objective
                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return loss

    def plot(self, frame_idx, rewards):
        clear_output(True)
        plt.figure(figsize=(20, 5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
        plt.plot(rewards)
        plt.show()
        # plt.savefig("{0}/{1}_rewardGraph.png".format(saveGraphPath, frame_idx))

    def test_env(self, vis=False):
        state = self.env.reset()
        if vis:
            self.env.render()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = self.model(state)
            next_state, reward, done, _ = self.env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis:
                self.env.render()
            total_reward += reward
        return total_reward

    def main(self, inputVals):
        gam = inputVals[0]
        lam = inputVals[1]
        print("Gam: ", gam)
        print("Lam: ", lam)

        num_inputs = self.envs.observation_space.shape[0]
        num_outputs = self.envs.action_space.shape[0]

        # Hyper params:
        # hidden_size = 256
        # lr = 3e-3
        num_steps = 20
        mini_batch_size = 5
        ppo_epochs = 4
        threshold_reward = -200

        # model = a.ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
        # optimizer = optim.Adam(self.model.parameters(), lr=lr)

        max_frames = 12000
        # max_frames = 2000
        frame_idx = 0
        self.test_rewards = []

        state = self.envs.reset()
        early_stop = False

        while frame_idx < max_frames and not early_stop:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = 0

            for _ in range(num_steps):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                if frame_idx % 1000 == 0:
                    test_reward = np.mean([self.test_env() for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    self.plot(frame_idx, self.test_rewards)
                    if test_reward > threshold_reward:
                        early_stop = True
                    print("rewards: ", test_reward)

            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values, gam, lam)

            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            lastLoss = self.ppo_update(ppo_epochs, mini_batch_size, states,
                                       actions, log_probs, returns, advantage)
            # print("loss: ", [lastLoss])

        # re = rewards[-1].cpu()
        # print("RE: ", np.asarray(re))
        # return (np.asarray(re))
        return lastLoss.item()
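# Hedged usage sketch (not in the source): main() reads [gamma, lambda]
# positionally and returns the final PPO loss.
if __name__ == '__main__':
    ppo = Ppo(numOfEnvs=6)
    final_loss = ppo.main([0.99, 0.95])  # gam, lam as consumed by compute_gae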
from hyperparameters import *
from common.multiprocessing_env import SubprocVecEnv
from model import Net, Brain
from envs import make_env
from tqdm import tqdm
import numpy as np

seed_num = 1
torch.manual_seed(seed_num)
if use_cuda:
    torch.cuda.manual_seed(seed_num)

# Build the execution environment
torch.set_num_threads(seed_num)
envs = [make_env(ENV_NAME, seed_num, i) for i in range(NUM_PROCESSES)]
envs = SubprocVecEnv(envs)  # multiprocess execution environment

n_out = envs.action_space.n  # there are 4 possible actions
actor_critic = Net(n_out).to(device)  # use GPU
global_brain = Brain(actor_critic)

# Create variables for storing information
obs_shape = envs.observation_space.shape  # (1, 84, 84)
obs_shape = (obs_shape[0] * NUM_STACK_FRAME, *obs_shape[1:])  # (4, 84, 84)

# torch.Size([16, 4, 84, 84])
current_obs = torch.zeros(NUM_PROCESSES, *obs_shape).to(device)
rollouts = RolloutStorage(NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)  # rollouts object

episode_rewards = torch.zeros([NUM_PROCESSES, 1])  # stores rewards received in the current episode
final_rewards = torch.zeros([NUM_PROCESSES, 1])  # stores the total reward of the last episode
        returns.insert(0, gae + values[step])
    return returns


if __name__ == '__main__':
    num_envs = 8

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)  # pass the list of thunks, not the env name

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.shape[0]

    hidden_size = 256
    lr = 3e-2
    num_steps = 20

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr)

    max_frames = 100000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()
env_name = 'Pendulum-v0'
gamma = 0.9
num_envs = 12  # errors can occur if num_envs is too large
max_frame = 50000
actor_lr = 0.0003
critic_lr = 0.001
max_grad_norm = 0.7
n_steps = 50
max_episode_steps = 500
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])
    envs.set_max_episode_steps(max_episode_steps)

    actor = Actor().to(device)
    critic = Critic().to(device)
    a_solver = optim.Adam(actor.parameters(), lr=actor_lr)
    c_solver = optim.Adam(critic.parameters(), lr=critic_lr)

    frame_count = 0
    rewards = [[0.] for _ in range(num_envs)]
    global_rewards = []
    obs_gotten = None

    while frame_count < max_frame:
        cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
n_updates = 4
frame_idx = 0
scores_list = []


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


class ActorCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=2, out_channels=4, kernel_size=6,
                               stride=2, bias=False)
        nn.init.orthogonal_(self.conv1.weight, np.sqrt(2))

        # The second convolution layer takes a 20x20 frame and produces a 9x9 frame
        self.conv2 = nn.Conv2d(
def train(env_fn=None, spectrum=False, vae_arch=None, a2c_arch=None,
          nenvs=16, nsteps=100, max_iters=1e6, kl_coeff=0.5, lr=7e-4,
          log_interval=100, summarize=True, vae_load_path=None,
          a2c_load_path=None, log_path=None, cpu_cores=1):
    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        actor_critic = RandomActorCritic(sess, a2c_arch, ob_space, ac_space,
                                         nenvs, nsteps)

        if a2c_load_path is not None:
            actor_critic.load(a2c_load_path)
            print('Loaded a2c')
        else:
            actor_critic.epsilon = -1
            print('WARNING: No Actor Critic Model loaded. Using Random Agent')

        vae = VariationalAutoEncoder(sess, vae_arch, ob_space, ac_space, lr,
                                     kl_coeff, summarize)

        load_count = 0
        if vae_load_path is not None:
            vae.load(vae_load_path)

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        print('VAE Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))

        for i in tqdm(range(load_count + 1, int(max_iters) + 1), ascii=True,
                      desc='VarAutoEncoder'):
            mb_s, mb_a, mb_r, mb_ns, mb_d = [], [], [], [], []
            for s, a, r, ns, d in model_play_games(actor_critic, envs, nsteps):
                mb_s.append(s)
                mb_a.append(a)
                mb_r.append(r)
                mb_ns.append(ns)
                mb_d.append(d)

            mb_s = np.concatenate(mb_s)
            mb_a = np.concatenate(mb_a)
            mb_r = np.concatenate(mb_r)
            mb_ns = np.concatenate(mb_ns)
            mb_d = np.concatenate(mb_d)

            if summarize:
                loss, recon_loss, kl_loss, _, smy = vae.train(
                    mb_s, mb_a, mb_ns, mb_r, summary_op)
                writer.add_summary(smy, i)
            else:
                loss, recon_loss, kl_loss, _ = vae.train(
                    mb_s, mb_a, mb_ns, mb_r)

            if i % log_interval == 0:
                vae.save(log_path, i)

        vae.save(log_path, 'final')
        print('Variational AutoEncoder is finished training')
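# model_play_games is not defined in this snippet. A hedged sketch of a rollout
# generator with the interface used above (the act/step calls mirror the a2c
# train loop elsewhere in this file; the original may differ): yield one
# (state, action, reward, next_state, done) batch per environment step.
def model_play_games(actor_critic, envs, nsteps):
    obs = envs.reset()
    for _ in range(nsteps):
        actions, values, _ = actor_critic.act(obs)
        next_obs, rewards, dones, _ = envs.step(actions)
        yield np.copy(obs), actions, rewards, next_obs, dones
        obs = next_obs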
np.random.seed(2019)

from common.multiprocessing_env import SubprocVecEnv

num_envs = 1  # original is 16
env_name = "Pendulum-v0"


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env
    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


class ActorCritic:
    def __init__(self, sess, obs, acs, hidden_size, name, trainable, init_std=1.0):
        self.sess = sess
        self.obs = obs
        self.acs = acs
def train(env_fn=None, spectrum=False, a2c_arch=None, nenvs=16, nsteps=100,
          max_iters=1e6, gamma=0.99, pg_coeff=1.0, vf_coeff=0.5,
          ent_coeff=0.01, max_grad_norm=0.5, lr=7e-4, alpha=0.99,
          epsilon=1e-5, log_interval=100, summarize=True, load_path=None,
          log_path=None, cpu_cores=1):
    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:
        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]
        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))

        for i in tqdm(range(load_count + 1, int(max_iters) + 1), ascii=True,
                      desc='ActorCritic'):
            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = \
                [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):
                # Get the actions and values from the actor critic, we don't need neglogp
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array([info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discounting
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0], gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
            # plot(frame_idx, test_rewards)

        next_state = torch.FloatTensor(next_state).to(cfg.device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return test_rewards, test_ma_rewards


if __name__ == "__main__":
    cfg = A2CConfig()
    envs = [make_envs(cfg.env) for i in range(cfg.n_envs)]
    envs = SubprocVecEnv(envs)  # 8 envs
    rewards, ma_rewards = train(cfg, envs)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards, ma_rewards, tag="train", env=cfg.env,
                 algo=cfg.algo, path=cfg.result_path)
# Create Environments
num_envs = 4
env_name = 'CartPole-v0'


def make_env():
    def _thunk():
        env = gym.make(env_name)
        env.seed(seed)
        return env
    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)
env.seed(seed)


# Neural Network
class NonSpikingLIFNode(neuron.LIFNode):
    def forward(self, dv: torch.Tensor):
        self.neuronal_charge(dv)
        # self.neuronal_fire()
        # self.neuronal_reset()
        return self.v
        target_param.data.copy_(target_param.data * (1.0 - soft_tau)
                                + param.data * soft_tau)


def make_env(env_id):
    def _thunk():
        '''Needed to run SubprocVecEnv, the multiprocess environment.'''
        env = gym.make(env_id)
        env = NormalizedActions(env)
        return env
    return _thunk


envs = [make_env("Pendulum-v0") for i in range(NUM_PROCESS)]
envs = SubprocVecEnv(envs)  # multiprocess execution environment

ou_noise = OUNoise(envs.action_space)

state_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.shape[0]
hidden_dim = 256

value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)
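# The first line of the snippet above is the tail of a Polyak soft-update loop.
# A minimal sketch of the full helper in the standard DDPG form (an assumption;
# the function name and soft_tau default are illustrative):
def soft_update(target_net, net, soft_tau=1e-2):
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - soft_tau)
                                + param.data * soft_tau)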
num_envs = 4


def make_env():
    def _thunk():
        env = SokobanEnv()
        return env
    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

state_shape = envs.observation_space.shape

# a2c hyperparams:
gamma = 0.99
entropy_coef = 0.01
value_loss_coef = 0.5
max_grad_norm = 0.5
num_steps = 120
num_batch = int(10e5)

# rmsprop hyperparams:
lr = 7e-4
eps = 1e-5
alpha = 0.99
        break

env_t.close()

env_name = 'CartPole-v1'
gamma = 0.99
num_envs = 8
PENALTY = -1.0
n_step = 4
max_frame = 50000
lr = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])

    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    actor = Actor(4, 128, 2).to(device)
    critic = Critic(4, 128).to(device)
    solver = optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr)

    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None

    while frame_count * n_step < max_frame:
        obs_l, acts_l, rews_l, dones_l, probs_l = [], [], [], [], []
        accept_sample = [True for _ in range(num_envs)]
        for _ in range(n_step):
gamma = 0.99
batch_size = 64
lr = 0.001
initial_exploration = 1000
update_target = 200
replay_memory_capacity = 30000
max_frame = 100000
PENALTY = -1.0
num_envs = 8

if __name__ == '__main__':
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])

    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    agent = Model(net, 2).to(device)
    solver = optim.Adam(agent.parameters())
    memory = Memory(replay_memory_capacity)

    eps = 1.0
    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None

    while frame_count < max_frame:
        s = envs.reset() if s_gotten is None else s_gotten
        preprocessed_s = torch.FloatTensor(s)
        a = agent.response(preprocessed_s, eps)
        s_gotten, r, done, _ = envs.step(a)
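# Memory is defined elsewhere. A minimal deque-based replay buffer sketch that
# matches the construction above (the push/sample interface is an assumption):
import random
from collections import deque


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        # transition: e.g. a (state, action, reward, next_state, done) tuple
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)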