def main():
    sess = tf.Session(config=cf.tf_config)
    agent = A2C(cf, sess)
    sess.run(tf.global_variables_initializer())
    if bool(args.e):
        agent.evaluate(load_model=True)
    else:
        agent.learn()
    sess.close()
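# Hedged sketch (an assumption, not this repo's actual parser): one way `args`
# could be defined so that `--e` cleanly toggles evaluation mode. Parsing `--e`
# as a string and calling bool() on it treats "0" as True, so a store_true
# flag is the more robust choice.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--e", action="store_true",
                    help="evaluate a saved model instead of training")
args = parser.parse_args()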
def main():
    get_env_version()
    cfg = A2CConfig(env="CartPole-v0", train_frames=400)
    get_env_information(cfg.env)
    env = gym.make(cfg.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = A2C(state_dim, action_dim, cfg)
    envs = get_envs(env_name=cfg.env)
    rewards, smooth_rewards = train(cfg, envs, agent)
    os.makedirs(cfg.result_path, exist_ok=True)
    # Note: each step/frame actually covers num_envs environment interactions.
    plot_rewards(rewards, smooth_rewards, env=cfg.env, algo=cfg.algo, save=True,
                 path=cfg.result_path, xlabel_name="Each 200 steps")
    envs.close()
    env.close()
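# Hedged sketch (an assumption, not the repo's actual class): a minimal
# A2CConfig holding only the fields main() reads above — env, train_frames,
# algo and result_path. Any field beyond those four is illustrative.
from dataclasses import dataclass, field
import datetime
import os


@dataclass
class A2CConfig:
    env: str = "CartPole-v0"
    algo: str = "A2C"
    train_frames: int = 400
    result_path: str = field(default_factory=lambda: os.path.join(
        "results", "A2C", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")))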
"action_dim": action_dim, "max_action": max_action, "discount": args.discount, "tau": args.tau, } # Initialize policy # if args.policy == "TD3": # # Target policy smoothing is scaled wrt the action scale # kwargs["policy_noise"] = args.policy_noise * max_action # kwargs["noise_clip"] = args.noise_clip * max_action # kwargs["policy_freq"] = args.policy_freq # policy = TD3.TD3(**kwargs) if args.policy == "A2C": envs = ParaEnv(args.env, args.n_processes, args.seed) policy = A2C.A2C(env.observation_space, env.action_space, args.discount, args.tau, max_episode_timesteps) x, y = policy.run(envs, file_name, args) write_result(args.env + "_A2C.json", x, y) elif args.policy == "DDPG": policy = DDPG.DDPG(**kwargs) x, y = policy.run(env, file_name, args) write_result(args.env + "_DDPG.json", x, y) elif args.policy == "REINFORCE": args.n_steps = 5 args.n_processes = 16 envs = ParaEnv(args.env, args.n_processes, args.seed) policy = REINFORCE.REINFORCE(env.observation_space, env.action_space, args.discount, args.tau, args.n_steps, args.n_processes, max_episode_timesteps)
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) # Maxime: commented this out because it very much changes the behavior # of the code for seemingly arbitrary reasons #if len(envs.observation_space.shape) == 1: # envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) elif args.recurrent_policy: actor_critic = RecMLPPolicy(obs_numel, envs.action_space) else: actor_critic = MLPPolicy(obs_numel, envs.action_space) # Maxime: log some info about the model and its size # call function PPO.modelsize() for this to happen ''' modelSize = 0 for p in actor_critic.parameters(): pSize = reduce(operator.mul, p.size(), 1) modelSize += pSize ''' if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) if args.algo == 'a2c': Agent = A2C(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.alpha, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) elif args.algo == 'ppo': Agent = PPO(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) elif args.algo == 'acktr': Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps, args.num_processes, obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy, args.num_mini_batch, args.cuda, args.log_interval, args.vis, args.env_name, args.log_dir, args.entropy_coef, args.num_stack, args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm, args.alpha, args.save_dir, args.vis_interval, args.save_interval, num_updates, action_shape, args.value_loss_coef) print(str(actor_critic)) print('Total model size: %d' % Agent.modelsize()) obs = envs.reset() Agent.update_current_obs(obs, envs) Agent.rollouts.observations[0].copy_(Agent.current_obs) # These variables are used to compute average rewards for all processes. Agent.train(envs)
    'gamma': 0.95,
    'learning_rate': 1e-3,
    'gae_lambda': 0.95,         # lambda for Generalized Advantage Estimation
    'rep_learning_rate': 1e-5,  # learning rate for learning next state representation
    'seed': seed
}

'''Parameters of Model'''
model_parameters = {'num_units': 64, 'seed': seed}
loss_coefficients = {'value': 0.5, 'entropy': 1e-2, 'representation': 0.0}

'''Write Parameters to log_file'''
if verbose:
    with open(log_file_name, "a") as f:
        f.write('Environment: {}, Frames: {} \n'.format(game, num_frames))
        f.write('Algorithm Parameters: {} \n'.format(algorithm_parameters))
        f.write('Model Parameters: {} \n'.format(model_parameters))
        f.write('Loss Coefficients: {} \n'.format(loss_coefficients))
        f.flush()

'''Initialize Environment & Model'''
env = Environment(game, seed)
num_actions = env.number_of_actions
state_space = env.state_space
model = Model(num_actions, state_space, model_parameters)
agent = A2C(model, num_actions, algorithm_parameters, loss_coefficients)

'''Train the Agent'''
reward_history, loss_history = agent.train(env, num_frames, logs, verbose)

'''Save Rewards and Losses'''
if verbose:
    np.save(reward_file_name, reward_history)
    np.save(loss_file_name, loss_history)
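# Hedged sketch of Generalized Advantage Estimation, which the gae_lambda entry
# above suggests this agent uses. Pure NumPy with hypothetical argument names;
# the actual A2C class in this script may compute advantages differently.
import numpy as np


def gae_advantages(rewards, values, next_value, dones, gamma=0.95, gae_lambda=0.95):
    """A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1},
    where delta_t = r_t + gamma * (1 - done_t) * V_{t+1} - V_t."""
    values = np.append(values, next_value)
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        gae = delta + gamma * gae_lambda * non_terminal * gae
        advantages[t] = gae
    return advantages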
def train(cfg):
    print('Start to train!\n')
    envs = make_envs(num_envs=16, env_name="CartPole-v0")
    state_dim = envs.observation_space.shape[0]
    action_dim = envs.action_space.n
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = A2C(state_dim, action_dim, hidden_dim=256)
    # moving_average_rewards = []
    # ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    state = envs.reset()
    for i_episode in range(1, cfg.train_eps + 1):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        # Collect a fixed-length rollout from the vectorized environments.
        for i_step in range(1, cfg.train_steps + 1):
            state = torch.FloatTensor(state).to(device)
            dist, value = agent.model(state)
            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())
            state = next_state
            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()
            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        if i_episode % 20 == 0:
            print("reward", test_env(agent, device='cpu'))
        # Bootstrap from the value of the last state and compute returns.
        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = agent.model(next_state)
        returns = agent.compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        agent.optimizer.zero_grad()
        loss.backward()
        agent.optimizer.step()
    for _ in range(100):
        print("test_reward", test_env(agent, device='cpu'))
    # print('Episode:', i_episode, ' Reward: %i' %
    #       int(ep_reward[0]), 'n_steps:', i_step)
    # ep_steps.append(i_step)
    # rewards.append(ep_reward)
    # if i_episode == 1:
    #     moving_average_rewards.append(ep_reward[0])
    # else:
    #     moving_average_rewards.append(
    #         0.9 * moving_average_rewards[-1] + 0.1 * ep_reward[0])
    # writer.add_scalars('rewards', {'raw': rewards[-1], 'moving_average': moving_average_rewards[-1]}, i_episode)
    # writer.add_scalar('steps_of_each_episode',
    #                   ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    '''Save the model'''
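# Hedged sketch of agent.compute_returns as used in the loop above: discounted
# bootstrap returns over the rollout, with done masks cutting the recursion at
# episode boundaries. This mirrors the common A2C recipe; the actual method on
# this A2C class (and its gamma) may differ.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Walk the rollout backwards: R_t = r_t + gamma * mask_t * R_{t+1}."""
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns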