def set_policy(self):
    env = gym.make(self.cfg['env_name'])
    set_seed(self.cfg['seed'])
    torch.set_default_tensor_type('torch.DoubleTensor')
    state_dim, action_dim, is_disc_action = get_gym_info(env)
    running_state = ZFilter((state_dim,), clip=5)
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, action_dim)
    else:
        policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=self.cfg['log_std'])
    return env, running_state, policy_net
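# ZFilter is assumed here to be the usual running mean/std observation
# normalizer with output clipping, matching ZFilter((state_dim,), clip=5)
# above. A minimal, self-contained sketch of that behavior; the class name
# RunningZFilter is hypothetical and is not the project's implementation.
import numpy as np

class RunningZFilter:
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)      # running sum of squared deviations (Welford)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n
        self.m2 = self.m2 + delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)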
def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env

state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and their optimizers
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, action_dim)
else:
    policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=args.log_std)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.lr_policy)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.lr_value)

cfg = Cfg(parse=args)
agent = ActorCriticAgent("A2c" + args.dis, env_factory,
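# A minimal sketch of how a seeded env_factory like the one above is usually
# consumed: each sampler worker builds its own environment with a distinct
# seed and rolls out the current policy. The rollout loop and the
# policy_net.select_action interface are assumptions for illustration, not
# the project's ActorCriticAgent internals.
def collect_rollout(thread_id, policy_net, running_state, horizon=1000):
    env = env_factory(thread_id)              # per-worker env, seed = args.seed + thread_id
    state = running_state(env.reset())
    trajectory = []
    for _ in range(horizon):
        action = policy_net.select_action(state)        # assumed policy interface
        next_state, reward, done, _ = env.step(action)  # old gym 4-tuple API, matching env.seed() above
        trajectory.append((state, action, reward, done))
        state = running_state(env.reset()) if done else running_state(next_state)
    return trajectory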
torch.set_default_tensor_type('torch.DoubleTensor')
set_seed(args.seed)

def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env

state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and their optimizers
assert not is_disc_action
policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=args.log_std)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

cfg = Cfg(parse=args)
cfg["estimate_adv_and_target"] = False
agent = ActorCriticAgent("TSac" + args.dis,
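# A generic, self-contained sketch of how optimizer_policy / optimizer_value
# are typically driven in an advantage actor-critic update. The loss terms
# below are the standard policy-gradient surrogate and critic regression;
# they are illustrative and not taken from the project's updater code.
def actor_critic_step(log_probs, values, returns, optimizer_policy, optimizer_value):
    advantages = (returns - values).detach()            # stop critic gradients in the actor loss
    policy_loss = -(log_probs * advantages).mean()      # REINFORCE-style surrogate
    value_loss = (values - returns).pow(2).mean()       # regression to the return targets

    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()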
set_seed(args.seed)

def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env

state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and their optimizers
assert not is_disc_action
policy_net = AdditiveDiagnormalPolicy(state_dim, action_dim, args.policy_num, args.alpha, args.gpu)
temporary_policy = DiagnormalPolicy(state_dim, action_dim)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    temporary_policy = temporary_policy.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(temporary_policy.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

cfg = Cfg(parse=args)
agent = ActorCriticAgent("GbPPO" + args.dis, env_factory, temporary_policy, value_net, cfg,
                         running_state=running_state)
gbpo_ppo = GbpoUpdater(policy_net, temporary_policy, value_net,
                       optimizer_policy, optimizer_value, cfg)
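# The AdditiveDiagnormalPolicy / temporary_policy pairing above suggests a
# boosting-style scheme: only temporary_policy is optimized, and it is then
# folded into an ensemble of args.policy_num components weighted by args.alpha.
# A hypothetical illustration of how such an additive (mixture-of-Gaussians)
# policy could sample an action; the real class interface is not shown here.
import numpy as np

def sample_additive_gaussian(means, log_stds, weights, rng=np.random):
    weights = np.asarray(weights, dtype=np.float64)
    weights = weights / weights.sum()                    # normalize mixture weights
    i = rng.choice(len(weights), p=weights)              # pick one diagonal-Gaussian component
    return means[i] + np.exp(log_stds[i]) * rng.standard_normal(np.shape(means[i]))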
def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env

set_seed(args.seed)
torch.set_default_tensor_type('torch.DoubleTensor')
state_dim, action_dim, _ = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and control variate
hidden = [10 * state_dim, math.ceil(math.sqrt(50 * state_dim)), 5]
policy_net = DiagnormalPolicy(state_dim, action_dim, hidden=hidden, log_std=args.log_std)
value_net = ValueFunction(state_dim)
del hidden

if args.variate == 'linear':
    variate_net = LinearVariate(state_dim, action_dim)
elif args.variate == 'quadratic':
    variate_net = QuadraticVariate(state_dim, action_dim)
else:
    variate_net = MlpVariate(state_dim, action_dim)

nets = {'policy': policy_net, 'value': value_net, 'variate': variate_net}
if use_gpu and args.gpu:
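# Worked example of the hidden-layer sizing rule above. The choice
# state_dim = 17 is purely illustrative (a typical MuJoCo locomotion
# observation size), not a value taken from the script:
#   hidden = [10 * 17, ceil(sqrt(50 * 17)), 5]
#          = [170, ceil(sqrt(850)), 5]
#          = [170, 30, 5]
import math
example_state_dim = 17
print([10 * example_state_dim,
       math.ceil(math.sqrt(50 * example_state_dim)),
       5])                                               # -> [170, 30, 5]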