Example #1
    def set_policy(self):
        env = gym.make(self.cfg['env_name'])
        set_seed(self.cfg['seed'])
        torch.set_default_tensor_type('torch.DoubleTensor')
        state_dim, action_dim, is_disc_action = get_gym_info(env)
        running_state = ZFilter((state_dim,), clip=5)

        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, action_dim)
        else:
            policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=self.cfg['log_std'])
        return env, running_state, policy_net
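
All of these snippets normalize observations with ZFilter((state_dim,), clip=5). The project's ZFilter is not shown here; the code below is a minimal sketch of a running mean/std observation filter with clipping, which is what the constructor arguments suggest. The class name RunningZFilter and the Welford-style update are assumptions, not the project's implementation.

import numpy as np

class RunningZFilter:
    def __init__(self, shape, clip=5.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # running sum of squared deviations (Welford)
        self.clip = clip

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n
        self.m2 = self.m2 + delta * (x - self.mean)
        std = np.sqrt(self.m2 / self.n) if self.n > 1 else np.ones_like(self.mean)
        return np.clip((x - self.mean) / (std + 1e-8), -self.clip, self.clip)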
Example #2

def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env


state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim, ), clip=5)

# Define actor, critic and their optimizers
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, action_dim)
else:
    policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=args.log_std)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.lr_policy)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.lr_value)

cfg = Cfg(parse=args)
agent = ActorCriticAgent("A2c" + args.dis,
                         env_factory,
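
get_gym_info appears in every snippet, called either on an env instance (Example #1) or on a factory taking a thread id (this example), but its body is not shown. A plausible sketch, assuming it simply inspects the environment's observation and action spaces; the function body below is a guess, only the return signature comes from the snippets.

import gym

def get_gym_info_sketch(env_or_factory):
    # Accept either an env instance or a factory taking a thread id,
    # mirroring the two call styles seen in the examples.
    env = env_or_factory if isinstance(env_or_factory, gym.Env) else env_or_factory(0)
    state_dim = env.observation_space.shape[0]
    is_disc_action = isinstance(env.action_space, gym.spaces.Discrete)
    action_dim = env.action_space.n if is_disc_action else env.action_space.shape[0]
    return state_dim, action_dim, is_disc_action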
Example #3
torch.set_default_tensor_type('torch.DoubleTensor')
set_seed(args.seed)


def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env


state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim, ), clip=5)

# Define actor, critic and their optimizers
assert not is_disc_action
policy_net = DiagnormalPolicy(state_dim, action_dim, log_std=args.log_std)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)

cfg = Cfg(parse=args)
cfg["estimate_adv_and_target"] = False
agent = ActorCriticAgent("TSac" + args.dis,
Example #4
set_seed(args.seed)


def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env


state_dim, action_dim, is_disc_action = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and their optimizers
assert not is_disc_action
policy_net = AdditiveDiagnormalPolicy(state_dim, action_dim, args.policy_num, args.alpha, args.gpu)
temporary_policy = DiagnormalPolicy(state_dim, action_dim)
value_net = ValueFunction(state_dim)

device = torch.device("cuda" if use_gpu and args.gpu else "cpu")
if use_gpu and args.gpu:
    policy_net = policy_net.to(device)
    temporary_policy = temporary_policy.to(device)
    value_net = value_net.to(device)

optimizer_policy = torch.optim.Adam(temporary_policy.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

cfg = Cfg(parse=args)
agent = ActorCriticAgent("GbPPO" + args.dis, env_factory, temporary_policy,
                         value_net, cfg, running_state=running_state)
gbpo_ppo = GbpoUpdater(policy_net, temporary_policy, value_net, optimizer_policy, optimizer_value, cfg)
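
Every example also pairs the actor with a ValueFunction(state_dim) critic that gets its own Adam optimizer. A minimal sketch of such a state-value network, assuming a small MLP ending in a single scalar output; the project's hidden sizes are not shown, so the ones below are placeholders.

import torch.nn as nn

class ValueFunctionSketch(nn.Module):
    def __init__(self, state_dim, hidden=(64, 64)):
        super().__init__()
        layers, last = [], state_dim
        for h in hidden:
            layers += [nn.Linear(last, h), nn.Tanh()]
            last = h
        layers.append(nn.Linear(last, 1))   # scalar V(s)
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        return self.net(state)

Its parameters can then be handed to torch.optim.Adam independently of the policy optimizer, exactly as the snippets above do.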
Example #5

def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)
    return env


set_seed(args.seed)
torch.set_default_tensor_type('torch.DoubleTensor')
state_dim, action_dim, _ = get_gym_info(env_factory)
running_state = ZFilter((state_dim,), clip=5)

# Define actor, critic and discriminator
hidden = [10 * state_dim, math.ceil(math.sqrt(50 * state_dim)), 5]
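# For example, with state_dim = 17 this gives hidden = [170, ceil(sqrt(850)) = 30, 5].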
policy_net = DiagnormalPolicy(state_dim, action_dim, hidden=hidden, log_std=args.log_std)
value_net = ValueFunction(state_dim)
del hidden

if args.variate == 'linear':
    variate_net = LinearVariate(state_dim, action_dim)
elif args.variate == 'quadratic':
    variate_net = QuadraticVariate(state_dim, action_dim)
else:
    variate_net = MlpVariate(state_dim, action_dim)

nets = {'policy': policy_net,
        'value': value_net,
        'variate': variate_net}

if use_gpu and args.gpu:
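
Example #5 chooses between LinearVariate, QuadraticVariate, and MlpVariate networks, whose implementations are not shown. Assuming these are action-dependent control variates phi(s, a), a sketch of the linear case (linear in the action, with state-dependent coefficients) could look like the following; the class below is a hypothetical illustration, not the project's LinearVariate.

import torch.nn as nn

class LinearVariateSketch(nn.Module):
    # Control variate that is linear in the action: phi(s, a) = w(s) . a + b(s).
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.body = nn.Sequential(nn.Linear(state_dim, hidden), nn.Tanh())
        self.weight_head = nn.Linear(hidden, action_dim)
        self.bias_head = nn.Linear(hidden, 1)

    def forward(self, state, action):
        h = self.body(state)
        return (self.weight_head(h) * action).sum(dim=-1, keepdim=True) + self.bias_head(h)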