Example #1
dynamics = DynamicsModel(model_net,
                         buffer,
                         model_loss_fnc,
                         model_opt,
                         env,
                         model_type='diff')

policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(),
                       lr=learning_rates['value_lr'],
                       weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic = ValueFunction(value_net, value_opt, target_net=True, tau=tau)

# Gather data and train
# ~~~~~~~~~~~~~~~~~~~~~
global_step = buffer.populate_randomly(env, 0.1)
dynamics.train_model(20, params['minibatch_size'], noise_std=0.001)
buffer.empty()

for episode in tqdm(range(num_episodes)):
    episode_step = 0
    episode_reward = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
        with torch.no_grad():
            action = torch.clamp(
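
The model_type='diff' argument above suggests that the dynamics network is trained to predict the change in state rather than the next state itself. A minimal, hypothetical sketch of that convention with a toy network (the real DynamicsModel and model_net are not shown in this excerpt):

import torch
import torch.nn as nn

# Toy stand-in: 3-dim state, 1-dim action; shapes are illustrative only.
model_net = nn.Linear(3 + 1, 3)
state = torch.randn(3)
action = torch.randn(1)
delta = model_net(torch.cat([state, action]))   # network predicts the state delta
next_state_pred = state + delta                 # 'diff' mode: next_state = state + delta
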
Example #2
    ]
}
learning_rates = dict(policy_lr=1e-4, value_lr=1e-3)
critic_loss_fnc = torch.nn.SmoothL1Loss()
wandb.config.update(network_layers)
wandb.config.update(learning_rates)

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net = SequentialNetwork(network_layers['value_layers'])
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(), lr=learning_rates['value_lr'])
actor = PolicyFunction(policy_net, policy_opt)
critic = ValueFunction(value_net, value_opt)
buffer = ReplayMemory(params['buffer_size'])

# Training loop
# ~~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
        action, action_log_prob = actor.softmax_action(state)
        next_state, reward, terminal, _ = env.env.step(action)
        wandb.log({'reward': reward, 'step': global_step, 'episode': episode})
        episode_step += 1
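
actor.softmax_action(state) above returns both an action and its log-probability, which points to sampling from a categorical distribution over the policy network's logits. A small illustrative sketch of that step (not the repo's PolicyFunction code):

import torch

logits = torch.nn.Linear(4, 2)(torch.randn(4))        # toy policy output: 2 discrete actions
dist = torch.distributions.Categorical(logits=logits)
action = dist.sample()                                # sampled action index
action_log_prob = dist.log_prob(action)               # log pi(a|s), used by the policy update
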
Example #3
tau = 0.001
wandb.config.update(learning_rates)
wandb.config.critic_loss_fnc = critic_loss_fnc
wandb.config.tau = tau

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = GaussianPolicy(env)
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(), lr=learning_rates['value_lr'])
value_opt2 = optim.Adam(value_net2.parameters(), lr=learning_rates['value_lr'])
actor = SACPolicy(policy_net, policy_opt)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Training loop
# ~~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
        with torch.no_grad():
            action, action_log_prob = actor.action_selection(
                torch.from_numpy(state).float())
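
GaussianPolicy plus an action_selection call that returns (action, action_log_prob) is the usual SAC pattern: a tanh-squashed Gaussian sampled with the reparameterisation trick. A minimal sketch of that sampling step with toy mean/log-std values (illustrative, not the repo's implementation):

import torch

mean, log_std = torch.zeros(1), torch.zeros(1)         # toy policy head outputs
dist = torch.distributions.Normal(mean, log_std.exp())
u = dist.rsample()                                      # reparameterised sample
action = torch.tanh(u)                                  # squash into [-1, 1]
# change-of-variables correction so the log-prob matches the squashed action
action_log_prob = dist.log_prob(u) - torch.log(1 - action.pow(2) + 1e-6)
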
Example #4
critic_loss_fnc = torch.nn.MSELoss()
tau = 0.001
wandb.config.update(learning_rates)
wandb.config.tau = tau

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = DeterministicPolicy(env)
value_net = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt = optim.Adam(value_net.parameters(),
                       lr=learning_rates['value_lr'],
                       weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic = ValueFunction(value_net, value_opt, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Training loop
# ~~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
        with torch.no_grad():
            action = torch.clamp(
                actor.get_policy(state) +
                action_noise.sample(env.action_high.shape), -1, 1)
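
The recurring target_net=True, tau=tau arguments indicate Polyak-averaged target networks: after each update the target parameters move a small step (tau) towards the online parameters. A self-contained sketch of that soft-update rule in plain PyTorch (not the repo's ValueFunction/PolicyFunction code):

import copy
import torch

def soft_update(target, online, tau):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    with torch.no_grad():
        for t, o in zip(target.parameters(), online.parameters()):
            t.mul_(1.0 - tau).add_(tau * o)

online_net = torch.nn.Linear(4, 2)
target_net = copy.deepcopy(online_net)
soft_update(target_net, online_net, tau=0.001)
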
Example #5
                          model_type='diff')
dynamics = [dynamics1, dynamics2]

policy_net = SequentialNetwork(network_layers['policy_layers'])
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(),
                        lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
value_opt2 = optim.Adam(value_net2.parameters(),
                        lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
critics = [critic1, critic2]

# Gather data and train
# ~~~~~~~~~~~~~~~~~~~~~
global_step = buffer.populate_randomly(env, 0.1)
for dynamic in dynamics:
    dynamic.train_model(20, params['minibatch_size'], noise_std=0.001)

for episode in tqdm(range(num_episodes)):
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
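
Example #5 trains two dynamics models and keeps them in a list, i.e. a small ensemble. How this repository combines their predictions is not shown in the excerpt; the following is a purely illustrative sketch of two common uses of such an ensemble, averaging the predictions and using their spread as an uncertainty signal:

import torch
import torch.nn as nn

# Two toy dynamics networks standing in for dynamics1/dynamics2 (3-dim state, 1-dim action).
ensemble = [nn.Linear(3 + 1, 3), nn.Linear(3 + 1, 3)]
state, action = torch.randn(3), torch.randn(1)
inp = torch.cat([state, action])
preds = torch.stack([net(inp) for net in ensemble])
mean_pred = preds.mean(dim=0)      # e.g. average the ensemble predictions
disagreement = preds.std(dim=0)    # ensemble spread as a rough uncertainty estimate
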
Example #6
tau = 0.001
policy_delay = 2
wandb.config.update(learning_rates)
wandb.config.tau = tau
wandb.config.policy_delay = policy_delay

# Initialisation
# ~~~~~~~~~~~~~~
policy_net = DeterministicPolicy(env)
value_net1 = QnetContinuousActions(env)
value_net2 = QnetContinuousActions(env)
policy_opt = optim.Adam(policy_net.parameters(),
                        lr=learning_rates['policy_lr'])
value_opt1 = optim.Adam(value_net1.parameters(),
                        lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
value_opt2 = optim.Adam(value_net2.parameters(),
                        lr=learning_rates['value_lr'],
                        weight_decay=1e-2)
actor = PolicyFunction(policy_net, policy_opt, target_net=True, tau=tau)
critic1 = ValueFunction(value_net1, value_opt1, target_net=True, tau=tau)
critic2 = ValueFunction(value_net2, value_opt2, target_net=True, tau=tau)
buffer = ReplayMemory(params['buffer_size'])

# Training loop
# ~~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    # env = gym.make('Pendulum-v0')
    # if episode % 50 == 0:
    #     env = gym.wrappers.Monitor(env, f'./td3_video/{episode}', force=True)
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
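
policy_delay = 2 is TD3's delayed policy update interval: the critics are updated every step, while the actor (and, typically, the target networks) are updated only every policy_delay critic updates. A compact, self-contained sketch of that scheduling with toy networks and a dummy TD target (not the repo's update code):

import torch

actor = torch.nn.Linear(3, 1)
critic = torch.nn.Linear(4, 1)
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_opt = torch.optim.Adam(critic.parameters(), lr=1e-3)
policy_delay = 2

for step in range(6):
    state = torch.randn(8, 3)
    with torch.no_grad():
        action = actor(state)
    # critic update on every step (zero TD target, purely illustrative)
    q = critic(torch.cat([state, action], dim=1))
    critic_loss = torch.nn.functional.mse_loss(q, torch.zeros_like(q))
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()
    if step % policy_delay == 0:
        # delayed actor update: ascend Q(s, pi(s))
        actor_loss = -critic(torch.cat([state, actor(state)], dim=1)).mean()
        actor_opt.zero_grad()
        actor_loss.backward()
        actor_opt.step()
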
Example #7
                                  nn.ReLU(),
                                  nn.Linear(32, 64),
                                  nn.ReLU(),
                                  nn.Linear(64, env.action_size)],
                  'dueling_layers': [env.obs_size, 64, 128, env.action_size]}
learning_rates = {'Qnet_lr': 2e-4}
loss_function = torch.nn.SmoothL1Loss()
wandb.config.update(network_layers)
wandb.config.update(learning_rates)

# Initialisation
# ~~~~~~~~~~~~~~
net = SequentialNetwork(network_layers['Qnet_layers'])
wandb.config.value_layers = net.layers
opt = optim.Adam(net.parameters(), lr=learning_rates['Qnet_lr'])
Qnet = ValueFunction(net, opt, epsilon=epsilon, target_net=True)
buffer = ReplayMemory(params['buffer_size'])

# Training loop
# ~~~~~~~~~~~~~
global_step = 0
for episode in tqdm(range(num_episodes)):
    episode_reward = 0
    episode_step = 0
    state = env.env.reset()
    terminal = False
    while not terminal:
        action = Qnet.epsilon_greedy_action(state, episode)
        next_state, reward, terminal, _ = env.env.step(action)
        wandb.log({'reward': reward, 'step': global_step, 'episode': episode})
        episode_step += 1
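
Qnet.epsilon_greedy_action(state, episode) takes the episode index, which suggests an epsilon that is annealed over episodes. A hypothetical sketch of epsilon-greedy selection with a simple exponential decay (the actual schedule inside ValueFunction is not shown in this excerpt):

import math
import random
import torch

def epsilon_greedy_action(q_net, state, episode, eps_start=1.0, eps_end=0.05, decay=200):
    # Anneal epsilon with the episode index, then act greedily with probability 1 - epsilon.
    epsilon = eps_end + (eps_start - eps_end) * math.exp(-episode / decay)
    if random.random() < epsilon:
        return random.randrange(q_net.out_features)
    with torch.no_grad():
        return int(q_net(torch.as_tensor(state, dtype=torch.float32)).argmax())

q_net = torch.nn.Linear(4, 2)    # toy Q-network: 4-dim observation, 2 actions
print(epsilon_greedy_action(q_net, [0.1, 0.0, -0.2, 0.3], episode=10))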