import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Hard-copy the value-network weights into the target value network
# (value_net, target_value_net, soft_q_net1/2, policy_net and env are defined elsewhere).
for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)

value_criterion = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)
soft_q_optimizer2 = optim.Adam(soft_q_net2.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)

replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

max_frames = 40000
max_steps = 500
frame_idx = 0
rewards = []
batch_size = 128

# Main training loop (only the start of the loop body is shown in this snippet).
while frame_idx < max_frames:
    state = env.reset()
    episode_reward = 0
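# The ReplayBuffer referenced above is not defined in this snippet. A minimal sketch of
# the interface it is assumed to expose (push / sample / __len__), modelled on common
# experience-replay implementations:
import random

import numpy as np


class ReplayBuffer:
    """Fixed-size cyclic buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        # Overwrite the oldest entry once the buffer is full.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)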
def run_iterations(args):
    # Initialise the model for the chosen environment.
    state_size = 16
    action_size = 4
    if args.env == "MountainCar-v0":
        state_size = 2
        action_size = 3
    if args.env == "Freeway-ram-v0":
        state_size = 128
        action_size = 3
    if args.env == "CartPole-v0":
        state_size = 4
        action_size = 2

    model = PolicyNetwork(state_size, action_size)
    optimizer = optim.Adam(model.parameters(), args.learning_rate)

    start_n = 4
    reward_per_iteration = []

    for i in range(args.max_iterations):
        if not args.demo:
            # Without a demo, start from the environment's default reset state.
            state = to_tensor(ENV.reset(), state_size)
        else:
            # Take the start_n-th state from the demo and use it as the start state.
            # Open question: how to choose start_n together with max_iterations --
            # run each start state a few times, or only once?
            start_state = get_start_state(ENV, args.env, start_n)
            # One environment exposes ENV.env.s and the other ENV.env.state;
            # there may be a more elegant solution.
            if args.env == "FrozenLake-v0":
                ENV.env.s = start_state
                state = to_tensor(ENV.env.s, state_size)
            else:
                ENV.env.state = start_state
                state = to_tensor(ENV.env.state, state_size)

        reward_per_episode = []
        episode_loss = 0

        for step in range(args.max_steps):
            if args.render:
                ENV.render()

            # Epsilon-greedy action selection, then take the action in the environment.
            action = select_action(model, state, get_epsilon(i), action_size)
            next_state, reward, done, _ = ENV.step(action)

            # Compute the Q-value of the chosen action.
            q_val = compute_q_val(model, state, action)

            with torch.no_grad():
                # Don't compute gradient info for the target (semi-gradient update).
                next_state = to_tensor(next_state, state_size)
                target = compute_target(model, reward, next_state, done, args.discount_factor)

            # Loss is measured from the error between the current and newly expected Q-values.
            loss = F.smooth_l1_loss(q_val, target)

            # Backpropagate the loss through the network.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            episode_loss += loss.item()
            state = next_state
            reward_per_episode.append(reward)

            if done:
                break

        if i % args.print_every == 0:
            print("Reward", reward, sum(reward_per_episode))
            print("Iteration {:6d} with loss: {:4f}".format(i, episode_loss))

        reward_per_iteration.append(reward_per_episode)

    return reward_per_iteration
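# The helpers compute_q_val, compute_target and select_action called in run_iterations are
# not defined in this snippet. Minimal sketches, under the assumption that to_tensor returns
# a 1-D state tensor and that PolicyNetwork maps a state to a vector of Q-values:
import random

import torch


def compute_q_val(model, state, action):
    # Q-value of the chosen action in the given state.
    return model(state)[action]


def compute_target(model, reward, next_state, done, discount_factor):
    # One-step TD target: r + gamma * max_a Q(s', a); no bootstrapping on terminal states.
    max_next_q = model(next_state).max()
    return reward + discount_factor * max_next_q * (1 - int(done))


def select_action(model, state, epsilon, action_size):
    # Epsilon-greedy action selection over the model's Q-values.
    if random.random() < epsilon:
        return random.randrange(action_size)
    with torch.no_grad():
        return model(state).argmax().item()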