import os

import torch
import torch.optim as optim


def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    """Restore the policy network and its optimizer from a saved checkpoint."""
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch),
                            map_location=device)

    # Rebuild the policy network and restore its weights
    policy_net = PolicyNet(layer_sizes).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()

    # Recreate the optimizer with the saved learning rate, then restore its state
    policy_lr = checkpoint["policy_lr"]
    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])

    # Strip the entries unpacked above; whatever remains in the dict
    # (e.g. logged statistics) is handed back to the caller as-is
    checkpoint.pop("policy_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")

    return policy_net, policynet_optim, checkpoint
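For orientation, a call might look like the sketch below. The directory, epoch index, and layer sizes are hypothetical placeholders, and PolicyNet is assumed to take the same layer_sizes list that was used when the checkpoint was saved:

# Hypothetical usage -- the path, epoch number, and sizes are placeholders,
# not values from the original training run
policy_net, policynet_optim, training_info = load_checkpoint(
    file_dir="./checkpoints",   # assumed checkpoint directory
    i_epoch=100,                # epoch for which ckpt_eps100.pt exists
    layer_sizes=[4, 128, 2],    # must match the architecture at save time
    input_size=4,               # note: unused inside the function body
    device='cpu')               # load on CPU if no GPU is available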
episode_rewards = []

for i_episode in range(batch_size):
    # Keep track of the running reward
    running_reward = 0

    # Initialize the environment and state
    current_state = env.reset()

    # Store the first state and value estimate in memory
    memory.set_initial_state(current_state)

    for t in count():
        # Make sure that the policy net is in training mode
        policy_net.train()

        # Sample an action given the current state
        action, log_prob = policy_net(
            torch.tensor([current_state], device=device))
        log_prob = log_prob.squeeze()

        # Interact with the environment
        next_state, reward, done, _ = env.step(action.item())
        running_reward += reward

        # Render this episode
        if render and (render_each_episode or (not finished_rendering_this_epoch)):
            env.render()
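The excerpt cuts off inside the step loop. For orientation, a typical continuation of the loop body is sketched below; memory.add_transition is a hypothetical name standing in for whatever transition-storage call the full script uses (it is not shown in the original), and the render-flag update is an assumption inferred from the condition above:

        # Sketch of how the step loop typically concludes; not part of the
        # original excerpt
        memory.add_transition(log_prob, reward)  # hypothetical helper; the
                                                 # actual storage call is not shown
        current_state = next_state               # advance to the next state

        if done:
            # Record the episode's total reward and end this episode
            episode_rewards.append(running_reward)
            # Assumed bookkeeping for the render flag used above
            if not render_each_episode:
                finished_rendering_this_epoch = True
            break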