# LOGGING
writer = SummaryWriter(comment="-" + params["run_name"] + "-noisy")

# NETWORK
net = dqn_noisy_net.Network(env.observation_space.shape,
                            env.action_space.n).to(device)
tgt_net = agents.TargetNetwork(net)

# AGENT
selector = actions.ArgmaxActionSelector()
agent = agents.DQNAgent(net, selector, device=device)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(
    env, agent, gamma=params["gamma"])  # increase the number of steps for the runner
buffer = ExperienceReplayBuffer(exp_source, buffer_size=params["replay_size"])

optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])
frame_idx = 0

# TRAIN
with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)

        new_rewards = exp_source.pop_total_rewards()
        if new_rewards:
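# --- Sketch (not part of the listing above): a minimal factorized-Gaussian
# noisy linear layer, the kind of building block a module such as
# dqn_noisy_net.Network is typically assembled from. Illustration only; the
# repository's actual layer may differ in initialization and noise handling.
import math
import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    def __init__(self, in_features, out_features, sigma_init=0.5, bias=True):
        super().__init__(in_features, out_features, bias=bias)
        # Learnable per-weight noise scales; factorized noise keeps sampling
        # cheap by drawing one vector per input and one per output.
        self.sigma_weight = nn.Parameter(
            torch.full((out_features, in_features),
                       sigma_init / math.sqrt(in_features)))
        self.register_buffer("eps_in", torch.zeros(1, in_features))
        self.register_buffer("eps_out", torch.zeros(out_features, 1))
        if bias:
            self.sigma_bias = nn.Parameter(
                torch.full((out_features,), sigma_init / math.sqrt(in_features)))

    @staticmethod
    def _scale_noise(x):
        return x.sign() * x.abs().sqrt()

    def forward(self, x):
        # Resample factorized noise on every forward pass.
        self.eps_in.normal_()
        self.eps_out.normal_()
        eps_w = self._scale_noise(self.eps_out) @ self._scale_noise(self.eps_in)
        bias = self.bias
        if bias is not None:
            bias = bias + self.sigma_bias * self._scale_noise(self.eps_out.squeeze(1))
        return nn.functional.linear(x, self.weight + self.sigma_weight * eps_w, bias)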
env = gym.make(ENV_ID)
test_env = gym.make(ENV_ID)

# NETWORK
act_net = ddpg_mlp.DDPGActor(env.observation_space.shape[0],
                             env.action_space.shape[0]).to(device)
crt_net = ddpg_mlp.DDPGCritic(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
print(act_net)
print(crt_net)
tgt_act_net = ptan.agent.TargetNet(act_net)
tgt_crt_net = ptan.agent.TargetNet(crt_net)

# LOGGING
writer = SummaryWriter(comment="-ddpg_" + args.name)

# AGENT
agent = agents.AgentDDPG(act_net, device=device)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(env, agent, gamma=GAMMA, steps_count=1)
buffer = memory.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)

act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

frame_idx = 0
best_reward = None

# TRAIN
with logger.RewardTracker(act_net, writer, 200) as tracker:
    with ptan.common.utils.TBMeanTracker(writer, batch_size=10) as tb_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            rewards_steps = exp_source.pop_rewards_steps()
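# --- Sketch (not part of the listing above): a rough guess at what the
# ddpg_mlp actor/critic pair referenced above could look like. Assumed
# shapes: the actor maps states to tanh-squashed actions, the critic scores
# state-action pairs. The real ddpg_mlp module may use different layer
# sizes or activations.
import torch
import torch.nn as nn

class DDPGActor(nn.Module):
    def __init__(self, obs_size, act_size, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, act_size), nn.Tanh(),  # actions in [-1, 1]
        )

    def forward(self, obs):
        return self.net(obs)

class DDPGCritic(nn.Module):
    def __init__(self, obs_size, act_size, hidden=256):
        super().__init__()
        self.obs_net = nn.Sequential(nn.Linear(obs_size, hidden), nn.ReLU())
        # The action is concatenated after the first observation layer.
        self.out_net = nn.Sequential(
            nn.Linear(hidden + act_size, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, obs, act):
        return self.out_net(torch.cat([self.obs_net(obs), act], dim=1))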
# NETWORK
net = dqn_mlp_net.Network(observation_space, action_space,
                          hidden_layer_size=64).to(device)
tgt_net = agents.TargetNetwork(net)

# AGENT
selector = actions.EpsilonGreedyActionSelector(
    epsilon=params["epsilon_start"])
epsilon_tracker = logger.EpsilonTracker(selector, params)
agent = agents.DQNAgent(net, selector, device=device)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(env, agent,
                                          gamma=params["gamma"], steps_count=1)
buffer = ExperienceReplayBuffer(exp_source, buffer_size=params["replay_size"])

optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])
frame_idx = 0
done = False

# TRAIN
with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
    while True:
        frame_idx += 1
        buffer.populate(1)
        epsilon_tracker.frame(frame_idx)
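# --- Sketch (not part of the listing above): a linear epsilon-decay tracker
# in the spirit of logger.EpsilonTracker used above. Hypothetical class;
# it assumes params contains "epsilon_start", "epsilon_final" and
# "epsilon_frames", and the repository's real tracker may differ.
class LinearEpsilonTracker:
    def __init__(self, selector, params):
        self.selector = selector
        self.start = params["epsilon_start"]
        self.final = params["epsilon_final"]
        self.frames = params["epsilon_frames"]

    def frame(self, frame_idx):
        # Anneal epsilon linearly with the frame index, then clamp at the
        # final value so exploration never drops below it.
        eps = self.start - frame_idx / self.frames * (self.start - self.final)
        self.selector.epsilon = max(self.final, eps)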
# LOGGING
writer = SummaryWriter(comment="-" + params["run_name"] + "-reinforce")

# NETWORK
net = dqn_mlp_net.Network(observation_space, action_space,
                          hidden_layer_size=64).to(device)

# AGENT
agent = agents.PolicyGradientAgent(net,
                                   preprocessor=utils.float32_preprocessor,
                                   apply_softmax=True)

# RUNNER
exp_source = runner.RunnerSourceFirstLast(env, agent, gamma=params["gamma"])

optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

total_rewards = []
step_idx = 0
done_episodes = 0
batch_episodes = 0
batch_states, batch_actions, batch_qvals = [], [], []
cur_rewards = []

with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
    for step_idx, exp in enumerate(exp_source):
        batch_states.append(exp.state)
        batch_actions.append(int(exp.action))
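# --- Sketch (not part of the listing above): how the per-episode rewards
# collected in cur_rewards are usually turned into the discounted returns
# stored in batch_qvals for REINFORCE. Hypothetical helper; the script's own
# function may be named or structured differently.
def calc_qvals(rewards, gamma):
    # Walk the episode backwards, accumulating the discounted return G_t.
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r = r + gamma * sum_r
        res.append(sum_r)
    return list(reversed(res))

# Example: calc_qvals([1.0, 1.0, 1.0], gamma=0.99) -> [2.9701, 1.99, 1.0]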