Example #1
    # LOGGING
    writer = SummaryWriter(comment="-" + params["run_name"] + "-noisy")

    # NETWORK
    net = dqn_noisy_net.Network(env.observation_space.shape,
                                env.action_space.n).to(device)
    tgt_net = agents.TargetNetwork(net)

    # AGENT
    selector = actions.ArgmaxActionSelector()
    agent = agents.DQNAgent(net, selector, device=device)

    # RUNNER
    exp_source = runner.RunnerSourceFirstLast(
        env, agent,
        gamma=params["gamma"])  # steps_count can be increased here to give the runner n-step transitions
    buffer = ExperienceReplayBuffer(exp_source,
                                    buffer_size=params["replay_size"])
    optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

    frame_idx = 0

    # TRAIN
    with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
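Example #1 relies on noisy layers inside dqn_noisy_net.Network for exploration, which is why a plain ArgmaxActionSelector suffices instead of epsilon-greedy. Below is a minimal, self-contained sketch of such a noisy linear layer (independent Gaussian noise, in the spirit of Fortunato et al.); it is an illustration only, not the actual implementation behind dqn_noisy_net.Network.

    import torch
    import torch.nn as nn

    class NoisyLinear(nn.Linear):
        """nn.Linear whose weights are perturbed by learnable Gaussian noise."""

        def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
            super().__init__(in_features, out_features, bias=bias)
            # learnable noise scale for every weight (and bias)
            self.sigma_weight = nn.Parameter(
                torch.full((out_features, in_features), sigma_init))
            self.register_buffer("epsilon_weight",
                                 torch.zeros(out_features, in_features))
            if bias:
                self.sigma_bias = nn.Parameter(
                    torch.full((out_features,), sigma_init))
                self.register_buffer("epsilon_bias", torch.zeros(out_features))

        def forward(self, x):
            # fresh noise is sampled on every forward pass
            self.epsilon_weight.normal_()
            weight = self.weight + self.sigma_weight * self.epsilon_weight
            bias = self.bias
            if bias is not None:
                self.epsilon_bias.normal_()
                bias = bias + self.sigma_bias * self.epsilon_bias
            return nn.functional.linear(x, weight, bias)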
Example #2
    env = gym.make(ENV_ID)
    test_env = gym.make(ENV_ID)

    act_net = ddpg_mlp.DDPGActor(env.observation_space.shape[0],
                                 env.action_space.shape[0]).to(device)
    crt_net = ddpg_mlp.DDPGCritic(env.observation_space.shape[0],
                                  env.action_space.shape[0]).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment="-ddpg_" + args.name)
    agent = agents.AgentDDPG(act_net, device=device)
    exp_source = runner.RunnerSourceFirstLast(env,
                                              agent,
                                              gamma=GAMMA,
                                              steps_count=1)
    buffer = memory.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None

    with logger.RewardTracker(act_net, writer, 200) as tracker:
        with ptan.common.utils.TBMeanTracker(writer,
                                             batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
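DDPG keeps the two target networks created above close to the online ones with a soft (Polyak) update rather than a hard copy; with ptan this is typically done through tgt_act_net.alpha_sync(alpha=1 - 1e-3). A from-scratch sketch of the same blend is shown below; TAU is an assumed hyperparameter, not part of the snippet.

    import torch
    import torch.nn as nn

    TAU = 1e-3  # fraction of the online weights blended into the target per update

    def soft_update(net: nn.Module, tgt_net: nn.Module, tau: float = TAU) -> None:
        # target <- tau * online + (1 - tau) * target, parameter by parameter
        with torch.no_grad():
            for p, tgt_p in zip(net.parameters(), tgt_net.parameters()):
                tgt_p.mul_(1.0 - tau).add_(tau * p)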
Example #3
    # NETWORK
    net = dqn_mlp_net.Network(observation_space,
                              action_space,
                              hidden_layer_size=64).to(device)
    tgt_net = agents.TargetNetwork(net)

    # AGENT
    selector = actions.EpsilonGreedyActionSelector(
        epsilon=params["epsilon_start"])
    epsilon_tracker = logger.EpsilonTracker(selector, params)
    agent = agents.DQNAgent(net, selector, device=device)

    # RUNNER
    exp_source = runner.RunnerSourceFirstLast(env,
                                              agent,
                                              gamma=params["gamma"],
                                              steps_count=1)
    buffer = ExperienceReplayBuffer(exp_source,
                                    buffer_size=params["replay_size"])
    optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

    frame_idx = 0
    done = False

    # TRAIN
    with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)
            epsilon_tracker.frame(frame_idx)
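logger.EpsilonTracker is responsible for annealing the selector's epsilon as frame_idx grows. A minimal sketch of what such a tracker typically does is shown below, assuming a linear decay from epsilon_start to epsilon_final over epsilon_frames steps (the latter two names are illustrative, not taken from the snippet).

    class LinearEpsilonTracker:
        """Linearly anneal an epsilon-greedy selector's epsilon over time."""

        def __init__(self, selector, epsilon_start, epsilon_final, epsilon_frames):
            self.selector = selector
            self.epsilon_start = epsilon_start
            self.epsilon_final = epsilon_final
            self.epsilon_frames = epsilon_frames

        def frame(self, frame_idx):
            # decay linearly, then clamp at the final exploration rate
            eps = self.epsilon_start - frame_idx / self.epsilon_frames
            self.selector.epsilon = max(self.epsilon_final, eps)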
Example #4
    # LOGGING
    writer = SummaryWriter(comment="-" + params["run_name"] + "-reinforce")

    # NETWORK
    net = dqn_mlp_net.Network(observation_space,
                              action_space,
                              hidden_layer_size=64).to(device)

    # AGENT
    agent = agents.PolicyGradientAgent(net,
                                       preprocessor=utils.float32_preprocessor,
                                       apply_softmax=True)

    # RUNNER
    exp_source = runner.RunnerSourceFirstLast(env,
                                              agent,
                                              gamma=params["gamma"])
    optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

    total_rewards = []
    step_idx = 0
    done_episodes = 0

    batch_episodes = 0
    batch_states, batch_actions, batch_qvals = [], [], []
    cur_rewards = []

    with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
        for step_idx, exp in enumerate(exp_source):
            batch_states.append(exp.state)
            batch_actions.append(int(exp.action))
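Before batch_qvals can be filled, REINFORCE turns the per-step rewards of a finished episode (collected in cur_rewards) into discounted returns. A minimal sketch of that computation follows; the helper name calc_qvals is illustrative and not taken from the snippet.

    def calc_qvals(rewards, gamma):
        """Turn a list of per-step rewards into discounted returns."""
        res = []
        sum_r = 0.0
        # walk the episode backwards, accumulating the discounted sum
        for r in reversed(rewards):
            sum_r = r + gamma * sum_r
            res.append(sum_r)
        return list(reversed(res))

    # e.g. calc_qvals([1.0, 1.0, 1.0], gamma=0.9) -> [2.71, 1.9, 1.0]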