Example #1
    # AGENT
    selector = actions.ArgmaxActionSelector()
    agent = agents.DQNAgent(net, selector, device=device)

    # RUNNER
    exp_source = runner.RunnerSourceFirstLast(
        env, agent,
        gamma=params["gamma"])  # increase the number of steps for the runner
    buffer = ExperienceReplayBuffer(exp_source,
                                    buffer_size=params["replay_size"])
    optimizer = optim.Adam(net.parameters(), lr=params["learning_rate"])

    frame_idx = 0

    # TRAIN
    with logger.RewardTracker(writer, params["stop_reward"]) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)

            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if reward_tracker.reward(new_rewards[0], frame_idx):
                    break

            if len(buffer) < params["replay_initial"]:
                continue

            # learning step
            optimizer.zero_grad()
            batch = buffer.sample(params["batch_size"])
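
The excerpt stops right after a batch has been sampled. A minimal sketch of a DQN loss that could follow is shown below; the target network tgt_net and the FirstLast transition fields (state, action, reward, last_state) are assumptions about the surrounding code, not part of the excerpt.

    import numpy as np
    import torch
    import torch.nn as nn

    def calc_dqn_loss(batch, net, tgt_net, gamma, device="cpu"):
        # stack the transitions into tensors; terminal steps carry
        # last_state=None, so reuse the start state and mask them out later
        states = torch.tensor(np.array([e.state for e in batch],
                                       dtype=np.float32)).to(device)
        actions = torch.tensor([e.action for e in batch]).to(device)
        rewards = torch.tensor([e.reward for e in batch],
                               dtype=torch.float32).to(device)
        dones = torch.tensor([e.last_state is None for e in batch],
                             dtype=torch.bool).to(device)
        last_states = torch.tensor(np.array(
            [e.state if e.last_state is None else e.last_state for e in batch],
            dtype=np.float32)).to(device)

        # Q(s, a) for the actions actually taken
        q_v = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

        # bootstrapped target from the frozen target network
        with torch.no_grad():
            next_q_v = tgt_net(last_states).max(1)[0]
            next_q_v[dones] = 0.0
            target_q_v = rewards + gamma * next_q_v

        return nn.MSELoss()(q_v, target_q_v)

Since optimizer.zero_grad() was already called in the excerpt, the training step would finish with loss.backward() followed by optimizer.step().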
Example #2
File: a3c.py  Project: djbyrne/RL_Workbench
    net.share_memory()

    optimizer = optim.Adam(net.parameters(),
                           lr=params["learning_rate"],
                           eps=1e-3)

    # initialise processes
    train_queue, data_proc_list = init_procs(envs, params)

    batch = []
    step_idx = 0

    # TRAINING
    try:
        with logger.RewardTracker(net,
                                  writer,
                                  stop_reward=params["stop_reward"],
                                  tag="a3c") as tracker:
            while True:
                train_entry = train_queue.get()
                if isinstance(train_entry, TotalReward):
                    if tracker.reward(train_entry.reward, step_idx):
                        break
                    continue

                step_idx += 1
                batch.append(train_entry)

                if len(batch) < params["batch_size"]:
                    continue

                loss_policy, loss_v = calc_a2c_loss(batch, net, params)
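
The excerpt ends at the loss computation. A sketch of the optimizer step that commonly follows inside the same loop is given below; the params["grad_clip"] entry is hypothetical and not taken from the excerpt.

                # continuation sketch (assumed): back-propagate the combined
                # loss, clip gradients, apply the update and start a new batch
                optimizer.zero_grad()
                loss_v = loss_policy + loss_v
                loss_v.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               params["grad_clip"])
                optimizer.step()
                batch.clear()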
Example #3
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    writer = SummaryWriter(comment="-ddpg_" + args.name)
    agent = agents.AgentDDPG(act_net, device=device)
    exp_source = runner.RunnerSourceFirstLast(env,
                                              agent,
                                              gamma=GAMMA,
                                              steps_count=1)
    buffer = memory.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None

    with logger.RewardTracker(act_net, writer, 200) as tracker:
        with ptan.common.utils.TBMeanTracker(writer,
                                             batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < REPLAY_INITIAL:
                    continue

                batch = buffer.sample(BATCH_SIZE)
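
The DDPG excerpt ends at the sampled batch. Below is a minimal sketch of the critic and actor updates plus the soft target sync that typically follow; unpack_batch, tgt_act_net (a TargetNet wrapped around act_net) and F (torch.nn.functional) are assumed helpers that do not appear in the excerpt.

                # continuation sketch with assumed helpers (unpack_batch, tgt_act_net, F)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    unpack_batch(batch, device)

                # critic update: regress Q(s, a) onto r + gamma * Q'(s', mu'(s'))
                crt_opt.zero_grad()
                q_v = crt_net(states_v, actions_v)
                with torch.no_grad():
                    last_act_v = tgt_act_net.target_model(last_states_v)
                    q_last_v = tgt_crt_net.target_model(last_states_v, last_act_v)
                    q_last_v[dones_mask] = 0.0
                    q_ref_v = rewards_v.unsqueeze(-1) + q_last_v * GAMMA
                critic_loss_v = F.mse_loss(q_v, q_ref_v)
                critic_loss_v.backward()
                crt_opt.step()

                # actor update: ascend the critic's estimate of Q(s, mu(s))
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                actor_loss_v = -crt_net(states_v, cur_actions_v).mean()
                actor_loss_v.backward()
                act_opt.step()

                # soft-sync the target networks towards the online ones
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)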
Example #4
                                       apply_softmax=True,
                                       device=device)

    # RUNNER
    exp_source = runner.RunnerSourceFirstLast(envs,
                                              agent,
                                              gamma=params["gamma"],
                                              steps_count=params["step_count"])
    optimizer = optim.Adam(net.parameters(),
                           lr=params["learning_rate"],
                           eps=1e-3)

    batch = []

    # TRAINING
    with logger.RewardTracker(net, writer, stop_reward=195,
                              tag="a2c") as tracker:
        for step_idx, exp in enumerate(exp_source):
            batch.append(exp)

            # handle new rewards
            new_rewards = exp_source.pop_total_rewards()
            if new_rewards:
                if tracker.reward(new_rewards[0], step_idx):
                    break

            if len(batch) < params["batch_size"]:
                continue

            loss_policy, loss_v = calc_a2c_loss(batch, net, params)
            batch.clear()
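
Both A2C/A3C excerpts rely on a calc_a2c_loss helper that is not shown. The sketch below illustrates what such a helper usually computes; unpack_batch_a2c (which would build the n-step bootstrapped returns) and the params["entropy_beta"] entry are assumptions.

    import torch
    import torch.nn.functional as F

    def calc_a2c_loss(batch, net, params, device="cpu"):
        # unpack_batch_a2c is assumed to return states, taken actions and
        # n-step bootstrapped returns Q_ref as tensors
        states_v, actions_v, q_ref_v = unpack_batch_a2c(batch, net, params, device)

        logits_v, value_v = net(states_v)

        # critic: regress V(s) onto the bootstrapped return
        loss_value_v = F.mse_loss(value_v.squeeze(-1), q_ref_v)

        # actor: policy gradient weighted by the advantage A = Q_ref - V(s)
        log_prob_v = F.log_softmax(logits_v, dim=1)
        adv_v = q_ref_v - value_v.squeeze(-1).detach()
        log_prob_actions_v = adv_v * log_prob_v[range(len(batch)), actions_v]
        loss_policy_v = -log_prob_actions_v.mean()

        # entropy bonus discourages premature policy collapse
        prob_v = F.softmax(logits_v, dim=1)
        entropy_loss_v = params["entropy_beta"] * \
            (prob_v * log_prob_v).sum(dim=1).mean()

        return loss_policy_v, loss_value_v + entropy_loss_v

The two returned values match the loss_policy, loss_v unpacking used in the excerpts.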