Example #1
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    # independent training components (net, target net, buffer, ...) for each player
    result_name1, writer1, net1, tgt_net1, selector1, epsilon_tracker1, agent1, exp_source1, buffer1, optimizer1 = main.make_components(args, params, device, env, 0)
    result_name2, writer2, net2, tgt_net2, selector2, epsilon_tracker2, agent2, exp_source2, buffer2, optimizer2 = main.make_components(args, params, device, env, 1)

    frame = 0
    frame_idx1 = 0
    frame_idx2 = 0
    eval_states1 = None
    eval_states2 = None

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1, date_time + result_name1 + ".dat", 0, env) as reward_tracker1, \
            common.RewardTracker(writer2, params['stop_reward_player2'], net2, date_time + result_name2 + ".dat", 1, env) as reward_tracker2:

        # fill histories
        main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1, exp_source1, reward_tracker1, selector1, optimizer1, net1, tgt_net1, writer1, eval_states1)
        main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2, exp_source2, reward_tracker2, selector2, optimizer2, net2, tgt_net2, writer2, eval_states2)

        while True:
            # alternate training between the two players in blocks of args.units frames
            if frame // args.units % 2 == 0:
                frame_idx1 += 1
                if main.train(args, params, device, buffer1, epsilon_tracker1, frame_idx1, exp_source1, reward_tracker1, selector1, optimizer1, net1, tgt_net1, writer1, eval_states1):
                    break
            else:
                frame_idx2 += 1
                if main.train(args, params, device, buffer2, epsilon_tracker2, frame_idx2, exp_source2, reward_tracker2, selector2, optimizer2, net2, tgt_net2, writer2, eval_states2):
                    break

            frame += 1

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
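
Both players above share one environment and take turns in blocks of args.units frames, driven by the frame // args.units % 2 test. A minimal, self-contained sketch of that scheduling pattern (the units value is illustrative):

# frames 0..units-1 go to player 1, the next `units` frames to player 2, and so on
units = 4
for frame in range(4 * units):
    player = 1 if frame // units % 2 == 0 else 2
    print("frame %d -> train player %d" % (frame, player))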
Example #2
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    result_name1, writer1, net1, tgt_net1, agent1, exp_source1, buffer1, optimizer1 = main.make_components(args, params, device, env, 0)
    # frozen snapshot of net1 acts as the opponent; refreshed below every NET_SYNC frames
    net2 = ptan.agent.TargetNet(net1)
    agent2 = ptan.agent.DQNAgent(lambda x: net2.target_model.qvals(x), ptan.actions.ArgmaxActionSelector(), device=device)

    frame = 0
    frame_idx1 = 0

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer1, params['stop_reward_player1'], net1, date_time + result_name1 + ".dat", 0, env) as reward_tracker1:

        # fill history
        main.train(params, buffer1, device, frame_idx1, exp_source1, reward_tracker1, optimizer1, net1, tgt_net1, writer1)

        while True:
            if frame // args.units % 2 == 0:
                # opponent's turn: a no-op step (action -1) fetches the current
                # observation, then the frozen agent acts greedily
                state, _, _, _ = env.step((1, -1))
                action, _ = agent2([state])
                state, reward, done, _ = env.step((1, action[0]))
                if done:
                    state = env.reset()
            else:
                frame_idx1 += 1
                if main.train(params, buffer1, device, frame_idx1, exp_source1, reward_tracker1, optimizer1, net1, tgt_net1, writer1):
                    break

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break

            frame += 1
            # periodically refresh the frozen opponent with net1's latest weights
            if frame % NET_SYNC == 0:
                net2.sync()
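
Example #2 trains only net1 and lets the opponent act through a frozen copy held by ptan.agent.TargetNet, refreshed every NET_SYNC frames. A minimal sketch of that freeze-and-sync pattern (the stand-in network and interval are illustrative, not from the original):

import torch.nn as nn
import ptan

net = nn.Linear(4, 2)               # stand-in for the real DQN
frozen = ptan.agent.TargetNet(net)  # deep copy; frozen.target_model holds the snapshot

SYNC_EVERY = 1000                   # illustrative; the original uses NET_SYNC
for frame in range(1, 3001):
    # ... gradient updates to `net` would happen here ...
    if frame % SYNC_EVERY == 0:
        frozen.sync()               # copy net's current weights into the frozen opponent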
Example #3
def execute(args, params, device):
    utils.kill_game_processes()

    env = main.make_env(args, params)

    net1 = dqn_model.RainbowDQN(env.observation_space.shape,
                                env.action_space.n)
    net1.load_state_dict(
        torch.load(args.model1, map_location=lambda storage, loc: storage))

    agent1 = ptan.agent.DQNAgent(lambda x: net1.qvals(x),
                                 ptan.actions.ArgmaxActionSelector(),
                                 device=torch.device("cpu"))

    result_name = "--rainbow-scenario=" + args.scenario + "-units=" + str(args.units)
    writer1 = SummaryWriter(comment=result_name + "-player0")

    env.reset()

    total_reward1 = 0.0
    counter1 = collections.Counter()

    epsilon = 0.02
    frame_idx1 = 0

    with common.RewardTracker(writer1, 100, net1, "x.dat", 0,
                              env) as reward_tracker1:

        while True:
            frame_idx1 += 1
            # epsilon-greedy: random action with probability epsilon, otherwise
            # act greedily with the loaded Rainbow network
            if np.random.random() < epsilon:
                action = [env.action_space.sample()]
            else:
                # no-op step (action -1) just to fetch the current observation
                state, _, _, _ = env.step((0, -1))
                action, _ = agent1([state], [None])

            counter1[action[0]] += 1
            _, reward, done, _ = env.step((0, action[0]))

            total_reward1 += reward
            if done:
                reward_tracker1.reward(total_reward1, frame_idx1)
                total_reward1 = 0.0

                env.reset()

                # reload the checkpoint between episodes so an updated model is picked up
                net1.load_state_dict(
                    torch.load(args.model1,
                               map_location=lambda storage, loc: storage))

            if args.maxFrames > 0 and frame_idx1 > args.maxFrames:
                break
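
Example #3 evaluates a frozen Rainbow checkpoint but keeps a small fixed epsilon = 0.02, so the policy still takes the occasional random action. The selection rule in isolation (the action count is illustrative):

import numpy as np

epsilon = 0.02
n_actions = 6  # illustrative

def select_action(greedy_action):
    # with probability epsilon pick a uniform random action,
    # otherwise keep the greedy choice from the network
    if np.random.random() < epsilon:
        return np.random.randint(n_actions)
    return greedy_action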
Example #4
def execute(args, params, device):
    utils.kill_game_processes()
    env = main.make_env(args, params)

    result_name, writer, net, tgt_net, agent, exp_source, buffer, optimizer = main.make_components(
        args, params, device, env, 0)

    frame_idx = 0

    date_time = datetime.now().strftime("%b%d_%H-%M-%S")
    with common.RewardTracker(writer, params['stop_reward_player1'], net,
                              date_time + result_name + ".dat", 0,
                              env) as reward_tracker:
        while True:
            frame_idx += 1
            if main.train(params, buffer, device, frame_idx, exp_source,
                          reward_tracker, optimizer, net, tgt_net, writer):
                break

            if args.maxFrames > 0 and frame_idx > args.maxFrames:
                break
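
All four examples log metrics through a SummaryWriter, created directly in Example #3 and inside main.make_components elsewhere. A minimal sketch, assuming the standard torch.utils.tensorboard API (the original may use tensorboardX, whose constructor takes the same comment argument):

from torch.utils.tensorboard import SummaryWriter

# `comment` is appended to the auto-generated runs/<timestamp> directory name
writer = SummaryWriter(comment="-rainbow-scenario=basic-units=4")
writer.add_scalar("reward", 1.0, global_step=0)
writer.close()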