def policy_iteration(env, policy, epsilon):
    q = init_state_action_map(env)
    visits_map = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, policy)
        on_policy_evaluation(episode, q, visits_map)
        epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon)
    return q
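The evaluation and improvement helpers are not listed on this page. A minimal sketch of what on_policy_evaluation and epsilon_greedy_policy_improvement might do for every-visit Monte Carlo control, assuming episodes are lists of (state, action, reward) tuples and that q, visits_map and policy are nested dicts keyed by state and action:

def on_policy_evaluation(episode, q, visits_map, gamma=1.0):
    # Incremental every-visit Monte Carlo: average the observed returns
    # for each (state, action) pair using the running visit counts.
    g = 0.0
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        visits_map[state][action] += 1
        q[state][action] += (g - q[state][action]) / visits_map[state][action]

def epsilon_greedy_policy_improvement(env, episode, q, policy, epsilon):
    # Make the policy epsilon-greedy w.r.t. the current q estimates
    # in every state visited during the episode.
    for state, _, _ in episode:
        actions = list(q[state])
        best = max(actions, key=lambda a: q[state][a])
        for a in actions:
            policy[state][a] = epsilon / len(actions)
        policy[state][best] += 1.0 - epsilon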
Example n. 2
def policy_iteration(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
        greedy_stochastic_policy_improvement(env, episode, q, target_policy)
    return q
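off_policy_evaluation presumably implements weighted importance sampling, with c accumulating the importance weights, as in Sutton and Barto's off-policy Monte Carlo prediction. A hedged sketch under the same data-structure assumptions as above:

def off_policy_evaluation(episode, q, c, target_policy, behavior_policy,
                          gamma=1.0):
    # Every-visit weighted importance sampling, walking the episode backwards.
    g = 0.0  # return from step t to the end of the episode
    w = 1.0  # importance-sampling ratio for the tail of the episode
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        c[state][action] += w
        q[state][action] += (w / c[state][action]) * (g - q[state][action])
        w *= target_policy[state][action] / behavior_policy[state][action]
        if w == 0.0:
            break  # the target policy never takes this action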
Example n. 3
def main():
    env = Blackjack()
    policy = init_policy(env)
    v = init_state_map(env)
    visits_map = init_state_map(env)
    for _ in xrange(20000):
        episode = generate_episode(env, policy)
        on_policy_state_evaluation(episode, v, visits_map)
    env.visualize_state_value(v)
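on_policy_state_evaluation would be the state-value analogue of the evaluation routine above. A minimal sketch, assuming v and visits_map map states to running estimates and visit counts:

def on_policy_state_evaluation(episode, v, visits_map, gamma=1.0):
    # Incremental every-visit Monte Carlo estimate of the state value v(s).
    g = 0.0
    for state, _, reward in reversed(episode):
        g = gamma * g + reward
        visits_map[state] += 1
        v[state] += (g - v[state]) / visits_map[state]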
Example n. 4
def main():
    env = Blackjack()
    target_policy = init_policy(env)
    behavior_policy = init_equiprobable_random_policy(env)
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        off_policy_evaluation(episode, q, c, target_policy, behavior_policy)
    env.visualize_action_value(q)
Example n. 5
def play_from_file(args, filename):
    model = ForwardModel(input_shape=13, n_actions=7)
    model.load_state_dict(torch.load(filename))

    team_red = [PGAgent(2, "red", model), PGAgent(3, "red", model)]
    team_blue = [Agent(0, "blue"), Agent(1, "blue")]

    agents = team_blue + team_red

    env = Environment(agents)
    _ = generate_episode(env, args, render=True)
Example n. 6
def policy_iteration2(env, target_policy, behavior_policy):
    q = init_state_action_map(env)
    c = init_state_action_map(env)
    for _ in range(20000):
        episode = generate_episode(env, behavior_policy)
        fine_grained_off_policy_iteration(episode,
                                          q,
                                          c,
                                          target_policy,
                                          behavior_policy,
                                          gamma=1)
    return q
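fine_grained_off_policy_iteration apparently fuses evaluation and improvement into one backwards pass, i.e. off-policy Monte Carlo control with a greedy target policy. A hedged sketch, again assuming nested dicts and (state, action, reward) tuples:

def fine_grained_off_policy_iteration(episode, q, c, target_policy,
                                      behavior_policy, gamma=1.0):
    # Off-policy MC control: weighted importance sampling plus a greedy
    # improvement of the target policy after every q update.
    g, w = 0.0, 1.0
    for state, action, reward in reversed(episode):
        g = gamma * g + reward
        c[state][action] += w
        q[state][action] += (w / c[state][action]) * (g - q[state][action])
        greedy = max(q[state], key=q[state].get)
        for a in target_policy[state]:
            target_policy[state][a] = 1.0 if a == greedy else 0.0
        if action != greedy:
            break  # earlier steps have zero weight under the greedy target
        w /= behavior_policy[state][action]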
Example n. 7
def test_transferability(args, filename):

    team_blue = [Agent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        PGAgent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]
    agents = team_blue + team_red

    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies
    model = ForwardModel(input_shape=args.n_inputs, n_actions=args.n_actions)
    model.load_state_dict(torch.load(args.path + filename))
    model.eval()
    for agent in team_red:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    for step_idx in range(40):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode))
            ex.log_scalar('reward', reward)
            ex.log_scalar(f'win_blue', int(episode[-1].rewards["blue"] == 1))
            ex.log_scalar(f'win_red', int(episode[-1].rewards["red"] == 1))

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0
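test_transferability only needs a handful of fields on args; a hedged usage sketch built from the attributes the function actually reads (the defaults and the model filename below are placeholders):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--n_friends', type=int, default=2)
parser.add_argument('--n_enemies', type=int, default=2)
parser.add_argument('--env_type', choices=['normal', 'restricted'],
                    default='normal')
parser.add_argument('--n_episodes_per_step', type=int, default=10)
parser.add_argument('--path', default='models/')
args = parser.parse_args()

test_transferability(args, 'RUN_example.torch')  # placeholder filename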
Example n. 8
def train(args):
    team_blue = [IQLAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(idx + args.n_friends, "red") for idx in range(args.n_enemies)
    ]

    training_agents = team_blue

    agents = team_blue + team_red
    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies  # 6 fixed actions + 1 aim action per enemy
    args.n_inputs = (4 + 3 * (args.n_friends - 1) +
                     3 * args.n_enemies)  # see process function in models.py
    models = generate_models(args.n_inputs, args.n_actions)
    for agent in training_agents:
        agent.set_model(models)

    buffer = ReplayBuffer(size=args.buffer_size)
    epi_len, nwins = 0, 0

    ex.log_scalar('win', 0.0, step=0)  # forces the run to start at 0 wins
    for step_idx in range(args.n_steps):
        episode = generate_episode(env)
        buffer.insert_list(episode)
        if not buffer.can_sample(args.batch_size):
            continue

        epi_len += len(episode)
        reward = episode[-1].rewards["blue"]
        if episode[-1].rewards["blue"] == 1:
            nwins += 1
        batch = buffer.sample(args.batch_size)
        for agent in training_agents:
            loss = agent.update(batch)
            if step_idx > 0 and step_idx % args.sync_interval == 0:
                # TODO: same models get synced for all agents; to be corrected
                agent.sync_models()

            ex.log_scalar(f'loss{agent.id}', loss, step=step_idx)
            ex.log_scalar(f'epsilon', agent.scheduler(), step=step_idx)

        if step_idx > 0 and step_idx % PRINT_INTERVAL == 0:
            s = f"Step {step_idx}: loss: {loss:8.4f} - "
            s += f"Average length: {epi_len/PRINT_INTERVAL:5.2f} - "
            s += f"win ratio: {nwins/PRINT_INTERVAL:4.3f} - "
            s += f"epsilon: {agent.scheduler():4.3f} - "
            print(s)
            epi_len, nwins = 0, 0
            #_ = generate_episode(env, render=True)

        ex.log_scalar(f'length', len(episode), step=step_idx + 1)
        ex.log_scalar(f'win',
                      int(episode[-1].rewards["blue"] == 1),
                      step=step_idx + 1)
        ex.log_scalar(f'reward', reward, step=step_idx + 1)

    from os.path import expanduser
    home = expanduser("~")
    #for agent in training_agents:
    #    agent.save(home+args.path+f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(models["model"].state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
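The ReplayBuffer above is only used through insert_list, can_sample and sample. A minimal sketch of that interface, assuming a plain FIFO buffer with uniform random sampling:

import random
from collections import deque

class ReplayBuffer:
    """Fixed-size FIFO store of transitions with uniform random sampling."""

    def __init__(self, size):
        self.contents = deque(maxlen=size)

    def insert_list(self, items):
        # Append a whole episode (a list of transitions) at once.
        self.contents.extend(items)

    def can_sample(self, n):
        return len(self.contents) >= n

    def sample(self, n):
        return random.sample(self.contents, n)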
Example n. 9
def train(args):
    team_blue = [PGAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]

    training_agents = team_blue

    agents = team_blue + team_red
    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = RestrictedEnvironment(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = (4 + 3 * (args.n_friends - 1) +
                     3 * args.n_enemies + args.n_enemies)

    # setup model
    if args.model == 'FORWARD':
        model = ForwardModel(input_shape=args.n_inputs,
                             n_actions=args.n_actions)
    elif args.model == 'RNN':
        model = RNNModel(input_shape=args.n_inputs,
                         n_actions=args.n_actions,
                         args=args)

    for agent in training_agents:
        agent.set_model(model)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0,
                  step=n_episodes + 1)  # forces the run to start at 0 wins
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env, args)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar(f'win',
                          int(episode[-1].rewards["blue"] == 1),
                          step=n_episodes + 1)

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'loss{agent.id}', stats["loss"], step=n_episodes)
            ex.log_scalar(f'grads{agent.id}',
                          stats["grads_l2"],
                          step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}',
                          stats["grads_var"],
                          step=n_episodes)

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

        #_ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    #for agent in training_agents:
    #    agent.save(home+args.path+f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(model.state_dict(),
               home + args.path + f'RUN_{get_run_id()}.torch')
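agent.update(batch) for the PGAgent is not shown here. A hedged REINFORCE-style sketch of what such an update might compute, treating the batch as a single episode for simplicity; the observations/actions/rewards fields on each experience, the optimizer, and the assumption that the model returns action logits are all placeholders:

import torch
import torch.nn.functional as F

def reinforce_update(model, optimizer, batch, agent_id, gamma=0.99):
    # Plain REINFORCE: minimize -sum_t log pi(a_t | s_t) * G_t.
    returns, g = [], 0.0
    for exp in reversed(batch):
        g = exp.rewards["blue"] + gamma * g
        returns.append(g)
    returns = torch.tensor(list(reversed(returns)), dtype=torch.float32)

    states = torch.stack([torch.as_tensor(exp.observations[agent_id],
                                          dtype=torch.float32)
                          for exp in batch])
    actions = torch.tensor([exp.actions[agent_id] for exp in batch])

    log_probs = F.log_softmax(model(states), dim=-1)   # model returns logits
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)
    loss = -(chosen * returns).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return {"loss": loss.item()}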
Example n. 10
def train():
    team_blue = [IACAgent(idx, "blue") for idx in range(args.n_friends)]
    team_red = [
        Agent(args.n_friends + idx, "red") for idx in range(args.n_enemies)
    ]

    training_agents = team_blue

    agents = team_blue + team_red
    if args.env_type == 'normal':
        env = Environment(agents, args)
    elif args.env_type == 'restricted':
        env = Environment2(agents, args)

    args.n_actions = 6 + args.n_enemies
    args.n_inputs = 4 + 3 * (args.n_friends - 1) + 3 * args.n_enemies

    # setup model
    models = generate_model(input_shape=args.n_inputs,
                            n_actions=args.n_actions)

    for agent in training_agents:
        agent.set_models(models)

    epi_len, nwins = 0, 0
    n_episodes = 0
    ex.log_scalar('win', 0.0, step=0)  # forces the run to start at 0 wins
    for step_idx in range(int(args.n_steps / args.n_episodes_per_step)):
        batch = []
        for _ in range(args.n_episodes_per_step):
            episode = generate_episode(env)
            n_episodes += 1
            batch.extend(episode)

            epi_len += len(episode)
            reward = episode[-1].rewards["blue"]

            ex.log_scalar('length', len(episode), step=n_episodes)
            ex.log_scalar('reward', reward, step=n_episodes)
            ex.log_scalar(f'win',
                          int(episode[-1].rewards["blue"] == 1),
                          step=n_episodes + 1)

            if episode[-1].rewards["blue"] == 1:
                nwins += 1

        for agent in training_agents:
            stats = agent.update(batch)
            ex.log_scalar(f'policy_loss{agent.id}',
                          stats['policy_loss'],
                          step=n_episodes)
            ex.log_scalar(f'value_loss{agent.id}',
                          stats['value_loss'],
                          step=n_episodes)
            ex.log_scalar(f'loss{agent.id}', stats['loss'], step=n_episodes)
            ex.log_scalar(f'entropy{agent.id}',
                          stats['entropy'],
                          step=n_episodes)
            ex.log_scalar(f'grads{agent.id}',
                          stats["grads_l2"],
                          step=n_episodes)
            ex.log_scalar(f'grads_var{agent.id}',
                          stats["grads_var"],
                          step=n_episodes)

            if step_idx % 50 == 0:  # could use args.sync_interval instead of the hard-coded 50
                agent.sync_models()
                print(f'sync at {step_idx * args.n_episodes_per_step}')

        s = f"Step {step_idx}: "
        s += f"Average length: {epi_len/args.n_episodes_per_step:5.2f} - "
        s += f"win ratio: {nwins/args.n_episodes_per_step:4.3f} - "
        print(s)
        epi_len, nwins = 0, 0

        #_ = generate_episode(env, render=True)

    from os.path import expanduser
    home = expanduser("~")
    #for agent in training_agents:
    #    agent.save(home+args.path+f'RUN_{get_run_id()}_AGENT{agent.id}.p')
    torch.save(models["model"].state_dict(),
               home + args.path + f'RUN_{get_run_id()}_MODEL.torch')