def run():
    env = envs.make(args.env_name)

    flag_is_train = args.flag_is_train  # flag_is_train = 1: one agent trains while the other only acts; flag_is_train = 0: both agents only act (train_agent_name decides whose stats are reported)
    flag_focus_blue = args.flag_focus_blue  # flag_focus_blue = 1: train agent_blue; flag_focus_blue = 0: train agent_red

    if flag_focus_blue:
        train_agent_name = 'blue'
        red_agent = DQN(env.state_dim,
                        env.action_dim,
                        is_train=False,
                        scope='red')
        blue_agent = DQN(env.state_dim,
                         env.action_dim,
                         is_train=flag_is_train,
                         scope='blue')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, blue_agent, red_agent, train_agent_name)
    else:
        train_agent_name = 'red'
        blue_agent = DQN(env.state_dim,
                         env.action_dim,
                         is_train=False,
                         scope='blue')
        red_agent = DQN(env.state_dim,
                        env.action_dim,
                        is_train=flag_is_train,
                        scope='red')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, red_agent, blue_agent, train_agent_name)
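A minimal invocation sketch for the flag scheme described in the comments above. The argparse wiring is an assumption for illustration; only the attribute names (env_name, flag_is_train, flag_focus_blue) come from the snippet:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env_name', type=str, default='air_combat')  # illustrative default
parser.add_argument('--flag_is_train', type=int, default=1)        # 1: train one agent, 0: evaluate both
parser.add_argument('--flag_focus_blue', type=int, default=1)      # 1: blue is the trained agent
args = parser.parse_args()

run()  # with both flags at 1, blue trains against a frozen red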
    def __init__(self, n_actions, epsilon=1.0):
        # TODO: n_actions is not too big (only 11), but DQN and DRQN use different values
        # TODO: attack, move left, move right, aml, amr, tl, tr, f, b, af, ab
        # TODO: the weapon-change strategy is hard-coded and needs to be replaced
        self._dqn = DQN('deathmatch', n_actions,
                        epsilon)  # TODO: shouldn't be 2 ** n_actions
        self._drqn = DRQN('deathmatch', n_actions, epsilon)
Example #3
def creat_n_agent(unit_list, is_train, scope, sess):
    # Note: the is_train argument is accepted but not forwarded to DQN here.
    agent_list = []
    for unit in unit_list:
        new_agent = DQN(unit.state_dim, unit.action_dim,
                        scope + str(unit.number), sess)
        agent_list.append(new_agent)
    return agent_list
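A hypothetical usage sketch for the helper above. The Unit class and the TensorFlow session are assumptions, not part of the original example; they only exercise the attributes the helper reads:

import tensorflow as tf

class Unit:
    # hypothetical stand-in exposing state_dim, action_dim, and number
    def __init__(self, number, state_dim, action_dim):
        self.number = number
        self.state_dim = state_dim
        self.action_dim = action_dim

with tf.Session() as sess:  # TF1-style session, matching the sess argument above
    units = [Unit(i, state_dim=12, action_dim=4) for i in range(3)]
    agents = creat_n_agent(units, is_train=True, scope='blue_', sess=sess)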
Example #4
    def get_initial_policy_net(LINEAR_INPUT_SCALAR=8, KERNEL=5):
        env = gym.make('gvgai-zelda-lvl0-v0')
        init_screen = get_screen(env, device)

        _, _, screen_height, screen_width = init_screen.shape
        n_actions = env.action_space.n

        init_model = [
            screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
        ]
        policy_net = DQN(*init_model).to(device)
        return policy_net, init_model
Example #5
def get_initial_policy_net(level='gvgai-zelda-lvl0-v0',
                           LINEAR_INPUT_SCALAR=8,
                           KERNEL=5,
                           env_maker=None):
    if env_maker:
        env = env_maker(level)
    else:
        import gym_gvgai
        env = gym.make(level)

    device = find_device()
    init_screen = get_screen(env, device)

    _, _, screen_height, screen_width = init_screen.shape
    n_actions = env.action_space.n

    init_model = [
        screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
    ]
    policy_net = DQN(*init_model).to(device)
    return policy_net, init_model
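A short usage sketch, assuming the gym-gvgai package from the snippet is installed; the prints only inspect what the helper returns:

policy_net, init_model = get_initial_policy_net(level='gvgai-zelda-lvl0-v0')
print(init_model)  # [screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions]
print(policy_net)  # the freshly constructed DQN, already moved to the detected device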
Example #6
    def train(self):
        config = self.config
        torch.manual_seed(config["seed"])

        env = UnityEnvWrapper(UnityEnvironment(file_name=config["env_path"]))
        env.reset()

        agent = DQN(config, env.state_dim, env.action_dim)

        # Epsilon parameters
        eps_start = config["eps_start"]
        eps_end = config["eps_end"]
        eps_decay = config["eps_decay"]

        scores = []
        scores_window = deque(maxlen=100)
        eps = eps_start
        time_start = time.time()
        for i_ep in range(1, config["n_episodes"] + 1):
            state = env.reset()
            score = 0
            while True:
                action = agent.act(state, eps)
                next_state, reward, done = env.step(action)
                agent.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)
            scores.append(score)
            eps = max(eps_end, eps_decay * eps)
            mean_score = np.mean(scores_window)
            print(f'\rEpisode {i_ep}\tAverage Score: {mean_score:.2f}', end="")
            if i_ep % 100 == 0:
                print(f'\rEpisode {i_ep}\tAverage Score: {mean_score:.2f}')
                agent.save("saved_models/model")

        time_elapsed = time.time() - time_start
        print(f"Training took: {time_elapsed / 3600:.2f} hours")

        return scores
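The loop above decays epsilon geometrically, so after t episodes eps = max(eps_end, eps_start * eps_decay ** t). A quick sketch for estimating when exploration hits its floor, using illustrative values rather than the real config entries:

import math

eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995  # illustrative values
episodes_to_floor = math.ceil(math.log(eps_end / eps_start) / math.log(eps_decay))
print(episodes_to_floor)  # about 919 episodes before eps stops decaying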
Example #7
def runner(env_name, memory_bank_size, batch_size, gamma, learning_rate,
           epsilon, epsilon_min, loss, n_episodes, ma_threshold, args):
    # Initialize environment
    env = gym.make(env_name)
    nS = env.observation_space.shape[0]
    nA = env.action_space.n

    # Initialize memory bank and model
    memory_bank = MemoryBank(memory_bank_size)

    if args.model == 'dqn_plain':
        model = DQNPlain(nS, nA, [64], gamma, learning_rate, epsilon,
                         epsilon_min, loss)
    elif args.model == 'dqn':
        model = DQN(nS, nA, [64], gamma, learning_rate, epsilon, epsilon_min,
                    1000, loss)
    elif args.model == 'ddqn':
        model = DDQN(nS, nA, [64], gamma, learning_rate, epsilon, epsilon_min,
                     1000, loss)
    else:
        raise ValueError('Unknown model: {}'.format(args.model))

    # Initialize logging variables
    reward_list = deque()
    current_index = 0
    train_log = deque()

    for episode in range(n_episodes):
        state = env.reset()
        done = False
        steps = 0
        total_reward = 0

        while not done:
            action, e = model.take_action(state, episode)
            new_state, reward, done, info = env.step(action)
            memory_bank.add(state, action, reward, new_state, done)

            state = new_state

            if current_index > memory_bank_size:
                # Get minibatch
                minibatch = memory_bank.get_mini_batch(batch_size)

                # Train on minibatch
                model.train_minibatch(minibatch)

            steps += 1
            total_reward += int(reward)
            current_index += 1

            if (args.render_env == 'y') and (episode % args.render_freq == 0):
                env.render()

        reward_list.append(total_reward)
        moving_average = np.mean(reward_list)
        if len(reward_list) > 100:
            reward_list.popleft()

        train_log.append((episode, steps, e, total_reward, moving_average))
        logger.info(
            'Ep: {} | Steps: {} | epsilon: {:.3f} | reward: {} | moving average: {:.2f}'
            .format(episode, steps, e, total_reward, moving_average))
        if moving_average > ma_threshold:
            break

    # Save log and model weights
    train_df = pd.DataFrame(data=list(train_log),
                            columns=[
                                'episode', 'steps', 'epsilon', 'total_reward',
                                'moving_average'
                            ])
    train_df.to_csv('./logs/{}_{}_log.csv'.format(env_name, args.model),
                    index=False)

    # Save memory bank and weights
    memory_bank.save_memory('./logs/{}_memory_bank'.format(env_name))
    model.save_model('./logs/{}_{}_weights'.format(env_name, args.model))
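A follow-up sketch for inspecting the CSV log the runner writes; pandas (pd) is already used above, and the file name follows the snippet's pattern with illustrative env/model values:

log = pd.read_csv('./logs/CartPole-v1_dqn_log.csv')  # illustrative names
print(log[['episode', 'epsilon', 'moving_average']].tail())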
Example #8
    parser.add_argument('--task', type=str, default='pong')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--evaluate', type=str, default=None)
    parser.add_argument('--resume', type=str, default=None, nargs=2)
    args = parser.parse_args()
    params = HPS[args.task]

    device = torch.device('cpu') if args.cpu else torch.device('cuda')

    env = make_env(params.env_name)
    obs_shape = env.observation_space.shape
    nb_actions = env.action_space.n

    if params.net_type == 'conv':
        net = DQN((params.frame_stack, *obs_shape), nb_actions)
    elif params.net_type == 'linear':
        net = DQNLinear(obs_shape, nb_actions)
    agent = DQNAgent(net=net,
                     nb_actions=nb_actions,
                     gamma=params.gamma,
                     unroll_steps=params.unroll_steps,
                     device=device)

    if args.evaluate:
        agent.net.load_state_dict(torch.load(args.evaluate))
        env = make_env(params.env_name, episodic=False)
        evaluate(agent, env, render=args.render)
        exit()

    if args.resume:
Example #9
        from pyglet.window import key

        def key_press(k, mod):
            global restart
            global a
            if k == key.R: restart = True
            if k == key.UP: a = 0
            if k == key.DOWN: a = 1
            if k == key.LEFT: a = 2
            if k == key.RIGHT: a = 3

        env.render()
        env.viewer.window.on_key_press = key_press
    else:
        size = (args.dim + 2) * args.zoom
        model = DQN(size, size, batch_norm=True)
        model.load_state_dict(torch.load(args.filename))
        policy = PurePolicy(model)
    try:
        while True:
            state = env.reset()
            total_reward = 0.0
            steps = 0
            restart = False
            while True:
                pyglet.clock.tick()
                if (policy is not None):
                    state_ten = tensorize(state)
                    a = policy.get(state_ten)
                state, r, done, info = env.step(a)
                total_reward += r
Example #10
        ep_scores.append(score)

    print("Scores: ", ep_scores)
    return np.mean(ep_scores)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.yml",
                        help="Path to the config file.")
    parser.add_argument("--n", type=int, default=10,
                        help="Number of times to evaluate model.")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    config = load_config(args.config)

    path = "/Users/igor/Downloads/Banana.app"
    env = UnityEnvWrapper(UnityEnvironment(file_name=path))
    env.reset()

    config["device"] = "cpu"
    agent = DQN(config, state_size=env.state_dim, action_size=env.action_dim)
    agent.load("saved_models/model", "cpu")

    eval_score = evaluate(env, agent, n_episodes=args.n)
    print(f"Eval score: {eval_score:5.3f}")
Example #11
lvl = 7

env = gym.make(f'{game}-lvl{lvl}-v0')
env.reset()

device = find_device()
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape
n_actions = env.action_space.n
LINEAR_INPUT_SCALAR = 8
KERNEL = 5
init_model = [
    screen_height, screen_width, LINEAR_INPUT_SCALAR, KERNEL, n_actions
]
win_factor = 100
model = DQN(*init_model)
model.load_state_dict(torch.load('saved_models/torch_model_0-1-1-1-1-1'))

current_screen = get_screen(env, device)
state = current_screen

stop_after = 1000

sum_score = 0
won = 0
key_found = 0

for lvl in range(7, 8):
    level_name = f'{game}-lvl{lvl}-v0'
    print(level_name)
    env = gym.make(level_name)
Example #12
def run():
    env = envs.make(args.env_name)

    blue_agent = DQN(env.state_dim, env.action_dim, is_train=1, scope='blue')
    red_agent = DQN(env.state_dim, env.action_dim, is_train=1, scope='red')
    run_NFSP(env, blue_agent, red_agent)