Example #1
File: imghash.py  Project: sxhylkl/Atlas
def main(uri, database):
    # Connect to MongoDB and schedule a job for each category assigned to this machine.
    client = MongoClient(uri)
    db = client[database]
    init(db, OFFSET, PATH)
    categories = db['category_info'].find({'machine_id': MACHINE_ID}, {'name_en': 1, '_id': 0})
    for category in categories:
        scheduler(db, category=category['name_en'])
    client.close()
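
A minimal sketch of how this entry point might be wired up from the command line; the argparse flags and default URI below are illustrative and not taken from the original project.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Schedule image-hash jobs for each category.')
    parser.add_argument('--uri', default='mongodb://localhost:27017',
                        help='MongoDB connection URI (example value)')
    parser.add_argument('--database', required=True,
                        help='name of the database holding category_info')
    args = parser.parse_args()
    main(args.uri, args.database)
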
Example #2
def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(map_name="DefeatZerglingsAndBanelings",
                        step_mul=step_mul,
                        visualize=True,
                        game_steps_per_episode=steps * step_mul) as env:

        checkpoint_path = 'models/deepq/checkpoint.pth.tar'
        dqn = DQN()
        dqn, saved_mean_reward = load_checkpoint(dqn, filename=checkpoint_path)
        while True:
            episode_rewards = [0.0]
            obs = env.reset()

            done = False
            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

            screen = player_relative
            obs, xy_per_marine = common.init(env, obs)

            group_id = 0
            reset = True
            obs, screen, player = common.select_marine(env, obs)
            # step_result = env.step(actions=[
            #     sc2_actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
            # ])

            while not done:

                obs, screen, player = common.select_marine(env, obs)
                action = dqn.choose_action(np.array(screen)[None])[0]
                reset = False
                rew = 0

                new_action = None

                obs, new_action = common.marine_action(env, obs, player,
                                                       action)
                army_count = env._obs[0].observation.player_common.army_count
                try:
                    if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[
                            "available_actions"]:
                        obs = env.step(actions=new_action)
                    else:
                        new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                        obs = env.step(actions=new_action)
                except Exception:
                    # Ignore step errors (e.g. an action that is no longer available).
                    pass

                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]
                new_screen = player_relative

                rew += obs[0].reward

                done = obs[0].step_type == environment.StepType.LAST

                selected = obs[0].observation["screen"][_SELECTED]
                player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

                if len(player_y) > 0:
                    player = [int(player_x.mean()), int(player_y.mean())]

                if len(player) == 2:

                    if player[0] > 32:
                        new_screen = common.shift(LEFT, player[0] - 32,
                                                  new_screen)
                    elif player[0] < 32:
                        new_screen = common.shift(RIGHT, 32 - player[0],
                                                  new_screen)

                    if player[1] > 32:
                        new_screen = common.shift(UP, player[1] - 32,
                                                  new_screen)
                    elif player[1] < 32:
                        new_screen = common.shift(DOWN, 32 - player[1],
                                                  new_screen)

                # Store transition in the replay buffer.
                screen = new_screen

                episode_rewards[-1] += rew
                reward = episode_rewards[-1]
                mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            print("Episode reward", mean_100ep_reward)
Example #3
def worker(remote, map_name, nscripts, i):
    import sys
    from absl import flags
    FLAGS = flags.FLAGS
    try:
        FLAGS(sys.argv)
    except Exception:
        # FLAGS may already have been parsed in the parent process.
        pass

    with sc2_env.SC2Env(map_name=map_name,
                        step_mul=2,
                        screen_size_px=(32, 32),
                        minimap_size_px=(32, 32)) as env:
        available_actions = []
        result = None
        group_list = []
        xy_per_marine = {}
        while True:
            cmd, data = remote.recv()
            if cmd == 'step':
                reward = 0

                if len(group_list) == 0 or common.check_group_list(
                        env, result):
                    print("init group list")
                    result, xy_per_marine = common.init(env, result)
                    group_list = common.update_group_list(result)

                action1 = data[0][0]
                action2 = data[0][1]
                # func = actions.FUNCTIONS[action1[0]]
                # print("agent(",i," ) action : ", action1, " func : ", func)
                func = actions.FUNCTIONS[action2[0]]
                # print("agent(",i," ) action : ", action2, " func : ", func)

                result = env.step(actions=[action1])
                reward += result[0].reward
                done = result[0].step_type == environment.StepType.LAST

                move = True

                if len(action2[1]) == 2:
                    x, y = action2[1][1]
                    # print("x, y:", x, y)

                    # if x == 0 and y == 0:
                    #   move = False

                # 331 is the id of pysc2's Move_screen function.
                if 331 in available_actions and move and not done:
                    try:
                        result = env.step(actions=[action2])
                        reward += result[0].reward
                        done = result[0].step_type == environment.StepType.LAST
                    except Exception as e:
                        print("e :", e)

                ob = (result[0].observation["screen"]
                      [_PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(int)

                #  (1, 32, 32)
                selected = result[0].observation["screen"][
                    _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
                # extra = np.zeros((1, 32, 32))
                control_groups = result[0].observation["control_groups"]
                army_count = env._obs[0].observation.player_common.army_count

                available_actions = result[0].observation["available_actions"]
                info = result[0].observation["available_actions"]
                if done:
                    result = env.reset()

                    if len(group_list) == 0 or common.check_group_list(
                            env, result):
                        # print("init group list")
                        result, xy_per_marine = common.init(env, result)
                        group_list = common.update_group_list(result)

                    info = result[0].observation["available_actions"]

                if len(action1[1]) == 2:

                    group_id = action1[1][1][0]

                    player_y, player_x = (result[0].observation["screen"]
                                          [_SELECTED] == 1).nonzero()

                    if len(player_x) > 0:
                        if (group_id == 1):
                            xy_per_marine["1"] = [
                                int(player_x.mean()),
                                int(player_y.mean())
                            ]
                        else:
                            xy_per_marine["0"] = [
                                int(player_x.mean()),
                                int(player_y.mean())
                            ]

                remote.send((ob, reward, done, info, army_count,
                             control_groups, selected, xy_per_marine))

            elif cmd == 'reset':
                result = env.reset()
                reward = 0

                if len(group_list) == 0 or common.check_group_list(
                        env, result):
                    # print("init group list")
                    result, xy_per_marine = common.init(env, result)
                    group_list = common.update_group_list(result)

                reward += result[0].reward
                ob = (result[0].observation["screen"]
                      [_PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(int)
                selected = result[0].observation["screen"][
                    _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
                # extra = np.zeros((1, 32, 32))
                control_groups = result[0].observation["control_groups"]
                army_count = env._obs[0].observation.player_common.army_count

                done = result[0].step_type == environment.StepType.LAST
                info = result[0].observation["available_actions"]
                available_actions = result[0].observation["available_actions"]
                remote.send((ob, reward, done, info, army_count,
                             control_groups, selected, xy_per_marine))
            elif cmd == 'close':
                remote.close()
                break
            elif cmd == 'get_spaces':
                remote.send((env.action_spec().functions[data], ""))
            elif cmd == "action_spec":
                remote.send((env.action_spec().functions[data]))
            else:
                raise NotImplementedError
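
The worker above is driven entirely over a multiprocessing Pipe: it dispatches on 'step', 'reset', 'close', 'get_spaces', and 'action_spec' commands and answers with an 8-tuple of observations. A hedged sketch of the parent-side wiring; the extra Process arguments (nscripts=2, i=0) are illustrative.

from multiprocessing import Pipe, Process

parent_remote, child_remote = Pipe()
proc = Process(target=worker,
               args=(child_remote, "DefeatZerglingsAndBanelings", 2, 0))
proc.daemon = True
proc.start()

# Reset first so the worker initializes its control groups before stepping.
parent_remote.send(('reset', None))
(ob, reward, done, info, army_count,
 control_groups, selected, xy_per_marine) = parent_remote.recv()

# For 'step', the payload is a list whose first element is an (action1, action2)
# pair of pysc2 function calls, matching data[0][0] / data[0][1] in the worker.
# parent_remote.send(('step', [(action1, action2)]))

parent_remote.send(('close', None))
proc.join()
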
Example #4
def learn(env,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16):
    torch.set_num_threads(num_cpu)
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)
    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    dqn = DQN(num_actions, lr, cuda)

    print('\nCollecting experience...')
    checkpoint_path = 'models/deepq/checkpoint.pth.tar'
    if os.path.exists(checkpoint_path):
        dqn, saved_mean_reward = load_checkpoint(dqn, cuda, filename=checkpoint_path)
    for t in range(max_timesteps):
        # Take action and update exploration to the newest value
        # custom process for DefeatZerglingsAndBanelings
        obs, screen, player = common.select_marine(env, obs)
        # action = act(
        #     np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
        action = dqn.choose_action(np.array(screen)[None])
        reset = False
        rew = 0
        new_action = None
        obs, new_action = common.marine_action(env, obs, player, action)
        army_count = env._obs[0].observation.player_common.army_count
        try:
            if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                obs = env.step(actions=new_action)
            else:
                new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                obs = env.step(actions=new_action)
        except Exception:
            # Ignore step errors (e.g. an action that is no longer available).
            pass
        player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
        new_screen = player_relative
        rew += obs[0].reward
        done = obs[0].step_type == environment.StepType.LAST
        selected = obs[0].observation["screen"][_SELECTED]
        player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
        if len(player_y) > 0:
            player = [int(player_x.mean()), int(player_y.mean())]
        if len(player) == 2:
            if player[0] > 32:
                new_screen = common.shift(LEFT, player[0] - 32, new_screen)
            elif player[0] < 32:
                new_screen = common.shift(RIGHT, 32 - player[0],
                                          new_screen)
            if player[1] > 32:
                new_screen = common.shift(UP, player[1] - 32, new_screen)
            elif player[1] < 32:
                new_screen = common.shift(DOWN, 32 - player[1], new_screen)
        # Store transition in the replay buffer.
        replay_buffer.add(screen, action, rew, new_screen, float(done))
        screen = new_screen
        episode_rewards[-1] += rew
        reward = episode_rewards[-1]
        if done:
            print("Episode Reward : %s" % episode_rewards[-1])
            obs = env.reset()
            player_relative = obs[0].observation["screen"][
                _PLAYER_RELATIVE]
            screen = player_relative
            group_list = common.init(env, obs)
            # Select all marines first
            # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
            episode_rewards.append(0.0)
            reset = True

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
            if prioritized_replay:
                experience = replay_buffer.sample(
                    batch_size, beta=beta_schedule.value(t))
                (obses_t, actions, rewards, obses_tp1, dones, weights,
                 batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                    batch_size)
                weights, batch_idxes = np.ones_like(rewards), None

            td_errors = dqn.learn(obses_t, actions, rewards, obses_tp1, gamma, batch_size)

            if prioritized_replay:
                new_priorities = np.abs(td_errors) + prioritized_replay_eps
                replay_buffer.update_priorities(batch_idxes,
                                                new_priorities)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network periodically.
            dqn.update_target()

        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and print_freq is not None and len(
                episode_rewards) % print_freq == 0:
            logger.record_tabular("steps", t)
            logger.record_tabular("episodes", num_episodes)
            logger.record_tabular("reward", reward)
            logger.record_tabular("mean 100 episode reward",
                                  mean_100ep_reward)
            logger.record_tabular("% time spent exploring",
                                  int(100 * exploration.value(t)))
            logger.dump_tabular()

        if (checkpoint_freq is not None and t > learning_starts
                and num_episodes > 100 and t % checkpoint_freq == 0):
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                if print_freq is not None:
                    logger.log(
                        "Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward,
                            mean_100ep_reward))
                save_checkpoint({
                    'epoch': t + 1,
                    'state_dict': dqn.save_state_dict(),
                    'best_accuracy': mean_100ep_reward
                }, checkpoint_path)
                saved_mean_reward = mean_100ep_reward
Example #5
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
    """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True, a prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If the callback returns True, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None

    obs = env.reset()
    # Select all marines first

    player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

    screen = player_relative

    obs, xy_per_marine = common.init(env, obs)

    group_id = 0
    reset = True
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # custom process for DefeatZerglingsAndBanelings

            obs, screen, player = common.select_marine(env, obs)

            action = act(np.array(screen)[None],
                         update_eps=update_eps,
                         **kwargs)[0]
            reset = False
            rew = 0

            new_action = None

            obs, new_action = common.marine_action(env, obs, player, action)
            army_count = env._obs[0].observation.player_common.army_count

            try:
                if army_count > 0 and _ATTACK_SCREEN in obs[0].observation[
                        "available_actions"]:
                    obs = env.step(actions=new_action)
                else:
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)
            except Exception:
                # Ignore step errors (e.g. an action that is no longer available).
                pass

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative

            rew += obs[0].reward

            done = obs[0].step_type == environment.StepType.LAST

            selected = obs[0].observation["screen"][_SELECTED]
            player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

            if (len(player_y) > 0):
                player = [int(player_x.mean()), int(player_y.mean())]

            if (len(player) == 2):

                if (player[0] > 32):
                    new_screen = common.shift(LEFT, player[0] - 32, new_screen)
                elif (player[0] < 32):
                    new_screen = common.shift(RIGHT, 32 - player[0],
                                              new_screen)

                if (player[1] > 32):
                    new_screen = common.shift(UP, player[1] - 32, new_screen)
                elif (player[1] < 32):
                    new_screen = common.shift(DOWN, 32 - player[1], new_screen)

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                print("Episode Reward : %s" % episode_rewards[-1])
                obs = env.reset()
                player_relative = obs[0].observation["screen"][
                    _PLAYER_RELATIVE]

                screen = player_relative

                group_list = common.init(env, obs)

                # Select all marines first
                #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
                episode_rewards.append(0.0)

                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
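
A usage sketch for the learn() above, assuming the module-level pysc2 constants it relies on are already defined and that the returned ActWrapper exposes the same save() method as the baselines original. The q_func builder mirrors the deepq.models.cnn_to_mlp call used in the loading example further below; hyperparameters and the output path are illustrative.

from pysc2.env import sc2_env
from baselines import deepq

def train_defeat_zerglings():
    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True)
    with sc2_env.SC2Env(map_name="DefeatZerglingsAndBanelings",
                        step_mul=8,
                        visualize=False) as env:
        act = learn(env,
                    q_func=model,
                    num_actions=3,
                    max_timesteps=100000,
                    prioritized_replay=True)
        act.save("defeat_zerglings.pkl")  # hypothetical output path
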
Example #6
def worker(remote, map_name, i):

    with sc2_env.SC2Env(map_name=map_name,
                        step_mul=1,
                        screen_size_px=(32, 32),
                        minimap_size_px=(32, 32)) as env:
        available_actions = []  # populated once the first 'reset' or 'step' runs
        result = None
        group_list = []
        xy_per_marine = {}
        while True:
            cmd, data = remote.recv()
            if cmd == 'step':
                # if(common.check_group_list(env, result)):
                #   result, xy_per_marine = common.init(env,result)

                reward = 0

                if (len(group_list) == 0
                        or common.check_group_list(env, result)):
                    print("init group list")
                    result, xy_per_marine = common.init(env, result)
                    group_list = common.update_group_list(result)

                action1 = data[0][0]
                action2 = data[0][1]
                func = actions.FUNCTIONS[action1[0]]
                # print("agent(",i," ) action : ", action1, " func : ", func)
                func = actions.FUNCTIONS[action2[0]]
                #print("agent(",i," ) action : ", action2, " func : ", func, "xy :", action2[1][1])
                x, y = action2[1][1]
                move = True
                if (x == 0 and y == 0):
                    move = False
                result = env.step(actions=[action1])
                reward += result[0].reward
                done = result[0].step_type == environment.StepType.LAST
                if (331 in available_actions and move and not done):
                    try:
                        result = env.step(actions=[action2])
                        reward += result[0].reward
                        done = result[0].step_type == environment.StepType.LAST
                    except Exception as e:
                        print("e :", e)

                ob = (result[0].observation["screen"]
                      [_PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(
                          int)  #  (1, 32, 32)
                selected = result[0].observation["screen"][
                    _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
                # extra = np.zeros((1, 32, 32))
                control_groups = result[0].observation["control_groups"]
                army_count = env._obs[0].observation.player_common.army_count

                # extra[0,0,0] = army_count
                # for id, group in enumerate(control_groups):
                #   control_group_id = id
                #   unit_id = group[0]
                #   count = group[1]
                #   #print("control_group_id :", control_group_id, " unit_id :", unit_id, " count :", count)
                #   extra[0,1, control_group_id] = unit_id
                #   extra[0,2, control_group_id] = count
                #ob = np.append(ob, selected, axis=0) #  (2, 32, 32)
                #ob = np.append(ob, extra, axis=0) # (3, 32, 32)

                available_actions = result[0].observation["available_actions"]
                info = result[0].observation["available_actions"]
                if done:
                    result = env.reset()

                    if (len(group_list) == 0
                            or common.check_group_list(env, result)):
                        print("init group list")
                        result, xy_per_marine = common.init(env, result)
                        group_list = common.update_group_list(result)

                    # ob = result[0].observation["screen"]
                    # reward = result[0].reward
                    # done = result[0].step_type == environment.StepType.LAST
                    info = result[0].observation["available_actions"]

                group_id = action1[1][1][0]
                # print("group_id:", group_id)

                player_y, player_x = (
                    result[0].observation["screen"][_SELECTED] == 1).nonzero()

                if (len(player_x) > 0):
                    if (group_id == 1):
                        xy_per_marine["1"] = [
                            int(player_x.mean()),
                            int(player_y.mean())
                        ]
                    else:
                        xy_per_marine["0"] = [
                            int(player_x.mean()),
                            int(player_y.mean())
                        ]

                remote.send((ob, reward, done, info, army_count,
                             control_groups, selected, xy_per_marine))
            elif cmd == 'reset':
                result = env.reset()
                reward = 0

                if (len(group_list) == 0
                        or common.check_group_list(env, result)):
                    print("init group list")
                    result, xy_per_marine = common.init(env, result)
                    group_list = common.update_group_list(result)

                reward += result[0].reward
                ob = (result[0].observation["screen"]
                      [_PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(int)
                selected = result[0].observation["screen"][
                    _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
                # extra = np.zeros((1, 32, 32))
                control_groups = result[0].observation["control_groups"]
                army_count = env._obs[0].observation.player_common.army_count
                # extra[0,0,0] = army_count
                # for id, group in enumerate(control_groups):
                #   control_group_id = id
                #   unit_id = group[0]
                #   count = group[1]
                #   #print("control_group_id :", control_group_id, " unit_id :", unit_id, " count :", count)
                #   extra[0,1, control_group_id] = unit_id
                #   extra[0,2, control_group_id] = count
                # ob = np.append(ob, selected, axis=0) #  (2, 32, 32)
                # ob = np.append(ob, extra, axis=0) # (3, 32, 32)

                done = result[0].step_type == environment.StepType.LAST
                info = result[0].observation["available_actions"]
                available_actions = result[0].observation["available_actions"]
                remote.send((ob, reward, done, info, army_count,
                             control_groups, selected, xy_per_marine))
            elif cmd == 'close':
                remote.close()
                break
            elif cmd == 'get_spaces':
                remote.send((env.action_spec().functions[data], ""))
            elif cmd == "action_spec":
                remote.send((env.action_spec().functions[data]))
            else:
                raise NotImplementedError
Example #7
        if status != not_over:  # the game is over, whether the player won or lost
            drawRes(matrix, status)
            pygame.display.update()
            for event in pygame.event.get():
                if event.type == pygame.locals.QUIT or (
                        event.type == KEYDOWN and event.key == K_ESCAPE):
                    exitGame()


def main():
    '''
    Main entry point: run this function to start the game. It can be invoked from any caller.
    :return: None
    '''
    global DISPLAYSURF
    matrix = importGameData(readMatrix)
    if not matrix:
        matrix = getNewMatrix(GRID_LEN)  # the matrix holding the tile numbers on the board
        random2(matrix)
        random2(matrix)
    pygame.init()
    DISPLAYSURF = pygame.display.set_mode((SIZE, SIZE))
    pygame.display.set_caption('2048')

    runGame(matrix)


if __name__ == '__main__':
    init()
    main()
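
The 2048 main() above leans on several helpers (getNewMatrix, random2, importGameData, runGame) defined elsewhere in the game module. A rough sketch of what the two board helpers could look like, purely as an assumption about their behaviour:

import random

def getNewMatrix(n):
    """Return an n x n board filled with zeros."""
    return [[0] * n for _ in range(n)]

def random2(matrix):
    """Place a 2 in a random empty cell, if any cell is still empty."""
    empty = [(r, c) for r, row in enumerate(matrix)
             for c, value in enumerate(row) if value == 0]
    if empty:
        r, c = random.choice(empty)
        matrix[r][c] = 2
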
Example #8
def main():
    FLAGS(sys.argv)
    with sc2_env.SC2Env(
            map_name=FLAGS.map_name,
            step_mul=FLAGS.step_mul,
            visualize=FLAGS.visualize,
            game_steps_per_episode=FLAGS.episode_steps * FLAGS.step_mul) as env:

        model = deepq.models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1), (64, 3, 1), (64, 3, 1), (32, 3, 1)],
            hiddens=[256],
            dueling=True
        )

        def make_obs_ph(name):
            return U.BatchInput((64, 64), name=name)

        act_params = {
            'make_obs_ph': make_obs_ph,
            'q_func': model,
            'num_actions': FLAGS.num_actions,
        }

        act = deep_Defeat_zerglings.load(
            FLAGS.trained_model, act_params=act_params)

        while True:
            rew = 0
            old_num = 0
            done = False
            episode_rew = 0
            Action_Choose = False
            episode_rewards = [0.0]
            saved_mean_reward = None

            obs = env.reset()
            obs, xy_per_marine = common.init(env, obs)

            while True:
                Action_Choose = not Action_Choose
                if Action_Choose:
                    # the first action
                    obs, screen, player = common.select_marine(env, obs)
                else:
                    # the second action
                    action = act(
                        np.array(screen)[None])[0]
                    action = common.check_action(obs, action)
                    obs, new_action = common.marine_action(env, obs, player, action)
                    army_count = env._obs[0].observation.player_common.army_count

                    try:
                        if army_count > 0 and (_MOVE_SCREEN in obs[0].observation["available_actions"]):
                            obs = env.step(actions=new_action)
                        else:
                            new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                            obs = env.step(actions=new_action)
                    except Exception as e:
                        print(new_action)
                        print(e)
                        new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                        obs = env.step(actions=new_action)

                rew = obs[0].reward
                done = obs[0].step_type == environment.StepType.LAST
                episode_rewards[-1] += rew

                if done:
                    obs = env.reset()
                    Action_Choose = False
                    group_list = common.init(env, obs)
                    episode_rewards.append(0.0)

                # test for me
                num_episodes = len(episode_rewards)
                if (num_episodes > 102):
                    mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
                else:
                    mean_100ep_reward = round(np.mean(episode_rewards), 1)

                if num_episodes > old_num:
                    old_num = num_episodes
                    if old_num > 2:
                        logger.record_tabular("reward now", episode_rewards[-2])
                    logger.record_tabular("the number of episode", num_episodes)
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.dump_tabular()
                    print("the number of episode", num_episodes)
                    print("mean 100 episode reward",mean_100ep_reward)
Example #9
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=100,
          print_freq=15,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
    """Train a deepq model.
Parameters
-------
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If the callback returns True, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
    # Create all the functions necessary to train the model

    sess = TU.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput((64, 64), name=name)

    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=num_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': num_actions,
    }

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    TU.initialize()
    update_target()

    group_id = 0
    old_num = 0
    reset = True
    Action_Choose = False
    player = []
    episode_rewards = [0.0]
    saved_mean_reward = None
    marine_record = {}

    obs = env.reset()
    screen = obs[0].observation["screen"][_UNIT_TYPE]
    obs, xy_per_marine = common.init(env, obs)

    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                if param_noise_threshold >= 0.:
                    update_param_noise_threshold = param_noise_threshold
                else:
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = -np.log(
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs[
                    'update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # custom process for DefeatZerglingsAndBanelings
            reset = False
            Action_Choose = not Action_Choose

            if Action_Choose:
                # the first action
                obs, screen, group_id, player = common.select_marine(env, obs)
                marine_record = common.run_record(marine_record, obs)

            else:
                # the second action
                action = act(np.array(screen)[None],
                             update_eps=update_eps,
                             **kwargs)[0]
                action = common.check_action(obs, action)
                new_action = None

                obs, new_action, marine_record = common.marine_action(
                    env, obs, group_id, player, action, marine_record)
                army_count = env._obs[0].observation.player_common.army_count

                try:
                    if army_count > 0 and (
                            _MOVE_SCREEN
                            in obs[0].observation["available_actions"]):
                        obs = env.step(actions=new_action)
                    else:
                        new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                        obs = env.step(actions=new_action)
                except Exception as e:
                    print(new_action)
                    print(e)
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)
                # get the new screen in action 2
                player_y, player_x = np.nonzero(
                    obs[0].observation["screen"][_SELECTED] == 1)
                new_screen = obs[0].observation["screen"][_UNIT_TYPE]
                for i in range(len(player_y)):
                    new_screen[player_y[i]][player_x[i]] = 49

            #update every step
            rew = obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST
            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if not Action_Choose:  # only store the transition after the action has been executed
                replay_buffer.add(screen, action, rew, new_screen, float(done))
                mirror_new_screen = common._map_mirror(new_screen)
                mirror_screen = common._map_mirror(screen)
                replay_buffer.add(mirror_screen, action, rew,
                                  mirror_new_screen, float(done))

            if done:
                obs = env.reset()
                Action_Choose = False
                group_list = common.init(env, obs)
                episode_rewards.append(0.0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            num_episodes = len(episode_rewards)
            #test for me
            if num_episodes > old_num:
                old_num = num_episodes
                print("now the episode is {}".format(num_episodes))
            #test for me
            if (num_episodes > 102):
                mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            else:
                mean_100ep_reward = round(np.mean(episode_rewards), 1)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                print("get the log")
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}"
                            .format(saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)
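
The loop above augments the replay buffer with a mirrored copy of every transition. A minimal sketch of what a helper like common._map_mirror might do, assuming a simple left-right flip of the 2-D screen layer; the real implementation lives in the project's common module and may differ.

import numpy as np

def _map_mirror(screen):
    """Return a left-right mirrored copy of a 2-D screen feature layer."""
    return np.fliplr(np.asarray(screen)).copy()
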
Example #10
def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
  """Train a deepq model.

Parameters
-------
env: pysc2.env.SC2Env
    environment to train on
q_func: (tf.Variable, int, str, bool) -> tf.Variable
    the model that takes the following inputs:
        observation_in: object
            the output of observation placeholder
        num_actions: int
            number of actions
        scope: str
        reuse: bool
            should be passed to outer variable scope
    and returns a tensor of shape (batch_size, num_actions) with values of every action.
lr: float
    learning rate for adam optimizer
max_timesteps: int
    number of env steps to optimize for
buffer_size: int
    size of the replay buffer
exploration_fraction: float
    fraction of entire training period over which the exploration rate is annealed
exploration_final_eps: float
    final value of random action probability
train_freq: int
    update the model every `train_freq` steps.
batch_size: int
    size of a batch sampled from the replay buffer for training
print_freq: int
    how often to print out training progress
    set to None to disable printing
checkpoint_freq: int
    how often to save the model. This is so that the best version is restored
    at the end of the training. If you do not wish to restore the best version at
    the end of the training set this variable to None.
learning_starts: int
    how many steps of the model to collect transitions for before learning starts
gamma: float
    discount factor
target_network_update_freq: int
    update the target network every `target_network_update_freq` steps.
prioritized_replay: bool
    if True, a prioritized replay buffer will be used.
prioritized_replay_alpha: float
    alpha parameter for prioritized replay buffer
prioritized_replay_beta0: float
    initial value of beta for prioritized replay buffer
prioritized_replay_beta_iters: int
    number of iterations over which beta will be annealed from initial value
    to 1.0. If set to None, defaults to max_timesteps.
prioritized_replay_eps: float
    epsilon to add to the TD errors when updating priorities.
num_cpu: int
    number of cpus to use for training
callback: (locals, globals) -> None
    function called at every step with the state of the algorithm.
    If the callback returns True, training stops.

Returns
-------
act: ActWrapper
    Wrapper over act function. Adds ability to save it and load it.
    See header of baselines/deepq/categorical.py for details on the act function.
"""
  # Create all the functions necessary to train the model

  sess = U.make_session(num_cpu=num_cpu)
  sess.__enter__()

  def make_obs_ph(name):
    return U.BatchInput((64, 64), name=name)

  act, train, update_target, debug = deepq.build_train(
    make_obs_ph=make_obs_ph,
    q_func=q_func,
    num_actions=num_actions,
    optimizer=tf.train.AdamOptimizer(learning_rate=lr),
    gamma=gamma,
    grad_norm_clipping=10)
  act_params = {
    'make_obs_ph': make_obs_ph,
    'q_func': q_func,
    'num_actions': num_actions,
  }

  # Create the replay buffer
  if prioritized_replay:
    replay_buffer = PrioritizedReplayBuffer(
      buffer_size, alpha=prioritized_replay_alpha)
    if prioritized_replay_beta_iters is None:
      prioritized_replay_beta_iters = max_timesteps
    beta_schedule = LinearSchedule(
      prioritized_replay_beta_iters,
      initial_p=prioritized_replay_beta0,
      final_p=1.0)
  else:
    replay_buffer = ReplayBuffer(buffer_size)
    beta_schedule = None
  # Create the schedule for exploration starting from 1.
  exploration = LinearSchedule(
    schedule_timesteps=int(exploration_fraction * max_timesteps),
    initial_p=1.0,
    final_p=exploration_final_eps)

  # Initialize the parameters and copy them to the target network.
  U.initialize()
  update_target()

  episode_rewards = [0.0]
  saved_mean_reward = None

  obs = env.reset()
  # Select all marines first

  player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]

  screen = player_relative

  obs, xy_per_marine = common.init(env, obs)

  group_id = 0
  reset = True
  with tempfile.TemporaryDirectory() as td:
    model_saved = False
    model_file = os.path.join(td, "model")

    for t in range(max_timesteps):
      if callback is not None:
        if callback(locals(), globals()):
          break
      # Take action and update exploration to the newest value
      kwargs = {}
      if not param_noise:
        update_eps = exploration.value(t)
        update_param_noise_threshold = 0.
      else:
        update_eps = 0.
        if param_noise_threshold >= 0.:
          update_param_noise_threshold = param_noise_threshold
        else:
          # Compute the threshold such that the KL divergence between perturbed and non-perturbed
          # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
          # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
          # for detailed explanation.
          update_param_noise_threshold = -np.log(
            1. - exploration.value(t) +
            exploration.value(t) / float(num_actions))
        kwargs['reset'] = reset
        kwargs['update_param_noise_threshold'] = update_param_noise_threshold
        kwargs['update_param_noise_scale'] = True

      # custom process for DefeatZerglingsAndBanelings

      obs, screen, player = common.select_marine(env, obs)

      action = act(
        np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
      reset = False
      rew = 0

      new_action = None

      obs, new_action = common.marine_action(env, obs, player, action)
      army_count = env._obs[0].observation.player_common.army_count

      try:
        if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
          obs = env.step(actions=new_action)
        else:
          new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
          obs = env.step(actions=new_action)
      except Exception:
        # The environment occasionally rejects the chosen action; ignore the
        # failure and keep stepping.
        pass

      player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
      new_screen = player_relative

      rew += obs[0].reward

      done = obs[0].step_type == environment.StepType.LAST

      selected = obs[0].observation["screen"][_SELECTED]
      player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()

      if len(player_y) > 0:
        player = [int(player_x.mean()), int(player_y.mean())]

      if len(player) == 2:
        # Re-center the 64x64 screen on the selected marine so that it always
        # appears at (32, 32) from the network's point of view.
        if player[0] > 32:
          new_screen = common.shift(LEFT, player[0] - 32, new_screen)
        elif player[0] < 32:
          new_screen = common.shift(RIGHT, 32 - player[0], new_screen)

        if player[1] > 32:
          new_screen = common.shift(UP, player[1] - 32, new_screen)
        elif player[1] < 32:
          new_screen = common.shift(DOWN, 32 - player[1], new_screen)

      # Store transition in the replay buffer.
      replay_buffer.add(screen, action, rew, new_screen, float(done))
      screen = new_screen

      episode_rewards[-1] += rew
      reward = episode_rewards[-1]

      if done:
        print("Episode Reward : %s" % episode_rewards[-1])
        obs = env.reset()
        player_relative = obs[0].observation["screen"][
          _PLAYER_RELATIVE]

        screen = player_relative

        obs, xy_per_marine = common.init(env, obs)

        # Select all marines first
        #env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
        episode_rewards.append(0.0)

        reset = True

      if t > learning_starts and t % train_freq == 0:
        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
        if prioritized_replay:
          experience = replay_buffer.sample(
            batch_size, beta=beta_schedule.value(t))
          (obses_t, actions, rewards, obses_tp1, dones, weights,
           batch_idxes) = experience
        else:
          obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
            batch_size)
          weights, batch_idxes = np.ones_like(rewards), None
        td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                          weights)
        if prioritized_replay:
          new_priorities = np.abs(td_errors) + prioritized_replay_eps
          replay_buffer.update_priorities(batch_idxes,
                                          new_priorities)
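          # Priorities are |TD error| + prioritized_replay_eps, so every
          # transition keeps a non-zero probability of being sampled again.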

      if t > learning_starts and t % target_network_update_freq == 0:
        # Update target network periodically.
        update_target()

      mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
      num_episodes = len(episode_rewards)
      if done and print_freq is not None and len(
          episode_rewards) % print_freq == 0:
        logger.record_tabular("steps", t)
        logger.record_tabular("episodes", num_episodes)
        logger.record_tabular("reward", reward)
        logger.record_tabular("mean 100 episode reward",
                              mean_100ep_reward)
        logger.record_tabular("% time spent exploring",
                              int(100 * exploration.value(t)))
        logger.dump_tabular()

      if (checkpoint_freq is not None and t > learning_starts
          and num_episodes > 100 and t % checkpoint_freq == 0):
        if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
          if print_freq is not None:
            logger.log("Saving model due to mean reward increase: {} -> {}".format(
              saved_mean_reward, mean_100ep_reward))
          U.save_state(model_file)
          model_saved = True
          saved_mean_reward = mean_100ep_reward
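    # After training, reload the checkpoint with the best mean 100-episode
    # reward seen so far (if any was saved).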
    if model_saved:
      if print_freq is not None:
        logger.log("Restored model with mean reward: {}".format(
          saved_mean_reward))
      U.load_state(model_file)

  return ActWrapper(act)
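For reference, here is a minimal sketch of how a learn-style entry point with the parameters documented above might be invoked. The function name `learn`, the wrapper `run_training`, and the `q_func` builder are assumptions for illustration only; the SC2Env keyword arguments mirror those used elsewhere in this file, and `ActWrapper.save()` follows the Returns section of the docstring.

# Hypothetical driver for the training function above (names are assumed).
from pysc2.env import sc2_env


def run_training(learn, q_func):
  """learn: the training entry point above; q_func: a CNN Q-network builder."""
  with sc2_env.SC2Env(
      map_name="DefeatZerglingsAndBanelings",
      step_mul=8,
      screen_size_px=(64, 64),   # match the (64, 64) observation placeholder
      minimap_size_px=(64, 64)) as env:
    act = learn(
      env,
      q_func=q_func,
      num_actions=4,             # assumed size of the discrete action space
      lr=5e-4,
      max_timesteps=2000000,
      buffer_size=100000,
      exploration_fraction=0.5,
      exploration_final_eps=0.01,
      train_freq=4,
      learning_starts=100000,
      target_network_update_freq=1000,
      gamma=0.99,
      prioritized_replay=True,
      prioritized_replay_alpha=0.6,
      num_cpu=4)
    # ActWrapper exposes save()/load(), per the docstring above.
    act.save("defeat_zerglings_model.pkl")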
Example #11
0
def worker(remote, map_name, nscripts, i):

  with sc2_env.SC2Env(
      map_name=map_name,
      step_mul=2,
      screen_size_px=(32, 32),
      minimap_size_px=(32, 32)) as env:
    available_actions = []
    result = None
    group_list = []
    xy_per_marine = {}
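    # Per-worker episode state: the list of control groups and the last known
    # (x, y) screen position of each marine, keyed by its group id.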
    while True:
      cmd, data = remote.recv()
      if cmd == 'step':
        reward = 0

        if len(group_list) == 0 or common.check_group_list(env, result):
          print("init group list")
          result, xy_per_marine = common.init(env, result)
          group_list = common.update_group_list(result)

        action1 = data[0][0]
        action2 = data[0][1]
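        # data[0] carries a pair of (function_id, args) actions: action1
        # typically selects a control group, action2 issues the
        # screen-targeted order for that selection.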
        # func = actions.FUNCTIONS[action1[0]]
        # print("agent(",i," ) action : ", action1, " func : ", func)
        func = actions.FUNCTIONS[action2[0]]
        # print("agent(",i," ) action : ", action2, " func : ", func)


        result = env.step(actions=[action1])
        reward += result[0].reward
        done = result[0].step_type == environment.StepType.LAST

        move = True

        if len(action2[1]) == 2:
          x, y = action2[1][1]
          # print("x, y:", x, y)

          # if x == 0 and y == 0:
          #   move = False

        # 331 is the id of Move_screen in pysc2's function table.
        if 331 in available_actions and move and not done:
          try:
            result = env.step(actions=[action2])
            reward += result[0].reward
            done = result[0].step_type == environment.StepType.LAST
          except Exception as e:
            print("e :", e)

        ob = (result[0].observation["screen"][
            _PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(int)
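        # Binary (1, 32, 32) mask of the target units on the player_relative
        # screen layer (pixels whose value equals 3).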

        #  (1, 32, 32)
        selected = result[0].observation["screen"][
            _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
        # extra = np.zeros((1, 32, 32))
        control_groups = result[0].observation["control_groups"]
        army_count = env._obs[0].observation.player_common.army_count

        available_actions = result[0].observation["available_actions"]
        info = result[0].observation["available_actions"]
        if done:
          result = env.reset()

          if len(group_list) == 0 or common.check_group_list(env, result):
            # print("init group list")
            result, xy_per_marine = common.init(env, result)
            group_list = common.update_group_list(result)

          info = result[0].observation["available_actions"]

        if len(action1[1]) == 2:

          group_id = action1[1][1][0]

          player_y, player_x = (result[0].observation["screen"][
              _SELECTED] == 1).nonzero()

          if len(player_x) > 0:
            if group_id == 1:
              xy_per_marine["1"] = [int(player_x.mean()), int(player_y.mean())]
            else:
              xy_per_marine["0"] = [int(player_x.mean()), int(player_y.mean())]
          
        remote.send((ob, reward, done, info, army_count,
                     control_groups, selected, xy_per_marine))

      elif cmd == 'reset':
        result = env.reset()
        reward = 0

        if len(group_list) == 0 or common.check_group_list(env, result):
          # print("init group list")
          result, xy_per_marine = common.init(env, result)
          group_list = common.update_group_list(result)

        reward += result[0].reward
        ob = (result[0].observation["screen"][
              _PLAYER_RELATIVE:_PLAYER_RELATIVE + 1] == 3).astype(int)
        selected = result[0].observation["screen"][
                   _SELECTED:_SELECTED + 1]  #  (1, 32, 32)
        # extra = np.zeros((1, 32, 32))
        control_groups = result[0].observation["control_groups"]
        army_count = env._obs[0].observation.player_common.army_count

        done = result[0].step_type == environment.StepType.LAST
        info = result[0].observation["available_actions"]
        available_actions = result[0].observation["available_actions"]
        remote.send((ob, reward, done, info, army_count,
                     control_groups, selected, xy_per_marine))
      elif cmd == 'close':
        remote.close()
        break
      elif cmd == 'get_spaces':
        remote.send((env.action_spec().functions[data], ""))
      elif cmd == "action_spec":
        remote.send((env.action_spec().functions[data]))
      else:
        raise NotImplementedError
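A worker like this is normally driven from a parent process over a multiprocessing Pipe, one pipe end per environment. The sketch below is not part of the original file; it only illustrates the command strings the worker handles above ('reset', 'step', 'close'), with placeholder values for the map name and nscripts.

# Hypothetical parent-side driver for the worker above.
from multiprocessing import Pipe, Process

n_envs = 2
remotes, work_remotes = zip(*[Pipe() for _ in range(n_envs)])
procs = [
  Process(target=worker,
          args=(work_remote, "DefeatZerglingsAndBanelings", 1, i))
  for i, work_remote in enumerate(work_remotes)
]
for p in procs:
  p.daemon = True  # terminate workers if the parent dies
  p.start()

# Each command gets one reply: the 8-tuple
# (ob, reward, done, info, army_count, control_groups, selected, xy_per_marine).
for remote in remotes:
  remote.send(('reset', None))
initial = [remote.recv() for remote in remotes]

# A training loop would repeatedly send ('step', [[action1, action2]]),
# where action1/action2 are (function_id, args) pairs for the two orders.
for remote in remotes:
  remote.send(('close', None))
for p in procs:
  p.join()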