Example #1
class Agent(object):
    def __init__(self,
                 sess,
                 log_path,
                 env_function,
                 num_cpu,
                 network="fullyconv",
                 ar=True,
                 lr=1e-4,
                 optimizer="rmsprop",
                 ent_coef=1e-3,
                 vf_coef=1.0,
                 max_grad_norm=0.5,
                 nsteps=5,
                 nstack=1,
                 gamma=0.99):

        if optimizer == "adam":
            self.trainer = tf.train.AdamOptimizer
        elif optimizer == "rmsprop":
            self.trainer = tf.train.RMSPropOptimizer
        else:
            raise NotImplementedError

        network_func = None
        if network == "fullyconv":
            network_func = networks.FullyConvNet
        elif network == "atari":
            network_func = networks.AtariNet
        else:
            raise NotImplementedError

        self.sess = sess
        self.log_path = log_path
        self.num_cpu = num_cpu
        self.env_function = env_function
        self.init_lr = lr
        self.env = SubprocVecEnv([self.env_function(i) for i in range(1)])
        self.model = Model(network_func=network_func,
                           screen_space=self.env.screen_space,
                           minimap_space=self.env.minimap_space,
                           ns_space=self.env.ns_space,
                           trainer=self.trainer,
                           ar=ar,
                           nstack=nstack,
                           ent_coef=ent_coef,
                           vf_coef=vf_coef,
                           max_grad_norm=max_grad_norm)
        self.gamma = gamma
        self.nsteps = nsteps
        self.ar = ar

        if ar:
            self.step_func = self.step_policy_ar
        else:
            self.step_func = self.step_policy
Example #2
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()
    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
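
Both branches above rely on the same convention: SubprocVecEnv and DummyVecEnv take a list of zero-argument callables ("thunks"), each of which builds one environment inside its own worker. A minimal standalone sketch of that pattern, assuming OpenAI baselines and gym are installed; the env id, seed, and worker count are illustrative.

import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv


def make_thunk(env_id, seed, rank):
    # Each worker calls this thunk once to build and seed its own env copy.
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _thunk


num_env = 4
thunks = [make_thunk('CartPole-v1', seed=0, rank=i) for i in range(num_env)]
venv = SubprocVecEnv(thunks) if num_env > 1 else DummyVecEnv(thunks)

obs = venv.reset()                                # (num_env,) + obs_space.shape
actions = [venv.action_space.sample() for _ in range(num_env)]
obs, rewards, dones, infos = venv.step(actions)   # one transition per worker
venv.close()
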
Example #3
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 cloth_cfg_path=None,
                 render_path=None,
                 start_state_path=None):
    """Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.

    Daniel: the above docs from baselines seem out of date; do ALL env types go here?
    Also, we're adding arguments for the cloth env: the config path, the render
    path, and the starting state path (the last one is optional for the cloth).
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank,
                   cloth_cfg_path=None,
                   render_path=None,
                   start_state_path=None):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir,
            cloth_cfg_path=cloth_cfg_path,
            render_path=render_path,
            start_state_path=start_state_path,
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([
            make_thunk(
                i + start_index,
                cloth_cfg_path=cloth_cfg_path,
                render_path=None,  # Daniel: for now
                start_state_path=start_state_path) for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(start_index,
                       cloth_cfg_path,
                       render_path=render_path,
                       start_state_path=start_state_path)
        ])
Example #4
def main():
    FLAGS(sys.argv)
    env = SubprocVecEnv(1, 'CollectMineralShards')
    env.reset()
    total_reward = 0
    for _ in range(1000):
        marine = random.randrange(2)
        x = random.randrange(32)
        y = random.randrange(32)
        print('Move %d to (%d, %d)' % (marine, x, y))
        move_action = construct_action(marine, x, y)
        # This controls the APM.
        for _ in range(7):
            obs, rs, dones, _, _, _, selected, screens = env.step(
                [move_action])
            total_reward += rs
        # Querying the position
        m_pos = {}
        m_pos['0'], rs, dones = get_position(env, 0)
        total_reward += rs
        m_pos['1'], rs, dones = get_position(env, 1)
        total_reward += rs

        print(rs)
        print(dones)
        print('Total reward: ', total_reward)
        print(m_pos)

    env.close()
Example #6
        def make_vec_envs(evaluation):
            def env_thunk(rank):
                return lambda: self.make_env(seed=int(seed),
                                             rank=rank,
                                             evaluation=evaluation,
                                             env_id=env_id)

            env_fns = [env_thunk(i) for i in range(num_processes)]
            use_dummy = (len(env_fns) == 1 or sys.platform == "darwin"
                         or synchronous)
            return VecPyTorch(
                DummyVecEnv(env_fns, render=render)
                if use_dummy else SubprocVecEnv(env_fns))
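
VecPyTorch in this example is project-specific rather than part of baselines. A minimal sketch of what such a wrapper typically does, assuming it follows the common pytorch-a2c-ppo-acktr pattern of returning torch tensors instead of numpy arrays; the class body below is illustrative, not this repository's implementation.

import numpy as np
import torch
from baselines.common.vec_env import VecEnvWrapper


class VecPyTorch(VecEnvWrapper):
    # Illustrative sketch: wrap a VecEnv so reset/step return torch tensors.
    def __init__(self, venv, device="cpu"):
        super().__init__(venv)
        self.device = torch.device(device)

    def reset(self):
        obs = self.venv.reset()
        return torch.from_numpy(obs).float().to(self.device)

    def step_async(self, actions):
        if isinstance(actions, torch.Tensor):
            actions = actions.cpu().numpy()
        self.venv.step_async(actions)

    def step_wait(self):
        obs, rewards, dones, infos = self.venv.step_wait()
        obs = torch.from_numpy(obs).float().to(self.device)
        rewards = torch.from_numpy(np.asarray(rewards)).float().unsqueeze(-1)
        return obs, rewards, dones, infos
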
Example #7
    def make_vec_envs(
        self,
        num_processes,
        gamma,
        render,
        synchronous,
        env_id,
        add_timestep,
        seed,
        evaluation,
        time_limit,
        num_frame_stack=None,
        **env_args,
    ):
        envs = [
            functools.partial(  # thunk
                self.make_env,
                rank=i,
                env_id=env_id,
                add_timestep=add_timestep,
                seed=seed,
                evaluation=evaluation,
                time_limit=time_limit,
                evaluating=evaluation,
                **env_args,
            ) for i in range(num_processes)
        ]

        if len(envs) == 1 or sys.platform == "darwin" or synchronous:
            envs = DummyVecEnv(envs, render=render)
        else:
            envs = SubprocVecEnv(envs)

        # if (envs.observation_space.shape
        #         and len(envs.observation_space.shape) == 1):
        #     if gamma is None:
        #         envs = VecNormalize(envs, ret=False)
        #     else:
        #         envs = VecNormalize(envs, gamma=gamma)

        envs = VecPyTorch(envs)

        if num_frame_stack is not None:
            envs = VecPyTorchFrameStack(envs, num_frame_stack)
        # elif len(envs.observation_space.shape) == 3:
        #     envs = VecPyTorchFrameStack(envs, 4, device)

        return envs
Example #8
def build_env4gail(args, nenv):
    def make_env():
        def _thunk():
            env = gym.make(args.env_id)
            env.seed(args.seed)  # to make the results more reproducible
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        return _thunk

    envs = [make_env() for i in range(nenv)]
    envs = SubprocVecEnv(envs)
    envs = VecNormalize(envs)

    return envs
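
VecNormalize keeps running estimates of the observation statistics (and of discounted returns) and standardizes what the wrapped workers return. A rough sketch of the underlying idea, not baselines' actual implementation; the names below are illustrative.

import numpy as np


class RunningMeanStd:
    # Running mean/variance over batches (parallel-variance combination).
    def __init__(self, shape):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = 1e-4

    def update(self, x):
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        m_a = self.var * self.count
        m_b = batch_var * n
        self.var = (m_a + m_b + delta ** 2 * self.count * n / total) / total
        self.count = total


def normalize_obs(rms, obs, clip=10.0, eps=1e-8):
    # Update the running statistics, then standardize and clip the batch.
    rms.update(obs)
    return np.clip((obs - rms.mean) / np.sqrt(rms.var + eps), -clip, clip)
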
Example #9
def make_atari_env(env_id, num_threads, seed, frame_stack=4):
    game_lives = gym.make(env_id).unwrapped.ale.lives()
    game_lives = game_lives if game_lives != 0 else 1

    def make_env(rank):
        def _thunk():
            env = wrappers.make_atari(env_id)
            env.seed(seed + rank)
            return wrappers.wrap_deepmind(env)

        return _thunk

    np.random.seed(seed)
    env = SubprocVecEnv([make_env(i) for i in range(num_threads)])
    env = VecFrameStack(env, frame_stack)
    return env, game_lives
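
A possible call site for the helper above; the env id and thread count are illustrative, and it assumes the wrappers module imported by this snippet is available.

env, game_lives = make_atari_env('BreakoutNoFrameskip-v4', num_threads=4, seed=0)
obs = env.reset()                                 # stacked frames from all 4 workers
actions = [env.action_space.sample() for _ in range(4)]
obs, rewards, dones, infos = env.step(actions)
env.close()
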
Example #10
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            env = Monitor(
                env,
                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)

        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
Example #11
def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
Example #12
def make_vec_env(env_id,
                 env_type,
                 num_env,
                 seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    if wrapper_kwargs is None: wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id) if env_type == 'atari' else gym.make(
                env_id)
            env.seed(seed + 10000 * mpi_rank +
                     rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir()
                          and os.path.join(logger.get_dir(),
                                           str(mpi_rank) + '.' + str(rank)),
                          allow_early_resets=True)

            if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs)
            elif reward_scale != 1: return RewardScaler(env, reward_scale)
            else: return env

        return _thunk

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv(
            [make_env(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_env(start_index)])
Example #13
class Agent(object):
    def __init__(self,
                 sess,
                 log_path,
                 env_function,
                 num_cpu,
                 network="fullyconv",
                 ar=True,
                 lr=1e-4,
                 optimizer="rmsprop",
                 ent_coef=1e-3,
                 vf_coef=1.0,
                 max_grad_norm=0.5,
                 nsteps=5,
                 nstack=1,
                 gamma=0.99):

        if optimizer == "adam":
            self.trainer = tf.train.AdamOptimizer
        elif optimizer == "rmsprop":
            self.trainer = tf.train.RMSPropOptimizer
        else:
            raise NotImplementedError

        network_func = None
        if network == "fullyconv":
            network_func = networks.FullyConvNet
        elif network == "atari":
            network_func = networks.AtariNet
        else:
            raise NotImplementedError

        self.sess = sess
        self.log_path = log_path
        self.num_cpu = num_cpu
        self.env_function = env_function
        self.init_lr = lr
        self.env = SubprocVecEnv([self.env_function(i) for i in range(1)])
        self.model = Model(network_func=network_func,
                           screen_space=self.env.screen_space,
                           minimap_space=self.env.minimap_space,
                           ns_space=self.env.ns_space,
                           trainer=self.trainer,
                           ar=ar,
                           nstack=nstack,
                           ent_coef=ent_coef,
                           vf_coef=vf_coef,
                           max_grad_norm=max_grad_norm)
        self.gamma = gamma
        self.nsteps = nsteps
        self.ar = ar

        if ar:
            self.step_func = self.step_policy_ar
        else:
            self.step_func = self.step_policy

    def training_process(self, epoch, train_steps, epsilon):
        # set learning rate
        current_lr = self.init_lr * (0.98**epoch)
        current_lr = max(current_lr, self.init_lr / 2.)

        mb_screen, mb_minimap, mb_ns, mb_rewards, mb_actions, mb_available_actions, mb_pos, mb_values, mb_dones = [],[],[],[],[],[],[],[],[]
        mb_args, mb_args_used = dict(), dict()
        mb_use_spatial_actions = []
        for act_type in actions.TYPES:
            mb_args[act_type.name] = []
            mb_args_used[act_type.name] = []
        mb_states = []

        self.epsilon = epsilon
        self.remake_env(self.num_cpu)
        obs, info = self.env.reset()
        screen, minimap, ns, available_actions = obs["screen"], obs[
            "minimap"], obs["ns"], info["available_actions"]
        states = None
        update_steps = 0
        start_time = time.time()
        print("=== Training Epoch: ", epoch, ", Learning Rate: ", current_lr,
              " ===")
        # with tqdm(total=train_steps) as pbar:
        while True:
            print_log = False

            if len(mb_screen) == self.nsteps - 1:
                print_log = True

            action, arg, value, state = self.step_func(
                screen,
                minimap,
                ns,
                states,
                print_log=print_log,
                epsilon=epsilon,
                available_actions=available_actions)
            # action, arg, value, state = self.step_epsilon(screen, minimap, ns, states, print_log=print_log, epsilon=epsilon, available_actions=available_actions)
            mb_screen.append(np.copy(screen))
            mb_minimap.append(np.copy(minimap))
            mb_ns.append(np.copy(ns))
            mb_available_actions.append(np.copy(available_actions))
            mb_actions.append(action)
            # for a in arg:
            for act_type in actions.TYPES:
                temp, temp_used = [], []
                for a in arg:
                    if a[act_type.name] != -1:
                        temp.append(a[act_type.name])
                        temp_used.append(1.)
                    else:
                        temp.append(0)
                        temp_used.append(0.)
                mb_args[act_type.name].append(temp)
                mb_args_used[act_type.name].append(temp_used)

            mb_values.append(value)
            mb_dones.append(info["last"])
            next_obs, info = self.env.step(action, arg)
            '''
            # This part seems useless. Check later.
            for idx, done in enumerate(info["last"]):
                if done:
                    obs[idx] = obs[idx] * 0
            '''
            obs = next_obs
            mb_rewards.append(info["reward"])
            screen, minimap, ns, available_actions = obs["screen"], obs[
                "minimap"], obs["ns"], info["available_actions"]

            if len(mb_screen) == self.nsteps:
                mb_dones.append(info["last"])
                mb_screen = np.asarray(mb_screen, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) +
                                  self.env.screen_space)
                mb_minimap = np.asarray(mb_minimap, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) +
                                  self.env.minimap_space)
                mb_ns = np.asarray(mb_ns, dtype=np.float32).swapaxes(
                    1, 0).reshape((self.num_cpu * self.nsteps, ) +
                                  self.env.ns_space)
                mb_available_actions = np.asarray(
                    mb_available_actions, dtype=np.float32).swapaxes(
                        1, 0).reshape((self.num_cpu * self.nsteps, ) +
                                      (len(pysc2.lib.actions.FUNCTIONS), ))
                mb_rewards = np.asarray(mb_rewards,
                                        dtype=np.float32).swapaxes(1, 0)
                mb_actions = np.asarray(mb_actions,
                                        dtype=np.int32).swapaxes(1, 0)
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = np.asarray(
                        mb_args[act_type.name], dtype=np.int32).swapaxes(1, 0)
                    mb_args_used[act_type.name] = np.asarray(
                        mb_args_used[act_type.name],
                        dtype=np.float32).swapaxes(1, 0)
                mb_values = np.asarray(mb_values,
                                       dtype=np.float32).swapaxes(1, 0)
                mb_dones = np.asarray(mb_dones,
                                      dtype=np.float32).swapaxes(1, 0)
                mb_masks = mb_dones[:, :-1]
                mb_dones = mb_dones[:, 1:]
                last_values = self.value(screen, minimap, ns).tolist()
                for n, (rewards, dones, value) in enumerate(
                        zip(mb_rewards, mb_dones, last_values)):
                    rewards = rewards.tolist()
                    dones = dones.tolist()
                    if dones[-1] == 0:
                        rewards = discount_with_dones(rewards + [value],
                                                      dones + [0],
                                                      self.gamma)[:-1]
                    else:
                        rewards = discount_with_dones(rewards, dones,
                                                      self.gamma)
                    mb_rewards[n] = rewards
                mb_rewards = mb_rewards.flatten()
                mb_actions = mb_actions.flatten()
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = mb_args[act_type.name].flatten()
                    mb_args_used[act_type.name] = mb_args_used[
                        act_type.name].flatten()
                mb_values = mb_values.flatten()
                mb_masks = mb_masks.flatten()
                self.train(current_lr, mb_screen, mb_minimap, mb_ns, mb_states,
                           mb_rewards, mb_masks, mb_actions,
                           mb_use_spatial_actions, mb_available_actions,
                           mb_pos, mb_values, mb_args, mb_args_used)
                update_steps += 1
                # pbar.update(1)
                mb_screen, mb_minimap, mb_ns, mb_rewards, mb_actions, mb_available_actions, mb_pos, mb_values, mb_dones = [],[],[],[],[],[],[],[],[]
                mb_args, mb_args_used = dict(), dict()
                for act_type in actions.TYPES:
                    mb_args[act_type.name] = []
                    mb_args_used[act_type.name] = []

            if update_steps == train_steps:
                break

        self.env.close()
        print("=== Took ", (time.time() - start_time), " seconds to finish ",
              train_steps, " updates.===")

    def evaluating_process(self, epoch, episodes):
        # Since the game lengths are different for each game, only use one thread when evaluating
        self.remake_env(1)
        rewards = []
        for _ in range(episodes):
            episode_reward = [0]
            obs, info = self.env.reset()
            screen, minimap, ns, available_actions = obs["screen"], obs[
                "minimap"], obs["ns"], info["available_actions"]
            states = None
            while True:
                action, arg, value, state = self.step_func(
                    screen,
                    minimap,
                    ns,
                    states,
                    available_actions=available_actions)
                obs, info = self.env.step(action, arg)
                episode_reward = [
                    sum(x) for x in zip(episode_reward, info["reward"])
                ]
                screen, minimap, ns, available_actions = obs["screen"], obs[
                    "minimap"], obs["ns"], info["available_actions"]
                if info["last"][0]:
                    rewards.append(episode_reward)
                    break
        rewards = [r for sublist in rewards for r in sublist]
        self.env.save_replay("%sreplay" % self.log_path, epoch)
        self.env.close()
        return rewards

    def step_policy_ar(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, a_probs, v_s, args = self.sess.run(
            [
                self.model.pi_selected, self.model.base_action_softmax,
                self.model.value, self.model.args_selected
            ], {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns,
                self.model.act_mask: _kwargs["available_actions"]
            })

        a_s = np.reshape(a_s, -1)
        a_probs = np.reshape(a_probs, (-1, self.env.base_action_count))

        aid_s, args_s = [], []
        for idx in range(len(a_s)):
            aid = a_s[idx]
            aid_s.append(aid)
            temp_args = dict()
            for k, v in args.items():
                temp_args[k] = -1
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.reshape(args[arg.name], -1)[idx]
            args_s.append(temp_args)

        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("AR action: ",
                      aid_s[0],
                      " action_prob: ",
                      a_probs[0][aid_s[0]],
                      " pos: (",
                      args_s[0]["screen"] % self.env.screen_space[1],
                      ",",
                      args_s[0]["screen"] // self.env.screen_space[1],
                      ") ",
                      end='')
            else:
                print("AR action: ",
                      aid_s[0],
                      " action_prob: ",
                      a_probs[0][aid_s[0]],
                      end='')

        return aid_s, args_s, v_s, []

    def step_policy(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, args = self.sess.run(
            [
                self.model.base_action_softmax, self.model.value,
                self.model.args
            ], {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })

        available_actions = _kwargs["available_actions"]
        filtered_a = np.multiply(a_s, available_actions)
        filtered_a /= np.sum(filtered_a, axis=1, keepdims=True)

        aid_s, args_s = [], []
        for idx in range(np.shape(filtered_a)[0]):
            aid = np.random.choice(len(filtered_a[idx, :]),
                                   p=filtered_a[idx, :])
            aid_s.append(aid)
            temp_args = dict()
            # initialize all arguments to -1
            for k, v in args.items():
                temp_args[k] = -1
            # only sample needed arguments
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.random.choice(len(
                    args[arg.name][idx]),
                                                       p=args[arg.name][idx])
            args_s.append(temp_args)

        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("action: ",
                      aid_s[0],
                      " action_prob: ",
                      filtered_a[0][aid_s[0]],
                      " pos: (",
                      args_s[0]["screen"] % self.env.screen_space[1],
                      ",",
                      args_s[0]["screen"] // self.env.screen_space[1],
                      ") pos_prob: ",
                      args["screen"][0][args_s[0]["screen"]],
                      end='')
            else:
                print("action: ",
                      aid_s[0],
                      " action_prob: ",
                      filtered_a[0][aid_s[0]],
                      end='')

        return aid_s, args_s, v_s, []

    '''
    def step_policy(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, pos_s = self.sess.run([self.model.policy_a, self.model.value, self.model.policy_pos],
            {self.model.screen: screen, self.model.minimap: minimap, self.model.ns: ns, self.model.AVAIL_ACTION: _kwargs["available_actions"]})

        if _kwargs.get('print_log', False):
            print ("action: ", a_s[0], " pos: (", pos_s[0] % 64, ", ", pos_s[0] // 64, ")")
        # print (np.shape(a_s))
        # input()

        return a_s, pos_s, v_s, [] 
    '''

    def step_epsilon(self, screen, minimap, ns, *_args, **_kwargs):
        a_s, v_s, args = self.sess.run(
            [
                self.model.base_action_softmax, self.model.value,
                self.model.args
            ], {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })

        available_actions = _kwargs["available_actions"]
        filtered_a = np.multiply(a_s, available_actions)
        filtered_a /= np.sum(filtered_a, axis=1, keepdims=True)

        aid_s, args_s = [], []
        for idx in range(np.shape(filtered_a)[0]):
            aid = None
            if np.random.uniform() < self.epsilon:
                available_act_ids = np.nonzero(
                    _kwargs["available_actions"][idx])[0]
                aid = np.random.choice(available_act_ids)
                # print ("Random action:", aid)
            else:
                aid = np.random.choice(len(filtered_a[idx, :]),
                                       p=filtered_a[idx, :])
            aid_s.append(aid)
            temp_args = dict()
            # initialize all arguments to -1
            for k, v in args.items():
                temp_args[k] = -1
            # only sample needed arguments
            for arg in actions.FUNCTIONS[aid].args:
                temp_args[arg.name] = np.random.choice(len(
                    args[arg.name][idx]),
                                                       p=args[arg.name][idx])
            args_s.append(temp_args)

        if _kwargs.get('print_log', False):
            if args_s[0]["screen"] != -1:
                print("action: ",
                      aid_s[0],
                      " action_prob: ",
                      filtered_a[0][aid_s[0]],
                      " pos: (",
                      args_s[0]["screen"] % self.env.screen_space[1],
                      ",",
                      args_s[0]["screen"] // self.env.screen_space[1],
                      ") pos_prob: ",
                      args["screen"][0][args_s[0]["screen"]],
                      end='')
            else:
                print("action: ",
                      aid_s[0],
                      " action_prob: ",
                      filtered_a[0][aid_s[0]],
                      end='')

        return aid_s, args_s, v_s, []

    def train(self, lr, screen, minimap, ns, states, rewards, masks, acts,
              use_spatial_actions, available_actions, pos, values, args,
              args_used):
        advs = rewards - values
        td_map = {
            self.model.screen: screen,
            self.model.minimap: minimap,
            self.model.ns: ns,
            self.model.acts: acts,
            # self.model.avail_actions: available_actions,
            self.model.act_mask: available_actions,
            self.model.advs: advs,
            self.model.rewards: rewards,
            self.model.lr: lr
        }
        for act_type in actions.TYPES:
            td_map[self.model.act_args[act_type.name]] = args[act_type.name]
            td_map[self.model.act_args_used[act_type.name]] = args_used[
                act_type.name]

        _, pg_loss, neglogpac, entropy, vf_loss = self.sess.run([
            self.model._train, self.model.pg_loss, self.model.neglogpac,
            self.model.entropy, self.model.vf_loss
        ], td_map)
        print(" pg_loss: ", pg_loss, " entropy: ", entropy, " vf_loss: ",
              vf_loss)

    def value(self, screen, minimap, ns, *_args, **_kwargs):
        v = self.sess.run(
            self.model.value, {
                self.model.screen: screen,
                self.model.minimap: minimap,
                self.model.ns: ns
            })
        return v

    def save_model(self, epoch):
        ps = self.sess.run(self.model.params)
        # make_path(save_path)
        open("%s%d.checkpoint" % (self.log_path, epoch), "w+")
        joblib.dump(ps, "%s/%d.checkpoint" % (self.log_path, epoch))

    def remake_env(self, num_cpu):
        self.env.close()
        self.env = SubprocVecEnv(
            [self.env_function(i) for i in range(num_cpu)])
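
For context, the n-step return targets in training_process come from discount_with_dones, presumably baselines' A2C helper; below is an equivalent sketch that accumulates discounted rewards backwards and resets the running return at episode boundaries.

def discount_with_dones(rewards, dones, gamma):
    # Walk the rollout backwards; done=1 zeroes the bootstrapped return.
    discounted, running = [], 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        running = reward + gamma * running * (1.0 - done)
        discounted.append(running)
    return discounted[::-1]


# e.g. discount_with_dones([1, 1, 1], [0, 0, 1], 0.99) -> [2.9701, 1.99, 1.0]
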
Example #14
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            visualize=True,
                            screen_size_px=(16, 16),
                            minimap_size_px=(16, 16)) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "deepq-4way"):

        with sc2_env.SC2Env(map_name="CollectMineralGas",
                            step_mul=step_mul,
                            screen_size_px=(32, 32),
                            minimap_size_px=(32, 32),
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Example #15
def main():
    FLAGS(sys.argv)

    steps = 0  #Test steps

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if FLAGS.lr == 0:
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if FLAGS.algorithm == "deepq-4way":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "deepq":
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif FLAGS.algorithm == "a2c":
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if FLAGS.log == "tensorboard":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif FLAGS.log == "stdout":
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if FLAGS.algorithm == "deepq":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=16, minimap=16))
        # temp solution - sc2_env.Agent(sc2_env.Race.terran) might be too restricting
        # We need this change because sc2 now requires specifying players.
        with sc2_env.SC2Env(
                map_name="Simple64",
                players=[
                    sc2_env.Agent(race=sc2_env.Race.terran),
                    sc2_env.Agent(race=sc2_env.Race.terran)
                ],
                #players=[sc2_env.Agent(sc2_env.Race.terran),sc2_env.Agent(sc2_env.Race.terran)],
                step_mul=step_mul,
                visualize=True,
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            acts = deepq_nexus_wars.learn(
                env,
                q_func=model,
                num_actions=16,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_callback)

            agent = random_agent.RandomAgent()
            run_loop.run_loop([agent], env, steps)

            acts[0].save("mineral_shards_x.pkl")
            acts[1].save("mineral_shards_y.pkl")

    elif FLAGS.algorithm == "deepq-4way":

        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(map_name="Simple64",
                            players=[
                                sc2_env.Agent(race=sc2_env.Race.terran),
                                sc2_env.Agent(race=sc2_env.Race.terran)
                            ],
                            step_mul=step_mul,
                            agent_interface_format=AGENT_INTERFACE_FORMAT,
                            visualize=True) as env:

            model = cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                               hiddens=[256],
                               dueling=True)

            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif FLAGS.algorithm == "a2c":

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)
Example #16
def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_cpu : %s" % FLAGS.num_cpu)
    print("lr : %s" % FLAGS.lr)

    logdir = "tensorboard"
    if (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, FLAGS.lr, start_time)
    elif (FLAGS.algorithm == "acktr"):
        logdir = "tensorboard/mineral/%s/%s_num%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.num_cpu, FLAGS.lr,
            start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):

        with sc2_env.SC2Env("CollectMineralShards",
                            step_mul=step_mul,
                            visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)

            act = deepq_mineral_shards.learn(env,
                                             q_func=model,
                                             num_actions=64,
                                             lr=1e-3,
                                             max_timesteps=20000000,
                                             buffer_size=10000,
                                             exploration_fraction=0.5,
                                             exploration_final_eps=0.01,
                                             train_freq=4,
                                             learning_starts=10000,
                                             target_network_update_freq=1000,
                                             gamma=0.99,
                                             prioritized_replay=True,
                                             callback=deepq_callback)
            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "acktr"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        # def make_env(rank):
        #   # env = sc2_env.SC2Env(
        #   #   "CollectMineralShards",
        #   #   step_mul=step_mul)
        #   # return env
        #   #env.seed(seed + rank)
        #   def _thunk():
        #     env = sc2_env.SC2Env(
        #         map_name=FLAGS.map,
        #         step_mul=step_mul,
        #         visualize=True)
        #     #env.seed(seed + rank)
        #     if logger.get_dir():
        #      env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
        #     return env
        #   return _thunk

        # agents = [Agent()
        #           for _ in range(num_cpu)]
        #
        # for agent in agents:
        #   time.sleep(1)
        #   agent.daemon = True
        #   agent.start()

        # agent_controller = AgentController(agents)

        #set_global_seeds(seed)
        env = SubprocVecEnv(FLAGS.num_cpu, FLAGS.map)

        policy_fn = CnnPolicy
        acktr_disc.learn(policy_fn,
                         env,
                         seed,
                         total_timesteps=num_timesteps,
                         nprocs=FLAGS.num_cpu,
                         ent_coef=0.1,
                         callback=acktr_callback)
Example #17
    def remake_env(self, num_cpu):
        self.env.close()
        self.env = SubprocVecEnv(
            [self.env_function(i) for i in range(num_cpu)])

def main():
    FLAGS(sys.argv)

    print("algorithm : %s" % FLAGS.algorithm)
    print("timesteps : %s" % FLAGS.timesteps)
    print("exploration_fraction : %s" % FLAGS.exploration_fraction)
    print("prioritized : %s" % FLAGS.prioritized)
    print("dueling : %s" % FLAGS.dueling)
    print("num_agents : %s" % FLAGS.num_agents)
    print("lr : %s" % FLAGS.lr)

    if (FLAGS.lr == 0):
        FLAGS.lr = random.uniform(0.00001, 0.001)

    print("random lr : %s" % FLAGS.lr)

    lr_round = round(FLAGS.lr, 8)

    logdir = "tensorboard"

    if (FLAGS.algorithm == "deepq-4way"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "deepq"):
        logdir = "tensorboard/mineral/%s/%s_%s_prio%s_duel%s_lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps, FLAGS.exploration_fraction,
            FLAGS.prioritized, FLAGS.dueling, lr_round, start_time)
    elif (FLAGS.algorithm == "a2c"):
        logdir = "tensorboard/mineral/%s/%s_n%s_s%s_nsteps%s/lr%s/%s" % (
            FLAGS.algorithm, FLAGS.timesteps,
            FLAGS.num_agents + FLAGS.num_scripts, FLAGS.num_scripts,
            FLAGS.nsteps, lr_round, start_time)

    if (FLAGS.log == "tensorboard"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[TensorBoardOutputFormat(logdir)])

    elif (FLAGS.log == "stdout"):
        Logger.DEFAULT \
          = Logger.CURRENT \
          = Logger(dir=None,
                   output_formats=[HumanOutputFormat(sys.stdout)])

    if (FLAGS.algorithm == "deepq"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(  # interface.feature_layer.resolution and interface.feature_layer.minimap_resolution
            feature_dimensions=sc2_env.Dimensions(screen=32,
                                                  minimap=32)  # 16 16
            # feature_dimensions = sc2_env.Dimensions(screen=32, minimap=32)  # 16 16
        )
        with sc2_env.SC2Env(
                map_name="CollectMineralShards",
                step_mul=step_mul,  # how fast the game advances; roughly a human player's effective actions per second
                visualize=True,
                # screen_size_px=(16, 16),
                # minimap_size_px=(16, 16)) as env:
                agent_interface_format=AGENT_INTERFACE_FORMAT) as env:

            model = deepq.models.cnn_to_mlp(  # This model takes an observation as input and returns values for all actions; note how it is used in deepq_mineral_shards.learn.
                convs=[(16, 8, 4), (32, 4, 2)],
                hiddens=[256],
                dueling=True)  # (number of filters, kernel size, stride)
            # convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], hiddens=[512], dueling=True)  # (number of filters, kernel size, stride)
            act = deepq_mineral_shards.learn(  # train the model and save it
                # act = deepq_ActSeparate.learn(  # train the model and save it
                # act=deepq_actSeparateWith4Directions.learn(
                # act = deepq_actionGroup_4way.learn(
                # act = deep_DiffActInSameTime.learn(
                env,
                q_func=model,
                num_actions=4,  #default 16  num_actions=256   3  4
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_actSeparateWith4Directions_callback
            )  #deepq_callback; deepq_ActSeperate_callback  ;   deepq_actSeparateWith4Directions_callback  deep_DiffActInSameTime_callback
            act.save(
                "mineral_shards.pkl"
            )  # after all training steps, save the trained model to mineral_shards.pkl for use by enjoy_mineral_shards.py

    elif (FLAGS.algorithm == "deepq-4way"):
        AGENT_INTERFACE_FORMAT = sc2_env.AgentInterfaceFormat(
            feature_dimensions=sc2_env.Dimensions(screen=32, minimap=32))
        with sc2_env.SC2Env(  #
                map_name="CollectMineralShards",
                step_mul=step_mul,
                # screen_size_px=(32, 32),
                # minimap_size_px=(32, 32),
                save_replay_episodes=2,
                replay_dir="D:/StarCraft II/StarCraft II/video",
                agent_interface_format=AGENT_INTERFACE_FORMAT,
                visualize=True) as env:

            model = deepq.models.cnn_to_mlp(convs=[(16, 8, 4), (32, 4, 2)],
                                            hiddens=[256],
                                            dueling=True)
            # model = deepq.models.mlp(hiddens=[256,128,4])
            act = deepq_mineral_4way.learn(
                env,
                q_func=model,
                num_actions=4,
                lr=FLAGS.lr,
                max_timesteps=FLAGS.timesteps,
                buffer_size=10000,
                exploration_fraction=FLAGS.exploration_fraction,
                exploration_final_eps=0.01,
                train_freq=4,
                learning_starts=10000,
                target_network_update_freq=1000,
                gamma=0.99,
                prioritized_replay=True,
                callback=deepq_4way_callback)

            act.save("mineral_shards.pkl")

    elif (FLAGS.algorithm == "a2c"):

        num_timesteps = int(40e6)

        num_timesteps //= 4

        seed = 0

        env = SubprocVecEnv(FLAGS.num_agents + FLAGS.num_scripts,
                            FLAGS.num_scripts, FLAGS.map)

        policy_fn = CnnPolicy
        a2c.learn(policy_fn,
                  env,
                  seed,
                  total_timesteps=num_timesteps,
                  nprocs=FLAGS.num_agents + FLAGS.num_scripts,
                  nscripts=FLAGS.num_scripts,
                  ent_coef=0.5,
                  nsteps=FLAGS.nsteps,
                  max_grad_norm=0.01,
                  callback=a2c_callback)