Example #1
def _data_stream_from_replay_buffer(replay_buffer,
                                    batch_size,
                                    drop_remainder=False):
    data_iterator = iter(replay_buffer)
    batch_samples = []
    try:
        while True:
            for _ in range(batch_size):
                next_el = next(data_iterator)
                batch_samples.append(next_el)
            yield data.nested_stack(batch_samples)
            batch_samples = []
    except StopIteration:
        if not drop_remainder and len(batch_samples) > 0:
            yield data.nested_stack(batch_samples)
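For orientation, a minimal consumption sketch for the stream above. It assumes `data` is `alpacka.data`, that the replay buffer is any iterable of transitions (plain dicts of NumPy arrays here), and that `nested_stack` stacks corresponding leaves along a new leading axis:

import numpy as np

# Hypothetical buffer of 10 transitions, each a dict of array leaves.
replay_buffer = [
    {'observation': np.zeros(4), 'reward': np.float32(1.0)}
    for _ in range(10)
]

for batch in _data_stream_from_replay_buffer(replay_buffer, batch_size=4):
    # Each leaf gains a leading batch dimension: (4, 4) for observations,
    # (4,) for rewards; the last batch holds the 2 leftover samples since
    # drop_remainder=False.
    print(batch['observation'].shape, batch['reward'].shape)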
Example #2
    def batched_request(self):
        """Batches requests and returns batched request."""
        if self._batched_request is not None:
            return self._batched_request

        # Request used as a filler for coroutines that have already
        # finished.
        filler = next(x for x in self._requests if x is not None)
        # Fill with 0s for easier debugging.
        filler = data.nested_map(np.zeros_like, filler)

        # Substitute the filler for Nones.
        self._requests = [
            x if x is not None else filler for x in self._requests
        ]

        def assert_not_scalar(x):
            assert np.array(x).shape, (
                'All arrays in a PredictRequest must be at least rank 1.')

        data.nested_map(assert_not_scalar, self._requests)

        def flatten_first_2_dims(x):
            return np.reshape(x, (-1, ) + x.shape[2:])

        # Stack instead of concatenate to ensure that all requests have
        # the same shape.
        self._batched_request = data.nested_stack(self._requests)
        # (n_agents, n_requests, ...) -> (n_agents * n_requests, ...)
        self._batched_request = data.nested_map(flatten_first_2_dims,
                                                self._batched_request)
        return self._batched_request
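The final reshape above merges the agent and request axes after stacking. A quick shape illustration of the assumed layout (the dimensions are invented for exposition):

import numpy as np

x = np.zeros((4, 3, 7))                  # (n_agents, n_requests, ...)
y = np.reshape(x, (-1,) + x.shape[2:])
assert y.shape == (12, 7)                # (n_agents * n_requests, ...)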
Example #3
def construct_episodes(actions, rewards, **kwargs):
    """Constructs episodes from nested lists of actions and rewards.

    Args:
        actions (list): Actions for each episode, for example:
        [
            [a00, a01, a02, ...], # Actions in the first episode.
            [a10, a11, a12, ...], # Actions in the second episode.
            ...
        ]
        rewards (list): Rewards for each episode, for example:
        [
            [r00, r01, r02, ...], # Rewards in the first episode.
            [r10, r11, r12, ...], # Rewards in the second episode.
            ...
        ]
        **kwargs (dict): Keyword arguments passed to Episode.

    Returns:
        list of Episodes where:
         - Transition observations and next observations are set to None.
         - Done flag is True only for the last transition in the episode.
         - Episode.return_ is calculated as an undiscounted sum of rewards.
    """
    episodes = []
    for acts, rews in zip(actions, rewards):
        transitions = [
            data.Transition(None, act, rew, False, None, {}, {})
            for act, rew in zip(acts[:-1], rews[:-1])
        ]
        transitions.append(
            data.Transition(None, acts[-1], rews[-1], True, None, {}, {}))
        transition_batch = data.nested_stack(transitions)
        episodes.append(data.Episode(transition_batch, sum(rews), **kwargs))
    return episodes
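A minimal usage sketch for the helper above, assuming it behaves as documented; the action and reward values are invented for illustration:

episodes = construct_episodes(
    actions=[[0, 1, 0], [2, 2]],
    rewards=[[0.0, 0.0, 1.0], [0.5, 0.5]],
)
assert episodes[0].return_ == 1.0  # Undiscounted sum of the episode's rewards.
assert episodes[1].return_ == 1.0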
Example #4
    def sample(self, batch_size):
        batch, idxs, is_weights = self.memory.sample(batch_size)
        x, y = data.nested_stack(batch)
        is_weights_per_output = {
            output_name: np.expand_dims(is_weights, axis=-1)
            for output_name in y.keys()
        }
        idxs = np.expand_dims(idxs, axis=-1)
        return x, y, idxs, is_weights_per_output
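A quick shape check for the importance-weight handling above: `np.expand_dims(..., axis=-1)` turns per-sample weights of shape (batch_size,) into (batch_size, 1) so they can broadcast against per-output targets. A tiny illustration:

import numpy as np

is_weights = np.ones(32)                          # (batch_size,)
per_output = np.expand_dims(is_weights, axis=-1)
assert per_output.shape == (32, 1)                # Broadcastable against targets.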
Example #5
def construct_episodes(actions, rewards):
    """Constructs episodes from actions and rewards nested lists."""
    episodes = []
    for acts, rews in zip(actions, rewards):
        transitions = [
            # TODO(koz4k): Initialize using kwargs.
            data.Transition(None, act, rew, False, None, {})
            for act, rew in zip(acts[:-1], rews[:-1])
        ]
        transitions.append(
            data.Transition(None, acts[-1], rews[-1], True, None, {}))
        transition_batch = data.nested_stack(transitions)
        episodes.append(data.Episode(transition_batch, sum(rews)))
    return episodes
Example #6
    def batched_request(self):
        """Batches requests and returns batched request."""
        if self._batched_request is not None:
            return self._batched_request

        data.nested_map(_PredictionRequestBatcher._assert_not_scalar,
                        self._requests)

        # Stack instead of concatenate to ensure that all requests have
        # the same shape.
        batched_request_content = data.nested_stack(
            [request.content for request in self._requests])
        # (n_agents, n_requests, ...) -> (n_agents * n_requests, ...)
        batched_request_content = data.nested_map(
            _PredictionRequestBatcher._flatten_first_2_dims,
            batched_request_content)
        self._batched_request = Request(self._request_type,
                                        batched_request_content)
        return self._batched_request
Example #7
    def __call__(self, observation, model):
        del observation

        init_state = model.clone_state()

        def step_and_rewind(action):
            (observation, reward, done, _) = model.step(action)
            model.restore_state(init_state)
            return (observation, reward, done)

        (observations, rewards, dones) = data.nested_stack([
            step_and_rewind(action)
            for action in space_utils.element_iter(model.action_space)
        ])
        # Run the network to predict values for children.
        values = yield observations
        # (batch_size, 1) -> (batch_size,)
        values = np.reshape(values, -1)
        # Compute the final qualities, masking out the "done" states.
        child_qualities = list(rewards + self._discount * values * (1 - dones))
        prior = _uniform_prior(len(child_qualities))
        return list(zip(child_qualities, prior))
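The method above is a coroutine in the yield-based network-request style used throughout these examples: it yields a batch of child observations and expects the predicted values to be sent back. A minimal driver sketch, where `value_network` is a hypothetical callable returning an array of shape (n_actions, 1):

def rate_children(agent, model, value_network):
    coroutine = agent(observation=None, model=model)
    observations = next(coroutine)            # Batch of child observations.
    try:
        coroutine.send(value_network(observations))
    except StopIteration as e:
        return e.value                        # List of (quality, prior) pairs.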
Example #8
    def solve(self, env, epoch=None, init_state=None, time_limit=None):
        """Solves a given environment using OnlineAgent.act().

        Args:
            env (gym.Env): Environment to solve.
            epoch (int): Current training epoch or None if no training.
            init_state (object): Reset the environment to this state.
                If None, then do normal gym.Env.reset().
            time_limit (int or None): Maximum number of steps to make on the
                solved environment. None means no time limit.

        Yields:
            Network-dependent: A stream of Network inputs requested for
            inference.

        Returns:
            data.Episode: Episode object containing a batch of collected
            transitions and the return for the episode.
        """
        yield from super().solve(env, epoch, init_state, time_limit)

        self._epoch = epoch

        model_env = env

        if time_limit is not None:
            # Add the TimeLimitWrapper _after_ passing the model env to the
            # agent, so the states cloned/restored by the agent do not contain
            # the number of steps made so far - this would break state lookup
            # in some Agents.
            env = envs.TimeLimitWrapper(env, time_limit)

        if init_state is None:
            # Model-free case...
            full_observation = env.reset()
            observation = np.concatenate([
                full_observation['observation'],
                full_observation['desired_goal']
            ],
                                         axis=-1)
        else:
            # Model-based case... restore_state() is assumed to return the
            # observation directly; use it as both the full and the flattened
            # observation so the transition loop below has a value to store.
            full_observation = env.restore_state(init_state)
            observation = full_observation

        yield from self.reset(model_env, observation)

        transitions = []
        done = False
        info = {}
        places = {tuple(observation.flatten())}
        while not done:
            # Forward network prediction requests to BatchStepper.
            (action, agent_info) = yield from self.act(observation)
            (full_next_observation, reward, done, info) = env.step(action)
            next_observation = np.concatenate([
                full_next_observation['observation'],
                full_next_observation['desired_goal']
            ],
                                              axis=-1)
            places.add(tuple(next_observation.flatten()))

            transitions.append(
                data.Transition(
                    observation=full_observation,
                    action=action,
                    reward=reward,
                    done=done,
                    next_observation=full_next_observation,
                    agent_info=agent_info,
                ))
            full_observation = full_next_observation
            observation = next_observation

        return_ = sum(transition.reward for transition in transitions)
        transitions = self.postprocess_transitions(transitions)

        solved = info['solved'] if 'solved' in info else None
        truncated = (info['TimeLimit.truncated']
                     if 'TimeLimit.truncated' in info else None)
        transition_batch = data.nested_stack(transitions)

        info = {'move_diversity': len(places)}
        return data.Episode(
            transition_batch=transition_batch,
            return_=return_,
            solved=solved,
            truncated=truncated,
            info=info,
        )
Example #9
    def predict(self, inputs):
        result = [network.predict(inputs) for network in self._networks]
        stacked_result = data.nested_stack(result, axis=-1)
        return stacked_result
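Example #9 stacks per-network predictions along the last axis, so an ensemble of K networks each returning an array of shape (batch, 1) is assumed to produce a (batch, 1, K) result. A plain NumPy illustration of that per-leaf behaviour:

import numpy as np

preds = [np.full((8, 1), i, dtype=np.float32) for i in range(3)]  # 3 members.
stacked = np.stack(preds, axis=-1)      # What nested_stack(..., axis=-1) is
assert stacked.shape == (8, 1, 3)       # assumed to do on every array leaf.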
Example #10
    def solve(self, env, epoch=None, init_state=None, time_limit=None):
        """Solves a given environment using OnlineAgent.act().

        Args:
            env (gym.Env): Environment to solve.
            epoch (int): Current training epoch or None if no training.
            init_state (object): Reset the environment to this state.
                If None, then do normal gym.Env.reset().
            time_limit (int or None): Maximum number of steps to make on the
                solved environment. None means no time limit.

        Yields:
            Network-dependent: A stream of Network inputs requested for
            inference.

        Returns:
            data.Episode: Episode object containing a batch of collected
            transitions and the return for the episode.
        """
        yield from super().solve(env, epoch, init_state, time_limit)

        self._epoch = epoch

        model_env = env

        if time_limit is not None:
            # Add the TimeLimitWrapper _after_ passing the model env to the
            # agent, so the states cloned/restored by the agent do not contain
            # the number of steps made so far - this would break state lookup
            # in some Agents.
            env = envs.TimeLimitWrapper(env, time_limit)

        if init_state is None:
            # Model-free case...
            observation = env.reset()
        else:
            # Model-based case...
            observation = env.restore_state(init_state)

        yield from self.reset(model_env, observation)

        for callback in self._callbacks:
            callback.on_episode_begin(env, observation, epoch)

        transitions = []
        done = False
        info = {}
        while not done:
            # Forward network prediction requests to BatchStepper.
            (action, agent_info) = yield from self.act(observation)
            (next_observation, reward, done, info) = env.step(action)

            for callback in self._callbacks:
                callback.on_real_step(agent_info, action, next_observation,
                                      reward, done)

            transitions.append(
                data.Transition(
                    observation=observation,
                    action=action,
                    reward=reward,
                    done=done,
                    next_observation=next_observation,
                    agent_info=agent_info,
                ))
            observation = next_observation

        for callback in self._callbacks:
            callback.on_episode_end()

        transitions = self.postprocess_transitions(transitions)

        return_ = sum(transition.reward for transition in transitions)
        solved = info['solved'] if 'solved' in info else None
        truncated = (info['TimeLimit.truncated']
                     if 'TimeLimit.truncated' in info else None)
        transition_batch = data.nested_stack(transitions)
        additional_info = info[
            'additional_info'] if 'additional_info' in info else None
        return data.Episode(transition_batch=transition_batch,
                            return_=return_,
                            solved=solved,
                            truncated=truncated,
                            additional_info=additional_info)
Example #11
    def solve(self, env, epoch=None, init_state=None, time_limit=None):
        yield from super().solve(env, epoch, init_state, time_limit)

        self._epoch = epoch

        model_env = env

        if time_limit is not None:
            env = envs.TimeLimitWrapper(env, time_limit)

        if init_state is None:
            observation = env.reset()
        else:
            observation = env.restore_state(init_state)

        yield from self.reset(model_env, observation)

        for callback in self._callbacks:
            callback.on_episode_begin(env, observation, epoch)

        transitions = []
        done = False
        info = {}
        while not done:

            (action, agent_info) = yield from self.act(observation)
            (next_observation, reward, done, info) = env.step(action)

            for callback in self._callbacks:
                callback.on_real_step(agent_info, action, next_observation,
                                      reward, done)

            transitions.append(
                data.Transition(
                    observation=observation,
                    action=action,
                    reward=reward,
                    done=done,
                    next_observation=next_observation,
                    agent_info=agent_info,
                ))
            observation = next_observation

        for callback in self._callbacks:
            callback.on_episode_end()

        transitions = self.postprocess_transitions(transitions)

        return_ = sum(transition.reward for transition in transitions)
        solved = info['solved'] if 'solved' in info else None
        truncated = (info['TimeLimit.truncated']
                     if 'TimeLimit.truncated' in info else None)
        transition_batch = data.nested_stack(transitions)
        action_space_size = space.max_size(model_env.action_space)
        return data.Episode(transition_batch=transition_batch,
                            return_=return_,
                            solved=solved,
                            truncated=truncated,
                            action_space_size=action_space_size)
Example #12
from alpacka.data import nested_stack
# Silence TensorFlow logging before the TF-dependent imports below.
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
import gin
import numpy as np
from own_testing.one_side_splendor.fifo_input import xxx

from splendor.envs.mechanics.state import State
from splendor.networks.architectures.average_pooling_network import splendor_state_evaluator, GemsEncoder, PriceEncoder, \
    ManyCardsEncoder
from splendor.networks.utils.vectorizer import Vectorizer

gin.parse_config_file(
    '/home/tomasz/ML_Research/alpha_splendor/own_testing/one_side_splendor/network_params.gin'
)

# Build and compile the Splendor state-evaluator network.
x = splendor_state_evaluator(None)
x.compile(loss='mse')

# Vectorize a batch of fresh game states and stack them into network inputs.
vectorizer = Vectorizer()
obs = [vectorizer.state_to_input(State()) for _ in range(15)]
z = nested_stack(obs)
o = x.predict_on_batch(z)

# Inspect the stacked inputs: each leaf array gains a leading batch
# dimension of 15.
for tt in z:
    print(tt.shape)

print(len(z))
print(z)