Example #1
    def __init__(self,
                 env,
                 task,
                 rollout_length,
                 episode_summary_freq,
                 env_render_freq,
                 ep_summary,
                 test=False,
                 policy=None,
                 data_sample_config=None,
                 memory_config=None,
                 test_conditions=None,
                 slowdown_steps=0,
                 global_step_op=None,
                 aux_render_modes=None,
                 _implemented_aux_render_modes=None,
                 name='synchro',
                 log_level=WARNING,
                 **kwargs):
        """

        Args:
            env:                            BTgym environment instance
            task:                           int, runner task id
            rollout_length:                 int
            episode_summary_freq:           int
            env_render_freq:                int
            test:                           legacy, not used
            ep_summary:                     legacy, not used
            policy:                         policy instance to execute
            data_sample_config:             dict, data sampling configuration dictionary
            memory_config:                  dict, replay memory configuration
            test_conditions:                dict or None,
                                            conditions a single experience must meet to be marked as a test one
            slowdown_steps:                 int, number of global iterations to wait per step (chief worker only)
            global_step_op:                 global step tensor, required for the slowdown to take effect
            aux_render_modes:               iterable of str, additional summaries to compute
            _implemented_aux_render_modes:  iterable of str, additional summaries that are implemented
            name:                           str, name scope
            log_level:                      int, logbook.level
        """
        self.env = env
        self.task = task
        self.name = name
        self.rollout_length = rollout_length
        self.episode_summary_freq = episode_summary_freq
        self.env_render_freq = env_render_freq

        self.memory_config = memory_config
        self.policy = policy
        self.data_sample_config = data_sample_config

        self.log_level = log_level
        StreamHandler(sys.stdout).push_application()
        self.log = Logger('{}_Runner_{}'.format(self.name, self.task),
                          level=self.log_level)

        # Aux rendering setup:
        if _implemented_aux_render_modes is None:
            self.implemented_aux_render_modes = []

        else:
            self.implemented_aux_render_modes = _implemented_aux_render_modes

        self.aux_render_modes = []
        if aux_render_modes is not None:
            for mode in aux_render_modes:
                if mode in self.implemented_aux_render_modes:
                    self.aux_render_modes.append(mode)

                else:
                    msg = 'Render mode `{}` is not implemented.'.format(mode)
                    self.log.error(msg)
                    raise NotImplementedError(msg)

        self.log.debug('self.render modes: {}'.format(self.aux_render_modes))

        self.sess = None
        self.summary_writer = None

        self.global_step_op = global_step_op

        if self.task == 0 and slowdown_steps > 0 and self.global_step_op is not None:
            self.log.notice(
                'is slowed down by {} global_iterations/step'.format(
                    slowdown_steps))
            self.slowdown_steps = slowdown_steps

        else:
            self.slowdown_steps = 0

        if test_conditions is None:
            # Default test conditions are: experience comes from test episode, from target domain:
            self.test_conditions = {
                'state': {
                    'metadata': {
                        'type': 1,
                        'trial_type': 1
                    }
                }
            }
        else:
            self.test_conditions = test_conditions

        # Make replay memory:
        if self.memory_config is not None:
            self.memory = self.memory_config['class_ref'](
                **self.memory_config['kwargs'])

        else:
            self.memory = _DummyMemory()

        self.length = 0
        self.local_episode = 0
        self.reward_sum = 0

        self.terminal_end = True

        # Summary averages accumulators:
        self.total_r = []
        self.cpu_time = []
        self.final_value = []
        self.total_steps = []
        self.total_steps_atari = []
        self.info = [None]
        self.pre_experience = None
        self.state = None
        self.context = None

        self.last_action = None
        self.last_reward = None

        # Episode accumulators:
        self.ep_accum = None

        self.log.debug('__init__() done.')
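
Both runner variants fall back to `_DummyMemory()` when no `memory_config` is supplied; that class is not shown in these examples. The stub below is only a guess at the interface the runners rely on (`add`, `is_full`, `sample_uniform`, `sample_priority`), written as a no-op so that rollouts can still be yielded without any off-policy data:

class _DummyMemoryStub:
    """Hypothetical no-op replay memory, mirroring how the runners above call it."""

    def add(self, experience):
        # Discard the experience; nothing is stored.
        pass

    def is_full(self):
        # Always report "full" so the runner's yield condition is satisfied.
        return True

    def sample_uniform(self, sequence_size):
        # No uniform off-policy samples available.
        return None

    def sample_priority(self, exact_size=True):
        # No prioritized replay samples either.
        return None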
Example #2
    def __init__(
        self,
        env,
        task,
        rollout_length,
        episode_summary_freq,
        env_render_freq,
        ep_summary,
        test=False,
        policy=None,
        data_sample_config=None,
        memory_config=None,
        aux_summaries=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
        name='synchro',
        log_level=WARNING,
    ):
        """

        Args:
            env:                    BTgym environment instance
            task:                   int, runner task id
            rollout_length:         int
            episode_summary_freq:   int
            env_render_freq:        int
            test:                   not used
            ep_summary:             not used
            policy:                 policy instance to execute
            data_sample_config:     dict, data sampling configuration dictionary
            memory_config:          dict, replay memory configuration
            aux_summaries:          iterable of str, additional summaries to compute
            name:                   str, name scope
            log_level:              int, logbook.level
        """
        self.env = env
        self.task = task
        self.name = name
        self.rollout_length = rollout_length
        self.episode_summary_freq = episode_summary_freq
        self.env_render_freq = env_render_freq

        self.memory_config = memory_config
        self.policy = policy
        self.data_sample_config = data_sample_config
        self.aux_summaries = aux_summaries
        self.log_level = log_level
        StreamHandler(sys.stdout).push_application()
        self.log = Logger('{}_Runner_{}'.format(self.name, self.task),
                          level=self.log_level)
        self.sess = None
        self.summary_writer = None

        # Make replay memory:
        if self.memory_config is not None:
            self.memory = self.memory_config['class_ref'](
                **self.memory_config['kwargs'])

        else:
            self.memory = _DummyMemory()

        self.length = 0
        self.local_episode = 0
        self.reward_sum = 0

        self.terminal_end = True

        # Summary averages accumulators:
        self.total_r = []
        self.cpu_time = []
        self.final_value = []
        self.total_steps = []
        self.total_steps_atari = []
        self.info = None
        self.pre_experience = None
        self.state = None
        self.context = None
        self.action_reward = None

        # Episode accumulators:
        self.ep_accum = None

        self.norm_image = lambda x: np.round((x - x.min()) / np.ptp(x) * 255)

        self.log.debug('__init__() done.')
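
The `norm_image` lambda defined above linearly rescales an arbitrary array into the 0..255 range for image summaries (note it would divide by zero on a constant array). A quick stand-alone check of what it computes:

import numpy as np

norm_image = lambda x: np.round((x - x.min()) / np.ptp(x) * 255)

x = np.array([-1.0, 0.0, 0.5, 1.0])
print(norm_image(x))  # -> [  0. 128. 191. 255.]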
Example #3
def env_runner(sess, env, policy, task, rollout_length, summary_writer,
               episode_summary_freq, env_render_freq, atari_test, ep_summary,
               memory_config):
    """
    The logic of the thread runner.
    In brief, it continuously runs the policy and, once a rollout reaches
    the configured length, appends all the collected data to the queue.

    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, True for Atari, False for BTgym
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary

    Yields:
        collected data as dictionary of on_policy, off_policy rollouts and episode statistics.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()
    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate(
        [last_action, np.asarray([last_reward])], axis=-1)
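    # `last_action_reward` has length env.action_space.n + 1: the one-hot
    # previous action followed by the previous scalar reward, e.g. for a
    # 4-action space at episode start it is [1, 0, 0, 0, 0.0].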

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    while True:
        terminal_end = False
        rollout = Rollout()

        action, value_, context = policy.act(last_state, last_context,
                                             last_action_reward)

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {
                'episode': local_episode,
                'step': length
            },
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())
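        # Note: each callback receives the runner's entire local namespace via
        # **locals(), so it can derive extra fields from e.g. `state` or `action`.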

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate(
            [last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, value_, context = policy.act(last_state, last_context,
                                                     last_action_reward)

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())
                #if not atari_test:
                #        state = state['model_input']

                # Partially collect next experience:
                experience = {
                    'position': {
                        'episode': local_episode,
                        'step': length
                    },
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                    #'pixel_change': 0 #policy.get_pc_target(state, last_state),
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                # All environment-specific summaries are collected here because
                # only the runner is allowed to interact with the environment.
                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]
                #print('last_episode.metadata:', state['metadata'])

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is non-zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    #print(task, total_r)
                    test_ep_stat = dict(total_r=total_r[-1],
                                        final_value=final_value[-1],
                                        steps=total_steps[-1])
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(total_r=np.average(total_r),
                                           cpu_time=np.average(cpu_time),
                                           final_value=np.average(final_value),
                                           steps=np.average(total_steps))
                        else:
                            # Atari:
                            ep_stat = dict(total_r=np.average(total_r),
                                           steps=np.average(total_steps_atari))
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :]
                            for mode in env.render_modes
                        }
                    else:
                        # Atari:
                        render_stat = dict(
                            render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(
                    state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate(
                    [last_action, np.asarray([last_reward])], axis=-1)

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` or less (if got `terminal`)
        # complete final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray([
                policy.get_value(last_state, last_context, last_action_reward)
            ])

        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test rollout (`type` in metadata is non-zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and memory can be sampled, yield it,
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None
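
The generator above is meant to be driven by a separate thread that places each yielded dictionary on a queue for the training loop, as the ThreadRunner comment near the end suggests. A rough, self-contained sketch of that pattern (here `make_runner_gen` is a placeholder for a fully argument-bound `env_runner(...)` call, not something defined in the example):

import queue
import threading

def drain_runner(runner_gen, rollout_queue):
    """Pull data dicts from the env_runner generator and queue them, blocking when full."""
    for data in runner_gen:
        rollout_queue.put(data, block=True)

rollouts = queue.Queue(maxsize=5)
# make_runner_gen = lambda: env_runner(sess, env, policy, task, rollout_length, ...)
# worker = threading.Thread(target=drain_runner, args=(make_runner_gen(), rollouts), daemon=True)
# worker.start()
# Training loop side: data = rollouts.get(); use data['on_policy'], data['off_policy'], etc.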
Example #4
def VerboseEnvRunnerFn(
        sess,
        env,
        policy,
        task,
        rollout_length,
        summary_writer,
        episode_summary_freq,
        env_render_freq,
        atari_test,
        ep_summary,
        memory_config,
        log,
        aux_summaries=('action_prob', 'value_fn', 'lstm_1_h', 'lstm_2_h'),
):
    """
    More verbose function for the runtime logic of the thread runner.
    Extends per-episode summaries with visualisations of: action probability distribution, value function,
    hidden LSTM state. In its default configuration it is supposed to be used with the stacked_LSTM architecture.


    Args:
        env:                    environment instance
        policy:                 policy instance
        task:                   int
        rollout_length:         int
        episode_summary_freq:   int
        env_render_freq:        int
        atari_test:             bool, True for Atari, False for BTgym
        ep_summary:             dict of tf.summary op and placeholders
        memory_config:          replay memory configuration dictionary
        log:                    logbook logger
        aux_summaries:          list of str, additional summaries to compute

    Yields:
        collected data as dictionary of on_policy, off_policy rollouts, episode statistics and summaries.
    """
    if memory_config is not None:
        memory = memory_config['class_ref'](**memory_config['kwargs'])

    else:
        memory = _DummyMemory()
    # Pass sample config to environment:
    last_state = env.reset(**policy.get_sample_config())
    last_context = policy.get_initial_features(state=last_state)
    length = 0
    local_episode = 0
    reward_sum = 0
    last_action = np.zeros(env.action_space.n)
    last_action[0] = 1
    last_reward = 0.0
    last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

    # Summary averages accumulators:
    total_r = []
    cpu_time = []
    final_value = []
    total_steps = []
    total_steps_atari = []

    # Aux accumulators:
    ep_a_logits = []
    ep_value = []
    ep_context = []

    ep_stat = None
    test_ep_stat = None
    render_stat = None

    norm_image = lambda x: np.round((x - x.min()) / np.ptp(x) * 255)

    if env.data_master is True:
        # Hacky, but we need env.renderer methods to be ready:
        env.renderer.initialize_pyplot()

    while True:
        terminal_end = False
        rollout = Rollout()

        action, logits, value_, context = policy.act(last_state, last_context, last_action_reward)

        ep_a_logits.append(logits)
        ep_value.append(value_)
        ep_context.append(context)

        #log.debug('*: A: {}, V: {}, step: {} '.format(action, value_, length))

        # argmax to convert from one-hot:
        state, reward, terminal, info = env.step(action.argmax())

        # Partially collect first experience of rollout:
        last_experience = {
            'position': {'episode': local_episode, 'step': length},
            'state': last_state,
            'action': action,
            'reward': reward,
            'value': value_,
            'terminal': terminal,
            'context': last_context,
            'last_action_reward': last_action_reward,
        }
        # Execute user-defined callbacks to policy, if any:
        for key, callback in policy.callback.items():
            last_experience[key] = callback(**locals())

        length += 1
        reward_sum += reward
        last_state = state
        last_context = context
        last_action = action
        last_reward = reward
        last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

        for roll_step in range(1, rollout_length):
            if not terminal:
                # Continue adding experiences to rollout:
                action, logits, value_, context = policy.act(last_state, last_context, last_action_reward)

                #log.debug('A: {}, V: {}, step: {} '.format(action, value_, length))

                ep_a_logits.append(logits)
                ep_value.append(value_)
                ep_context.append(context)

                #log.notice('context: {}'.format(context))

                # Argmax to convert from one-hot:
                state, reward, terminal, info = env.step(action.argmax())

                # Partially collect next experience:
                experience = {
                    'position': {'episode': local_episode, 'step': length},
                    'state': last_state,
                    'action': action,
                    'reward': reward,
                    'value': value_,
                    'terminal': terminal,
                    'context': last_context,
                    'last_action_reward': last_action_reward,
                    #'pixel_change': 0 #policy.get_pc_target(state, last_state),
                }
                for key, callback in policy.callback.items():
                    experience[key] = callback(**locals())

                # Bootstrap to complete and push previous experience:
                last_experience['r'] = value_
                rollout.add(last_experience)
                memory.add(last_experience)

                # Housekeeping:
                length += 1
                reward_sum += reward
                last_state = state
                last_context = context
                last_action = action
                last_reward = reward
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)
                last_experience = experience

            if terminal:
                # Finished episode within last taken step:
                terminal_end = True
                # All environment-specific summaries are collected here because
                # only the runner is allowed to interact with the environment.
                # Accumulate values for averaging:
                total_r += [reward_sum]
                total_steps_atari += [length]
                if not atari_test:
                    episode_stat = env.get_stat()  # get episode statistic
                    last_i = info[-1]  # pull most recent info
                    cpu_time += [episode_stat['runtime'].total_seconds()]
                    final_value += [last_i['broker_value']]
                    total_steps += [episode_stat['length']]

                # Episode statistics:
                try:
                    # Was it a test episode (`type` in metadata is non-zero)?
                    if not atari_test and state['metadata']['type']:
                        is_test_episode = True

                    else:
                        is_test_episode = False

                except KeyError:
                    is_test_episode = False

                if is_test_episode:
                    test_ep_stat = dict(
                        total_r=total_r[-1],
                        final_value=final_value[-1],
                        steps=total_steps[-1]
                    )
                else:
                    if local_episode % episode_summary_freq == 0:
                        if not atari_test:
                            # BTgym:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                cpu_time=np.average(cpu_time),
                                final_value=np.average(final_value),
                                steps=np.average(total_steps)
                            )
                        else:
                            # Atari:
                            ep_stat = dict(
                                total_r=np.average(total_r),
                                steps=np.average(total_steps_atari)
                            )
                        total_r = []
                        cpu_time = []
                        final_value = []
                        total_steps = []
                        total_steps_atari = []

                if task == 0 and local_episode % env_render_freq == 0:
                    if not atari_test:
                        # Render environment (chief worker only, and not in Atari test mode):
                        render_stat = {
                            mode: env.render(mode)[None, :] for mode in env.render_modes
                        }
                        # Update renderings with aux:

                        # log.notice('ep_logits shape: {}'.format(np.asarray(ep_a_logits).shape))
                        # log.notice('ep_value shape: {}'.format(np.asarray(ep_value).shape))

                        # Unpack LSTM states:
                        rnn_1, rnn_2 = zip(*ep_context)
                        rnn_1 = [state[0] for state in rnn_1]
                        rnn_2 = [state[0] for state in rnn_2]
                        c1, h1 = zip(*rnn_1)
                        c2, h2 = zip(*rnn_2)
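                        # `c1`/`c2` are the LSTM cell states, `h1`/`h2` the hidden
                        # outputs; only the hidden states are rendered below.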

                        aux_images = {
                            'action_prob':  env.renderer.draw_plot(
                                # data=softmax(np.asarray(ep_a_logits)[:, 0, :] - np.asarray(ep_a_logits).max()),
                                data=softmax(np.asarray(ep_a_logits)[:, 0, :]),
                                title='Episode actions probabilities',
                                figsize=(12, 4),
                                box_text='',
                                xlabel='Backward env. steps',
                                ylabel='R+',
                                line_labels=['Hold', 'Buy', 'Sell', 'Close']
                            )[None, ...],
                            'value_fn': env.renderer.draw_plot(
                                data=np.asarray(ep_value),
                                title='Episode Value function',
                                figsize=(12, 4),
                                xlabel='Backward env. steps',
                                ylabel='R',
                                line_labels=['Value']
                            )[None, ...],
                            #'lstm_1_c': norm_image(np.asarray(c1).T[None, :, 0, :, None]),
                            'lstm_1_h': norm_image(np.asarray(h1).T[None, :, 0, :, None]),
                            #'lstm_2_c': norm_image(np.asarray(c2).T[None, :, 0, :, None]),
                            'lstm_2_h': norm_image(np.asarray(h2).T[None, :, 0, :, None])
                        }

                        render_stat.update(aux_images)

                    else:
                        # Atari:
                        render_stat = dict(render_atari=state['external'][None, :] * 255)

                # New episode:
                last_state = env.reset(**policy.get_sample_config())
                last_context = policy.get_initial_features(state=last_state, context=last_context)
                length = 0
                reward_sum = 0
                last_action = np.zeros(env.action_space.n)
                last_action[0] = 1
                last_reward = 0.0
                last_action_reward = np.concatenate([last_action, np.asarray([last_reward])], axis=-1)

                # reset per-episode accumulators:
                ep_a_logits = []
                ep_value = []
                ep_context = []

                # Increment global and local episode counts:
                sess.run(policy.inc_episode)
                local_episode += 1
                break

        # After rolling `rollout_length` or less (if got `terminal`)
        # complete final experience of the rollout:
        if not terminal_end:
            # Bootstrap:
            last_experience['r'] = np.asarray(
                [policy.get_value(last_state, last_context, last_action_reward)]
            )

        else:
            last_experience['r'] = np.asarray([0.0])

        rollout.add(last_experience)

        # Only training rollouts are added to replay memory:
        try:
            # Was it a test rollout (`type` in metadata is non-zero)?
            if not atari_test and last_experience['state']['metadata']['type']:
                is_test = True

            else:
                is_test = False

        except KeyError:
            is_test = False

        if not is_test:
            memory.add(last_experience)

        # Once we have enough experience and memory can be sampled, yield it,
        # and have the ThreadRunner place it on a queue:
        if memory.is_full():
            data = dict(
                on_policy=rollout,
                off_policy=memory.sample_uniform(sequence_size=rollout_length),
                off_policy_rp=memory.sample_priority(exact_size=True),
                ep_summary=ep_stat,
                test_ep_summary=test_ep_stat,
                render_summary=render_stat,
            )
            yield data

            ep_stat = None
            test_ep_stat = None
            render_stat = None
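
The aux `action_prob` plot above feeds the raw episode logits through `softmax`, which is imported from elsewhere in the package. If a stand-in is needed, a numerically stable version over the last axis (an assumption, in line with the commented-out max-subtraction variant above) could look like:

import numpy as np

def softmax(x, axis=-1):
    """Stable softmax: subtract the per-row max before exponentiating."""
    z = x - x.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

logits = np.array([[2.0, 1.0, 0.1, -1.0]])
print(softmax(logits))  # each row sums to 1.0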