Example No. 1
    def __init__(
        self, agent, environment, max_episode_timesteps=None, evaluation_environment=None,
        save_best_agent=None
    ):
        self.is_environment_external = isinstance(environment, Environment)
        self.environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps
        )

        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
            )
            assert self.evaluation_environment.states() == self.environment.states()
            assert self.evaluation_environment.actions() == self.environment.actions()

        self.is_agent_external = isinstance(agent, Agent)
        self.agent = Agent.create(agent=agent, environment=self.environment)
        self.save_best_agent = save_best_agent

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
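A minimal usage sketch for the constructor above, assuming the standard Runner.run()/close() interface shown in the later examples; the agent spec and the gym level 'CartPole-v1' are only illustrative.

from tensorforce.execution import Runner

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),
    environment=dict(environment='gym', level='CartPole-v1'),
    max_episode_timesteps=500,
    # Must expose the same states()/actions() specification as the training
    # environment (see the assertions in the constructor above).
    evaluation_environment=dict(environment='gym', level='CartPole-v1'),
)
runner.run(num_episodes=100)
runner.close()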
Example No. 2
    def __init__(self,
                 agent,
                 environment,
                 evaluation_environment=None,
                 save_best_agent=False):
        # save_best_agent=True overrides the agent's saver configuration (periodic saving is disabled below)
        self.is_environment_external = isinstance(environment, Environment)
        self.environment = Environment.create(environment=environment)

        self.is_eval_environment_external = isinstance(evaluation_environment,
                                                       Environment)
        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment)

        self.save_best_agent = save_best_agent
        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict()
        if self.save_best_agent is True:
            # Disable periodic saving
            assert not self.is_agent_external
            kwargs = dict(saver=dict(seconds=None, steps=None))
        self.agent = Agent.create(agent=agent,
                                  environment=self.environment,
                                  **kwargs)

        # self.global_episodes = self.agent.episodes
        # self.global_timesteps = self.agent.timesteps
        # self.global_updates = self.agent.updates
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
Example No. 3
def test_bad_initialization():
    with pytest.raises(ValueError):
        Environment.create(
            environment=Bollux,
            seed_count=3,
            bad_seed_count=10,
            max_episode_length=100,
        )
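For contrast, a sketch of a presumably valid initialization, assuming the ValueError above is raised because bad_seed_count exceeds seed_count; the keyword values mirror the ones used for this environment elsewhere in these examples (the project-specific Bollux import is not shown).

from tensorforce.environments import Environment

environment = Environment.create(
    environment=Bollux,
    seed_count=10,          # enough seeds ...
    bad_seed_count=3,       # ... for the requested number of bad seeds
    max_episode_length=100,
)
environment.close()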
Example No. 4
    def __init__(self,
                 agent,
                 environments,
                 evaluation_environment=None,
                 save_best_agent=False):
        # save_best_agent=True overrides the agent's saver configuration (periodic saving is disabled below)
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(name='parallel-runner',
                                        argument='environments',
                                        value=environments)
        elif len(environments) == 0:
            raise TensorforceError.value(name='parallel-runner',
                                         argument='environments',
                                         value=environments)

        self.is_environment_external = tuple(
            isinstance(environment, Environment)
            for environment in environments)
        self.environments = tuple(
            Environment.create(environment=environment)
            for environment in environments)

        self.is_eval_environment_external = isinstance(evaluation_environment,
                                                       Environment)
        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment)

        self.save_best_agent = save_best_agent
        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=len(environments))
        if not self.is_agent_external and self.save_best_agent:
            # save_best_agent takes over saving, so disable the agent's periodic saver
            # (keep parallel_interactions rather than overwriting the whole kwargs dict)
            kwargs['saver'] = dict(seconds=None, steps=None)
        self.agent = Agent.create(agent=agent,
                                  environment=self.environments[0],
                                  **kwargs)
        if not self.agent.model.is_initialized:
            self.agent.initialize()

        # self.global_episodes = self.agent.episodes
        # self.global_timesteps = self.agent.timesteps
        # self.global_updates = self.agent.updates
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
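A sketch of how this parallel runner might be instantiated; the ParallelRunner import path, the agent spec, and the gym level are assumptions for illustration, and the run()/close() calls assume the interface used by the other runners in these examples.

from tensorforce.execution import ParallelRunner  # assumed import path

environments = [dict(environment='gym', level='CartPole-v1') for _ in range(4)]
runner = ParallelRunner(
    agent=dict(agent='a2c', batch_size=10),
    environments=environments,
    save_best_agent=False,
)
runner.run(num_episodes=1000)
runner.close()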
Example No. 5
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=500,
    )

    agent = Agent.create(
        agent="dqn",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='tanh'),
            dict(type='dense', size=32, activation='tanh')
        ],
        environment=bad_seeds_environment,
        batch_size=256,
        memory=int(10**7),
        exploration=0.15,
        summarizer=dict(
            directory="training_data/agent_02_env_02/summaries",
            labels="all",
            frequency=100  # store values every 100 timesteps
        ))

    return bad_seeds_environment, agent
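The (environment, agent) pair returned by set_up() can then be driven with a Runner, following the same pattern as the training scripts in the later examples; the episode count is illustrative.

from tensorforce.execution import Runner

bad_seeds_environment, agent = set_up()
runner = Runner(agent=agent, environment=bad_seeds_environment)
runner.run(num_episodes=10000)

bad_seeds_environment.close()
agent.close()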
Example No. 6
def set_up():
    tensorflow_settings()
    env = Environment.create(environment=CartSeed01,
                             seed_count=10,
                             bad_seed_count=3,
                             max_count=20)

    agent = Agent.create(
        agent="a2c",
        batch_size=10000,
        horizon=50,
        discount=0.97,
        l2_regularization=0.1,
        variable_noise=0.5,
        environment=env,
        summarizer=dict(
            directory="training_data/a2c_cartseed/summaries",
            labels="all",
            frequency=10,
        ),
        # saver=dict(
        #     directory='saved_models/agent_04_env_04_1000/checkpoints',
        #     frequency=600  # save checkpoint every 600 seconds (10 minutes)
        # ),
    )
    return env, agent
Example No. 7
    def __init__(self,
                 environment: 'TradingEnvironment',
                 agent_spec: any,
                 save_best_agent: bool = False,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner should automatically save the best agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(agent=agent_spec,
                                   environment=self._environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=save_best_agent)
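A sketch of how a strategy built on this constructor might be created; the class name TensorforceTradingStrategy and the trading_environment instance are hypothetical, and only the argument names come from the signature above.

agent_spec = dict(agent='ppo', batch_size=10)

strategy = TensorforceTradingStrategy(   # hypothetical class wrapping the __init__ above
    environment=trading_environment,     # an existing `TradingEnvironment` instance
    agent_spec=agent_spec,
    save_best_agent=False,
    max_episode_timesteps=500,           # picked up via **kwargs above
)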
Example No. 8
def runEnv():
    environment = Environment.create(
        environment=CustomEnvironment, max_episode_timesteps=500
    )
    agent = Agent.create(agent='a2c', environment=environment, batch_size=10, learning_rate=1e-3)

    # Train for 2000 episodes
    for _ in range(2000):
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 1000 episodes
    sum_rewards = 0.0
    for _ in range(1000):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
Example No. 9
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a DQN agent
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        # memory=100,
        # # Optimization
        # batch_size=10, update_frequency=2, learning_rate=1e-3,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
        recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=10000)
    runner.close()
Example No. 10
    def prepare(self,
                environment=None,
                timestep_range=None,
                states=None,
                actions=None,
                exclude_bool_action=False,
                exclude_int_action=False,
                exclude_float_action=False,
                exclude_bounded_action=False,
                require_observe=False,
                require_all=False,
                **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            if states is None:
                states = deepcopy(self.__class__.states)

            if actions is None:
                actions = deepcopy(self.__class__.actions)
                if exclude_bool_action or self.__class__.exclude_bool_action:
                    actions.pop('bool_action')
                if exclude_int_action or self.__class__.exclude_int_action:
                    actions.pop('int_action')
                if exclude_float_action or self.__class__.exclude_float_action:
                    actions.pop('float_action')
                if exclude_bounded_action or self.__class__.exclude_bounded_action:
                    actions.pop('bounded_action')

            if timestep_range is None:
                timestep_range = self.__class__.timestep_range

            environment = UnittestEnvironment(states=states,
                                              actions=actions,
                                              timestep_range=timestep_range)

        elif timestep_range is not None:
            raise TensorforceError.unexpected()

        environment = Environment.create(environment=environment)

        for key, value in self.__class__.agent.items():
            if key not in agent:
                agent[key] = value

        if self.__class__.require_all or require_all:
            config = None
        elif self.__class__.require_observe or require_observe:
            config = dict(api_functions=['reset', 'act', 'observe'])
        else:
            config = dict(api_functions=['reset', 'act'])

        agent = Agent.create(agent=agent,
                             environment=environment,
                             config=config)

        return agent, environment
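A sketch of how a concrete test might call prepare(); the test method name, the require_observe flag, and the interaction loop are assumptions modelled on the act/observe pattern used in the getting-started example below.

    def test_act_observe(self):
        agent, environment = self.prepare(require_observe=True)

        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        agent.close()
        environment.close()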
Example No. 11
    def __init__(self, environment: 'TradingEnvironment', agent_spec: any,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner should automatically save the best agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)
        self._save_best_agent = kwargs.get('save_best_agent', False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(
            agent=agent_spec,
            environment=self._environment,
            summarizer=dict(
                directory='data/summaries',
                labels=['graph', 'losses',
                        'rewards'],  # list of labels, or 'all'
                frequency=100  # store values every 100 timesteps
                # (infrequent update summaries every update; other configurations possible)
            ),
        )

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=self._save_best_agent)
Example No. 12
    def __init__(self, in_dim, n_action, rl, train):
        super().__init__()
        self.make_in_port('observation', in_dim)
        self.make_in_port('reward', 1)
        self.make_in_port('done', 1)
        self.make_out_port('action', 1)
        self.make_in_port('token_in', 1)
        self.make_out_port('token_out', 1)
        self.n_action = n_action  # number of action choices
        self.results['action'] = np.array([np.random.randint(n_action)])
        self.model = None
        self.env_type = "MotorEnv"
        self.token = 0
        self.prev_actions = 0
        self.init = True
        self.in_dim = in_dim
        self.rl = rl
        if rl:
            self.env = Environment.create(
                environment=MotorComponent.MotorEnv,
                max_episode_timesteps=train["episode_count"] * train["max_steps"],
                n_action=n_action,
                obs_dim=in_dim,
                parent=self)
            self.env.reset()
            self.agent = Agent.create(agent=train['rl_agent'],
                                      environment=self.env)
Example No. 13
def main():

    bad_seeds_environment = Environment.create(environment=BadSeeds03,
                                               seed_count=10,
                                               bad_seed_count=3,
                                               max_episode_length=100)

    agent = Agent.create(
        agent="a2c",
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_01_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
Example No. 14
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
Example No. 15
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeedsSkinny,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="a2c",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu')
        ],
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        #exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/a2c_dense_skinny/summaries",
            # list of labels, or 'all'
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    return bad_seeds_environment, agent
Example No. 16
def main():

    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,     # changed from 20 to 100 for agent_03
        exploration=0.05,  # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
Example No. 17
    def test_quickstart(self):
        self.start_tests(name='quickstart')

        # ====================

        # Create an OpenAI-Gym environment
        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')

        # Create a PPO agent
        agent = Agent.create(
            agent='ppo',
            environment=environment,
            # Automatically configured network
            network='auto',
            # Optimization
            batch_size=10,
            update_frequency=2,
            learning_rate=1e-3,
            subsampling_fraction=0.2,
            optimization_steps=5,
            # Reward estimation
            likelihood_ratio_clipping=0.2,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=dict(optimizer='adam',
                                  multi_step=10,
                                  learning_rate=1e-3),
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Start the runner
        runner.run(num_episodes=50, use_tqdm=False)
        runner.close()

        # ====================

        self.finished_test()
Example No. 18
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=self._save_best_agent)
Example No. 19
def test_initialization():
    bad_seeds_01_env = Environment.create(environment=BadSeeds01,
                                          seed_count=10,
                                          bad_seed_count=3,
                                          max_episode_length=100)

    assert bad_seeds_01_env.state.shape == (100, 10)
    assert len(bad_seeds_01_env.bad_seeds) == 3
    assert len(bad_seeds_01_env.good_seeds) == 7
Example No. 20
    def _create_env(self) -> Environment:
        """Creates a tensorforce Environment encapsulating the underlying gym environment given in self.model_config"""
        self.log_api(
            'Environment.create',
            f'(environment="gym", level={self.model_config.original_env_name})'
        )
        result = Environment.create(environment='gym',
                                    level=self.model_config.gym_env_name)
        return result
Example No. 21
def set_up(
    time_limit=100,
    batch_size=16,
    env_version=1,
    seed_count=10,
    max_count=10,
):
    """
    Set up a rushed CartSeed agent with less time than it needs to complete an episode.
    Parameters
    ----------
    time_limit : int, None
        Turn time limit for episode
    batch_size : int
        Batch size for training
    env_version : int in {1, 2}
        Environment version. 1 being ideal time, 2 being time limited
    seed_count : int
        Number of bad seeds
    max_count : int
        Maximum number of samples/scans needed to saturate a bad_seed

    Returns
    -------
    Environment
    Agent
    """
    def default_score(state, *args):
        return 1

    if env_version == 1:
        environment = CartSeed(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    elif env_version == 2:
        environment = CartSeedCountdown(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    else:
        raise NotImplementedError
    env = Environment.create(environment=environment)
    agent = Agent.create(agent="a2c", batch_size=batch_size, environment=env)

    return env, agent
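A usage sketch for the time-limited variant, reusing the Runner pattern from the other examples; the argument values and episode counts are illustrative.

from tensorforce.execution import Runner

env, agent = set_up(time_limit=50, env_version=2)
runner = Runner(agent=agent, environment=env)
runner.run(num_episodes=1000)
runner.run(num_episodes=100, evaluation=True)

env.close()
agent.close()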
Example No. 22
def runEnv():
    environment = Environment.create(environment=CustomEnvironment,
                                     max_episode_timesteps=500)
    agent = Agent.create(
        agent='a2c',
        environment=environment,
        batch_size=10,
        learning_rate=1e-3,
        exploration=0.01,  # tried without this at first
        variable_noise=0.05,
        # variable_noise=0.01 bad?
        l2_regularization=0.1,
        entropy_regularization=0.2,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100,  # store values every 100 timesteps
        ))

    # Train for CustomEnvironment.trainingEps episodes
    for _ in range(CustomEnvironment.trainingEps):
        print("Episode:  ", _)
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    # Evaluate for CustomEnvironment.testingEps episodes
    sum_rewards = 0.0
    for _ in range(CustomEnvironment.testingEps):
        print("Episode:  ", _ + CustomEnvironment.trainingEps)
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
        print("bad seeds: ", CustomEnvironment.badseedsFinal)
    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
Example No. 23
    def test_environment(self):
        self.start_tests(name='getting-started-environment')

        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=500)
        self.finished_test()

        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')
        self.finished_test()

        environment = Environment.create(
            environment='test/data/environment.json',
            max_episode_timesteps=500)
        self.finished_test()

        environment = Environment.create(
            environment='test.data.custom_env.CustomEnvironment',
            max_episode_timesteps=10)
        self.finished_test()
Example No. 24
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')
    network = _create_network_specification((100, ))

    # Create a dueling DQN agent
    agent = Agent.create(agent='dueling_dqn',
                         environment=environment,
                         network=network)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10000)
    runner.close()
Example No. 25
    def test_getting_started(self):
        from tensorforce.agents import Agent
        from tensorforce.environments import Environment

        # Setup environment
        # (Tensorforce or custom implementation, ideally using the Environment interface)
        environment = Environment.create(
            environment='test/data/environment.json')

        # Create and initialize agent
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)
        agent.initialize()

        # Reset agent and environment at the beginning of a new episode
        agent.reset()
        states = environment.reset()
        terminal = False

        # Agent-environment interaction training loop
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

        # ====================

        # Agent-environment interaction evaluation loop
        while not terminal:
            actions = agent.act(states=states, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)

        # ====================

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        from tensorforce.execution import Runner

        # Tensorforce runner utility
        runner = Runner(agent='test/data/agent.json',
                        environment='test/data/environment.json')

        # Run training
        runner.run(num_episodes=50, use_tqdm=False)

        # Close runner
        runner.close()

        self.finished_test()
Example No. 26
def main():

    bad_seeds_environment = Environment.create(environment=Bollux,
                                               seed_count=10,
                                               bad_seed_count=3,
                                               max_episode_length=100)

    # 20200820-223031
    # 20200820-233243

    # batch_size 1000 does not get smarter or dumber
    # batch_size 100 20200821-095410 gets dumber
    # try batch size 10000 !

    agent = Agent.create(
        agent="a2c",
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        #exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_04_bollux_1000000/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_04_bollux_1000000/checkpoints',
            frequency=6000  # save checkpoint every 6000 seconds (100 minutes)
        ),
    )

    # this is the batch_size = 10000 version
    # I hope it is the last env 04
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000000)
    #for i in range(100):
    #    print("running 10000 episodes")
    #    runner.run(num_episodes=10000)
    #    print("saving the agent")
    #    directory = Path(f"saved_models/agent_04_env_04_1000000/10000_{i}/checkpoints")
    #    if directory.exists():
    #        directory.rmdir()
    #    directory.mkdir(parents=True, exist_ok=True)
    #    agent.save(directory=str(directory), format="numpy")

    bad_seeds_environment.close()
    agent.close()
Example No. 27
def main():
    from tensorforce.environments import Environment
    from bad_seeds.simple.bad_seeds_02 import BadSeeds02
    from bad_seeds.simple.tf_utils import tensorflow_settings
    tensorflow_settings()
    env = Environment.create(environment=BadSeeds02,
                             seed_count=10,
                             bad_seed_count=3,
                             history_block=2,
                             max_episode_timesteps=300)
    agent = DQN(env.states()['shape'],
                env.environment.seed_count,
                exploration=0.25)
    agent.play(env, 5000, verbose=True)
Example No. 28
def default_cartseed():
    """
    This environment should lock into place the basics, with 10 seeds, each requiring
    Returns
    -------
    Environment
    """
    env = CartSeed(
        seed_count=10,
        bad_seed_count=3,
        frozen_order=True,
    )
    env = Environment.create(environment=env)
    return env
Example No. 29
def test_initialization():
    bad_seeds_03_env = Environment.create(environment=BadSeeds03,
                                          seed_count=10,
                                          bad_seed_count=3,
                                          max_episode_length=100)

    assert bad_seeds_03_env.history_array.shape == (100, 10)
    assert bad_seeds_03_env.state.shape == (7, 10)
    assert len(bad_seeds_03_env.bad_seeds) == 3
    assert len(bad_seeds_03_env.good_seeds) == 7

    measurement_count_per_seed, measurement_count = count_measurements(
        bad_seeds_03_env.history_array)
    assert np.all(measurement_count_per_seed == 3 * np.ones((1, 10)))
    # all seeds have been measured
    assert measurement_count == 10
Example No. 30
    def __init__(self, name='ppo_agent', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.ppo_agent = None
        self.poker_env = Environment.create(environment=env,
                                            max_episode_timesteps=100)
        self.runner = None

        if load_model:
            self.load(load_model)