Example #1
    def test_agent(self):
        self.start_tests(name='getting-started-agent')

        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=50)
        self.finished_test()

        agent = Agent.create(agent='tensorforce',
                             environment=environment,
                             update=64,
                             optimizer=dict(optimizer='adam',
                                            learning_rate=1e-3),
                             objective='policy_gradient',
                             reward_estimation=dict(horizon=20))
        self.finished_test()

        agent = Agent.create(agent='ppo',
                             environment=environment,
                             batch_size=10,
                             learning_rate=1e-3)
        self.finished_test()

        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)
        self.finished_test()
Example #2
    def test_load_performance(self):
        self.start_tests(name='load-performance')

        environment = Environment.create(environment='CartPole-v1')

        agent = Agent.load(
            directory='test/data', filename='ppo-checkpoint', format='checkpoint',
            environment=environment
        )
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = Agent.load(
            directory='test/data', filename='ppo-checkpoint', format='numpy',
            environment=environment
        )
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = Agent.load(
            directory='test/data', filename='ppo-checkpoint', format='hdf5',
            environment=environment
        )
        runner = Runner(agent=agent, environment=environment)
        runner.run(num_episodes=10, use_tqdm=False, evaluation=True)
        self.assertTrue(all(episode_reward == 500.0 for episode_reward in runner.episode_rewards))
        runner.close()
        agent.close()
        self.finished_test()

        agent = tf.saved_model.load(export_dir='test/data/ppo-checkpoint')

        # 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            episode_reward = 0.0
            while not terminal:
                states = np.expand_dims(states, axis=0)  # add batch dimension
                auxiliaries = dict(mask=np.ones(shape=(1, 2), dtype=bool))  # no actions masked out
                actions = agent.act(states, auxiliaries, True)  # third argument: deterministic flag
                actions = actions.numpy().item()  # unwrap the batched tensor into a Python int
                states, terminal, reward = environment.execute(actions=actions)
                episode_reward += reward
            self.assertEqual(episode_reward, 500.0)

        environment.close()
        self.finished_test()
    def create_agent(
        self,
        env,
        n_episodes,
        save_frequency,
        load=False,
    ):
        ########### WORK NEEDED ###########
        ### You need to tweak the Agent ###
        ###################################
        """
        Agent definition. Tweak the Agent's parameters to your convenience

        Use any agent from tensorforce and refer to the documentation for the available hyperparameters :
        -Vanilla Policy Gradient : https://tensorforce.readthedocs.io/en/latest/agents/vpg.html
        -Proximal Policy Optimization : https://tensorforce.readthedocs.io/en/latest/agents/ppo.html
        -Trust-Region Policy Optimization : https://tensorforce.readthedocs.io/en/latest/agents/trpo.html
        -Deterministic Policy Gradient : https://tensorforce.readthedocs.io/en/latest/agents/dpg.html
        -Deep Q-Network : https://tensorforce.readthedocs.io/en/latest/agents/dqn.html
        -Double DQN : https://tensorforce.readthedocs.io/en/latest/agents/double_dqn.html
        -Dueling DQN : https://tensorforce.readthedocs.io/en/latest/agents/dueling_dqn.html
        -Actor-Critic : https://tensorforce.readthedocs.io/en/latest/agents/ac.html
        -Advantage Actor-Critic : https://tensorforce.readthedocs.io/en/latest/agents/a2c.html

        For the network parameters :
        https://tensorforce.readthedocs.io/en/latest/modules/networks.html


        """
        ##### Agent definition ########
        if not load:
            agent = Agent.create(
                agent="ppo",
                batch_size=10,
                exploration=0.01,
                learning_rate=0.00001,
                likelihood_ratio_clipping=0.1,
                # etc...,
                saver=dict(
                    directory="data/checkpoints",
                    frequency=10,  # save checkpoint every 10 updates
                ),  # don't change this
                environment=env,
            )

        else:
            agent = Agent.load(directory="data/checkpoints")
        return agent
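
The docstring above lists several agent families without showing how to configure them. As a minimal, untuned sketch (assuming the environment's actions are discrete, which DQN requires), a DQN agent could be created with the same checkpointing setup; every hyperparameter value here is an illustrative placeholder:

# Illustrative alternative, not part of the original notebook: a DQN agent
# reusing the same saver settings. Values are placeholders, not tuned.
agent = Agent.create(
    agent="dqn",
    environment=env,
    memory=10000,       # replay-memory capacity (assumed)
    batch_size=32,      # timesteps sampled per update (assumed)
    exploration=0.1,    # epsilon-greedy exploration (assumed)
    learning_rate=1e-3,
    saver=dict(directory="data/checkpoints", frequency=10),
)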
    def test_vpg(self):
        self.start_tests(name='VPG')
        agent, environment = self.prepare(agent='vpg',
                                          batch_size=2,
                                          network=dict(type='auto',
                                                       size=8,
                                                       depth=1,
                                                       rnn=2),
                                          baseline=dict(type='auto',
                                                        size=7,
                                                        depth=1,
                                                        rnn=1),
                                          baseline_optimizer=dict(
                                              optimizer='adam',
                                              learning_rate=1e-3))

        self.execute(agent=agent, environment=environment)

        with TemporaryDirectory() as directory:
            agent.save(directory=directory, format='numpy')
            agent = Agent.load(directory=directory)
            states = environment.reset()
            agent.act(states=states)
            agent.close()
            environment.close()
    def test_dpg(self):
        self.start_tests(name='DPG')
        actions = dict(gaussian_action1=dict(type='float',
                                             shape=(1, 2),
                                             min_value=1.0,
                                             max_value=2.0),
                       gaussian_action2=dict(type='float',
                                             shape=(1, ),
                                             min_value=-2.0,
                                             max_value=1.0))
        agent, environment = self.prepare(
            actions=actions,
            agent='dpg',
            memory=100,
            batch_size=4,
            # TODO: no-RNN restriction can be removed
            network=dict(type='auto', size=8, depth=1, rnn=False),
            # TODO: cannot use RNN since value function takes states and actions
            critic=dict(type='auto', size=7, depth=1, rnn=False))

        self.execute(agent=agent, environment=environment)

        with TemporaryDirectory() as directory:
            agent.save(directory=directory, format='numpy')
            agent = Agent.load(directory=directory)
            states = environment.reset()
            agent.act(states=states)
            agent.close()
            environment.close()
def main():
    # Record experience traces
    record_ppo_config(directory='ppo-traces')
    # Alternatively:
    # record_custom_act_function(directory='ppo-traces')
    # write_custom_recording_file(directory='ppo-traces')

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    agent.pretrain(directory='ppo-traces',
                   num_iterations=30,
                   num_traces=1,
                   num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
Example #7
 def setup(self, dbars: Any) -> Any:
     trainingEnvironment = Environment.create(
         environment=TradingEnvironment(dbars),
     )
     self.agent = Agent.create(
         agent=PPOAgent,
         environment=trainingEnvironment,  # alternatively: states, actions, (max_episode_timesteps)
         update=dict(
             unit='timesteps', 
             batch_size=64
         ),
         network="auto",
         ## exploration=?,
         reward_estimation=dict(
             horizon=20
             # discount=?,
         ),
         learning_rate=3e-4,
         # likelihood_ratio_clipping=?,
         # subsampling_fraction=?,
         # multi_step=?
         summarizer=dict(
             directory='./tensorboard/'
         )
     )
     self.agent.save(directory='model-numpy', format='checkpoint', append='episodes')
     ## Train!
     runner = Runner(self.agent, environment=trainingEnvironment)
     runner.run(
         num_episodes=10000,
         save_best_agent='./best-agent/'
     )
     trainingEnvironment.close()
     ## Prepare agent for trading
     self.internal_state = self.agent.initial_internals()
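
The last line above stores the agent's initial internal state so it can be threaded through later decisions. A minimal sketch of how that state might be consumed on each new bar, using the same independent-act pattern seen in the other examples; the on_bar method and the self.get_state helper are hypothetical, not part of the original:

 def on_bar(self, dbars: Any) -> Any:
     # Hypothetical per-step decision: build the observation, act without
     # storing experience, and carry the internal state forward.
     state = self.get_state(dbars)  # hypothetical observation builder
     action, self.internal_state = self.agent.act(
         states=state,
         internals=self.internal_state,
         independent=True,
         deterministic=True,
     )
     return action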
    def createRLagent(self, load=None):
        states_dict = {'type': 'float', 'shape': self.num_states}
        actions_dict = {
            'type': 'float',
            'shape': self.num_actions,
            'min_value': self.input_low,
            'max_value': self.input_high
        }

        agent = Agent.create(
            agent='tensorforce',
            states=states_dict,  # alternatively: states, actions, (max_episode_timesteps)
            actions=actions_dict,
            memory=10000,
            update=dict(unit='timesteps', batch_size=64),
            max_episode_timesteps=self.len_episode,
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=20))

        if load is not None:
            agent.restore(directory=load)

        return agent
def main():
    # Start recording traces after the first 80 episodes -- by then, the agent
    # has solved the environment
    runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                               recorder=dict(directory='ppo-traces',
                                             start=80)),
                    environment='benchmarks/configs/cartpole.json')
    runner.run(num_episodes=100)
    runner.close()

    # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
    # experience of one episode to the agent and subsequently perform one update
    environment = Environment.create(
        environment='benchmarks/configs/cartpole.json')
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment)
    agent.pretrain(directory='ppo-traces',
                   num_iterations=30,
                   num_traces=1,
                   num_updates=1)

    # Evaluate the pretrained agent
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()

    # Close agent and environment
    agent.close()
    environment.close()
 def __init__(self, m: int, n: int, breach_level: float, delta_t: float,
              learning_rate: float, timeout_time: int, save_path: str,
              model: TensorForceModel) -> None:
     super(OurProximalPolicyAgent, self).__init__(m, n)
     self._breach_level = breach_level
     self._delta_t = delta_t
     self.timeout_time = timeout_time
     self._save_path = save_path
     self._model = model
     self._ppo_agent: TensorForceAgent = TensorForceAgent.create(
         agent=OurProximalPolicyAgent._SPECIFICATION_KEY,
         states={
             'type': 'float',
             'shape': (self._m + self._n, ),
             'min_value': 0.0,
             'max_value': self._breach_level + self._delta_t
         },
         actions={
             'type': 'int',
             'shape': (self._m + self._n, ),
             'num_values': OurProximalPolicyAgent._NUM_ACTIONS
         },
         max_episode_timesteps=self.timeout_time,
         batch_size=OurProximalPolicyAgent._BATCH_SIZE,
         learning_rate=learning_rate,
         network=self._model,
         saver=None if not OurProximalPolicyAgent._SAVE else {
             'directory': self._save_path,
             'filename': OurProximalPolicyAgent._SAVE_NAME,
             'frequency': OurProximalPolicyAgent._SAVING_FREQUENCY
         })
Example #11
    def test_quickstart(self):
        self.start_tests(name='quickstart')

        # ====================

        # Create an OpenAI-Gym environment
        environment = Environment.create(environment='gym',
                                         level='CartPole-v1')

        # Create a PPO agent
        agent = Agent.create(
            agent='ppo',
            environment=environment,
            # Automatically configured network
            network='auto',
            # Optimization
            batch_size=10,
            update_frequency=2,
            learning_rate=1e-3,
            subsampling_fraction=0.2,
            optimization_steps=5,
            # Reward estimation
            likelihood_ratio_clipping=0.2,
            discount=0.99,
            estimate_terminal=False,
            # Critic
            critic_network='auto',
            critic_optimizer=dict(optimizer='adam',
                                  multi_step=10,
                                  learning_rate=1e-3),
            # Preprocessing
            preprocessing=None,
            # Exploration
            exploration=0.0,
            variable_noise=0.0,
            # Regularization
            l2_regularization=0.0,
            entropy_regularization=0.0,
            # TensorFlow etc
            name='agent',
            device=None,
            parallel_interactions=1,
            seed=None,
            execution=None,
            saver=None,
            summarizer=None,
            recorder=None)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Start the runner
        runner.run(num_episodes=50)
        runner.close()

        # ====================

        self.finished_test()
def set_up(
    time_limit=100,
    batch_size=16,
    env_version=1,
    seed_count=10,
    max_count=10,
):
    """
    Set up a rushed CartSeed agent with less time than it needs to complete an episode.
    Parameters
    ----------
    time_limit : int, None
        Turn time limit for episode
    batch_size : int
        Batch size for training
    env_version : int in {1, 2}
        Environment version. 1 being ideal time, 2 being time limited
    seed_count : int
        Number of bad seeds
    max_count : int
        Maximum number of samples/scans needed to saturate a bad_seed

    Returns
    -------
    Environment
    Agent
    """
    def default_score(state, *args):
        return 1

    if env_version == 1:
        environment = CartSeed(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    elif env_version == 2:
        environment = CartSeedCountdown(
            seed_count=seed_count,
            bad_seed_count=None,
            max_count=max_count,
            sequential=True,
            revisiting=True,
            bad_seed_reward_f=default_score,
            measurement_time=time_limit,
        )
    else:
        raise NotImplementedError
    env = Environment.create(environment=environment)
    agent = Agent.create(agent="a2c", batch_size=batch_size, environment=env)

    return env, agent
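
A short usage sketch for set_up (assuming Runner is imported from tensorforce.execution; the episode count is arbitrary):

env, agent = set_up(time_limit=100, batch_size=16, env_version=2)
runner = Runner(agent=agent, environment=env)
runner.run(num_episodes=50)  # arbitrary training budget
runner.close()
agent.close()
env.close()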
Example #13
def main():
    # OpenAI-Gym environment initialization
    environment = Environment.create(environment='benchmarks/configs/cartpole.json')

    # PPO agent initialization
    agent = Agent.create(
        agent='benchmarks/configs/ppo.json', environment=environment,
        # Option 1: Saver - save agent periodically every 10 updates
        # and keep the 5 most recent checkpoints
        saver=dict(directory='model-checkpoint', frequency=10, max_checkpoints=5),
    )

    # Runner initialization
    runner = Runner(agent=agent, environment=environment)

    # Training
    runner.run(num_episodes=100)
    runner.close()

    # Option 2: Explicit save
    # ('numpy' and 'hdf5' formats store only the weights, 'checkpoint' stores the full
    # TensorFlow model; the saver argument specified above uses 'checkpoint')
    agent.save(directory='model-numpy', format='numpy', append='episodes')

    # Close agent separately, since created separately
    agent.close()

    # Load agent TensorFlow checkpoint
    agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Load agent NumPy weights
    agent = Agent.load(directory='model-numpy', format='numpy', environment=environment)
    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=100, evaluation=True)
    runner.close()
    agent.close()

    # Close environment separately, since created separately
    environment.close()
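
Besides the 'checkpoint' and 'numpy' formats shown above, Agent.save also accepts a 'saved-model' format, which produces the act-only TensorFlow SavedModel that Example #2 loads via tf.saved_model.load. A minimal sketch, with illustrative directory names:

# Illustrative only: export an act-only SavedModel from the trained checkpoint.
environment = Environment.create(environment='benchmarks/configs/cartpole.json')
agent = Agent.load(directory='model-checkpoint', format='checkpoint', environment=environment)
agent.save(directory='model-saved-model', format='saved-model')
agent.close()
environment.close()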
Example #14
    def prepare(
            self,
            # general environment
            environment=None,
            max_episode_timesteps=None,
            # unit-test environment
            min_timesteps=None,
            states=None,
            actions=None,
            # exclude action types
            exclude_bool_action=False,
            exclude_int_action=False,
            exclude_float_action=False,
            exclude_bounded_action=False,
            # agent
            require_observe=False,
            require_all=False,
            **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            environment = self.environment_spec(
                max_episode_timesteps=max_episode_timesteps,
                min_timesteps=min_timesteps,
                states=states,
                actions=actions,
                exclude_bool_action=exclude_bool_action,
                exclude_int_action=exclude_int_action,
                exclude_float_action=exclude_float_action,
                exclude_bounded_action=exclude_bounded_action)
            environment = Environment.create(environment=environment)

        elif min_timesteps is None:
            if max_episode_timesteps is None:
                max_episode_timesteps = self.__class__.max_episode_timesteps

            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)

        else:
            raise TensorforceError.unexpected()

        agent = self.agent_spec(require_observe=require_observe,
                                require_all=require_all,
                                **agent)

        agent = Agent.create(agent=agent, environment=environment)

        return agent, environment
def main():
    num_parallel = 8
    environment = Environment.create(environment='custom_cartpole',
                                     max_episode_timesteps=500)
    agent = Agent.create(agent='benchmarks/configs/ppo.json',
                         environment=environment,
                         parallel_interactions=num_parallel)

    # Train for 100 episodes
    for episode in range(0, 100, num_parallel):

        # Episode using act and observe
        parallel, states = environment.reset(num_parallel=num_parallel)
        terminal = (parallel < 0)  # all false
        sum_rewards = 0.0
        num_updates = 0
        while not terminal.all():
            actions = agent.act(states=states, parallel=parallel)
            next_parallel, states, terminal, reward = environment.execute(
                actions=actions)
            num_updates += agent.observe(terminal=terminal,
                                         reward=reward,
                                         parallel=parallel)
            parallel = next_parallel
            sum_rewards += reward.sum()
        print('Episode {}: return={} updates={}'.format(
            episode, sum_rewards / num_parallel, num_updates))

    # Evaluate for 100 episodes
    num_parallel = 4
    num_episodes = 100
    sum_rewards = 0.0
    for _ in range(0, num_episodes, num_parallel):
        parallel, states = environment.reset(num_parallel=num_parallel)
        internals = agent.initial_internals()
        internals = [internals for _ in range(num_parallel)]
        terminal = (parallel < 0)  # all false
        while not terminal.all():
            actions, internals = agent.act(states=states,
                                           internals=internals,
                                           independent=True,
                                           deterministic=True)
            _, states, terminal, reward = environment.execute(actions=actions)
            internals = [
                internal for internal, term in zip(internals, terminal)
                if not term
            ]
            sum_rewards += reward.sum()
    print('Mean evaluation return:', sum_rewards / num_episodes)

    # Close agent and environment
    agent.close()
    environment.close()
    def test_readme(self):
        self.start_tests(name='readme')

        environment = UnittestEnvironment(
            states=dict(type='float', shape=(10,)),
            actions=dict(type='int', shape=(), num_values=5),
            min_timesteps=5
        )

        def get_current_state():
            return environment.reset()

        def execute_decision(x):
            return environment.execute(actions=x)[2]

        # ==========

        from tensorforce import Agent

        # Instantiate a Tensorforce agent
        agent = Agent.create(
            agent='tensorforce',
            states=dict(type='float', shape=(10,)),
            actions=dict(type='int', num_values=5),
            max_episode_timesteps=100,
            memory=10000,
            update=dict(unit='timesteps', batch_size=64),
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=20)
        )

        # Retrieve the latest (observable) environment state
        state = get_current_state()  # (float array of shape [10])

        # Query the agent for its action decision
        action = agent.act(states=state)  # (scalar between 0 and 4)

        # Execute the decision and retrieve the current performance score
        reward = execute_decision(action)  # (any scalar float)

        # Pass feedback about performance (and termination) to the agent
        agent.observe(reward=reward, terminal=False)

        # ==========

        agent.close()
        environment.close()
        self.finished_test()
Example #17
    def test_record_and_pretrain(self):
        self.start_tests(name='record-and-pretrain')

        with TemporaryDirectory() as directory:

            # ====================

            # Start recording traces after the first 8 episodes (scaled down
            # from the documentation example)
            runner = Runner(agent=dict(agent='benchmarks/configs/ppo.json',
                                       recorder=dict(directory=directory,
                                                     start=8)),
                            environment='benchmarks/configs/cartpole.json')
            runner.run(num_episodes=10)
            runner.close()

            # Pretrain a new agent on the recorded traces: for 30 iterations, feed the
            # experience of one episode to the agent and subsequently perform one update
            environment = Environment.create(
                environment='benchmarks/configs/cartpole.json')
            agent = Agent.create(agent='benchmarks/configs/ppo.json',
                                 environment=environment)
            agent.pretrain(directory='test/data/ppo-traces',
                           num_iterations=30,
                           num_traces=1,
                           num_updates=1)

            # Evaluate the pretrained agent
            runner = Runner(agent=agent, environment=environment)
            runner.run(num_episodes=10, evaluation=True)
            self.assertTrue(
                all(episode_reward == 500.0
                    for episode_reward in runner.episode_rewards))
            runner.close()

            # Close agent and environment
            agent.close()
            environment.close()

            # ====================

            files = sorted(os.listdir(path=directory))
            self.assertEqual(len(files), 2)
            self.assertTrue(
                all(
                    file.startswith('trace-')
                    and file.endswith('0000000{}.npz'.format(n))
                    for n, file in enumerate(files, start=8)))

        self.finished_test()
Example #18
    def test_act_observe(self):
        self.start_tests(name='act-observe')

        # ====================

        environment = Environment.create(
            environment='benchmarks/configs/cartpole.json')
        agent = Agent.create(agent='benchmarks/configs/ppo.json',
                             environment=environment)

        # Train for 10 episodes
        for episode in range(10):

            # Episode using act and observe
            states = environment.reset()
            terminal = False
            sum_reward = 0.0
            num_updates = 0
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                num_updates += agent.observe(terminal=terminal, reward=reward)
                sum_reward += reward
            print('Episode {}: return={} updates={}'.format(
                episode, sum_reward, num_updates))

        # Evaluate for 10 episodes
        sum_rewards = 0.0
        for _ in range(10):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               independent=True,
                                               deterministic=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward
        print('Mean evaluation return:', sum_rewards / 10.0)

        # Close agent and environment
        agent.close()
        environment.close()

        # ====================

        self.finished_test()
    def test_execution(self):
        self.start_tests(name='getting-started-execution')

        runner = Runner(agent='test/data/agent.json',
                        environment=dict(environment='gym', level='CartPole'),
                        max_episode_timesteps=10)
        runner.run(num_episodes=10)
        runner.run(num_episodes=5, evaluation=True)
        runner.close()
        self.finished_test()

        # Create agent and environment
        environment = Environment.create(
            environment='test/data/environment.json', max_episode_timesteps=10)
        agent = Agent.create(agent='test/data/agent.json',
                             environment=environment)

        # Train for 10 episodes
        for _ in range(10):
            states = environment.reset()
            terminal = False
            while not terminal:
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        # Evaluate for 5 episodes
        sum_rewards = 0.0
        for _ in range(5):
            states = environment.reset()
            internals = agent.initial_internals()
            terminal = False
            while not terminal:
                actions, internals = agent.act(states=states,
                                               internals=internals,
                                               evaluation=True)
                states, terminal, reward = environment.execute(actions=actions)
                sum_rewards += reward

        print('Mean evaluation return:', sum_rewards / 5.0)

        # Close agent and environment
        agent.close()
        environment.close()

        self.finished_test()
Example #20
    def test_masking(self):
        # FEATURES.MD
        self.start_tests(name='masking')

        agent = Agent.create(agent=self.agent_spec(
            states=dict(type='float', shape=(10, )),
            actions=dict(type='int', shape=(), num_values=3)))

        states = dict(
            state=np.random.random_sample(size=(10,)),  # state (default name: "state")
            action_mask=[True, False, True]  # mask as '[ACTION-NAME]_mask' (default name: "action")
        )
        action = agent.act(states=states)
        assert action != 1
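
As the comment notes, the mask key follows the '[ACTION-NAME]_mask' convention. A hedged sketch of the same test with a named action, assuming the agent_spec helper accepts a named action spec (the 'direction' action is illustrative):

        # Illustrative: with a named action, the mask key becomes 'direction_mask'.
        agent = Agent.create(agent=self.agent_spec(
            states=dict(type='float', shape=(10,)),
            actions=dict(direction=dict(type='int', shape=(), num_values=3))))
        states = dict(
            state=np.random.random_sample(size=(10,)),
            direction_mask=[True, False, True]  # masks the 'direction' action
        )
        action = agent.act(states=states)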
Example #21
def create_agent(param_grid, i, directory, environment):
    return Agent.create(
        agent="ppo",
        environment=environment,
        # Automatically configured network
        network=dict(
            type=param_grid["network"],
            size=param_grid["size"],
            depth=param_grid["depth"],
        ),
        # Optimization
        batch_size=param_grid["batch_size"],
        update_frequency=param_grid["update_frequency"],
        learning_rate=param_grid["learning_rate"],
        subsampling_fraction=param_grid["subsampling_fraction"],
        optimization_steps=param_grid["optimization_steps"],
        # Reward estimation
        likelihood_ratio_clipping=param_grid["likelihood_ratio_clipping"],
        discount=param_grid["discount"],
        estimate_terminal=param_grid["estimate_terminal"],
        # Critic
        critic_network="auto",
        critic_optimizer=dict(
            optimizer="adam",
            multi_step=param_grid["multi_step"],
            learning_rate=param_grid["learning_rate_critic"],
        ),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=param_grid["exploration"],
        variable_noise=param_grid["variable_noise"],
        # Regularization
        l2_regularization=param_grid["l2_regularization"],
        entropy_regularization=param_grid["entropy_regularization"],
        # TensorFlow etc
        name="agent_" + str(i),
        device=None,
        parallel_interactions=5,
        seed=124,
        execution=None,
        recorder=dict(directory=directory, frequency=1000),
        summarizer=None,
        saver=dict(directory=directory, filename="agent_" + str(i)),
    )
Example #22
    def createRLagent(self, load):
        states_dict = {'type': 'float', 'shape': self.num_states}
        actions_dict = {
            'type': 'float',
            'shape': self.num_actions,
            'min_value': self.input_low,
            'max_value': self.input_high
        }

        return Agent.create(
            agent='dqn',
            states=states_dict,  # alternatively: states, actions, (max_episode_timesteps)
            actions=actions_dict,
            memory=10000,
            exploration=0.75,
            max_episode_timesteps=self.len_episode,
        )
Example #23
    def prepare(self, environment=None, states=None, actions=None, **agent):
        """
        Generic unit-test preparation.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            environment = Environment.create(environment=environment)

        else:
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=self.__class__.max_episode_timesteps)

        agent = self.agent_spec(**agent)

        agent = Agent.create(agent=agent, environment=environment)

        return agent, environment
 def __init__(self, m: int, n: int, breach_level: float, delta_t: float,
              learning_rate: float, use_gradient_clipping: bool,
              save_path: str, model: TensorForceModel) -> None:
     super(OurTensorForceAgent, self).__init__(m, n)
     self._breach_level = breach_level
     self._delta_t = delta_t
     self._save_path = save_path
     self._model = model
     self._tensor_force_agent: TensorForceAgent = TensorForceAgent.create(
         agent=OurTensorForceAgent._SPECIFICATION_KEY,
         states={
             'type': 'float',
             'shape': (self._m + self._n, ),
             'min_value': 0.0,
             'max_value': self._breach_level + self._delta_t
         },
         actions={
             'type': 'int',
             'shape': (self._m + self._n, ),
             'num_values': OurTensorForceAgent._NUM_ACTIONS
         },
         memory=OurTensorForceAgent._MEMORY,
         update={
             'unit': 'timesteps',
             'batch_size': OurTensorForceAgent._BATCH_SIZE
         },
          optimizer={
              'type': OurTensorForceAgent._OPTIMIZER_NAME,
              'learning_rate': learning_rate,
              'clipnorm': (OurTensorForceAgent._OPTIMIZER_GRADIENT_CLIP_THRESHOLD
                           if use_gradient_clipping else None)
          },
         policy=self._model,
         objective=OurTensorForceAgent._OBJECTIVE,
         exploration=OurTensorForceAgent._EXPLORATION_RATE,
         reward_estimation={'horizon': OurTensorForceAgent._REWARD_HORIZON},
         saver=None if not OurTensorForceAgent._SAVE else {
             'directory': self._save_path,
              'filename': OurTensorForceAgent._SAVE_NAME,
             'frequency': OurTensorForceAgent._SAVING_FREQUENCY
         })
Example #25
    def test_readme(self):
        self.start_tests(name='readme')

        # ====================

        from tensorforce import Agent, Environment

        # Pre-defined or custom environment
        environment = Environment.create(environment='gym',
                                         level='CartPole',
                                         max_episode_timesteps=500)

        # Instantiate a Tensorforce agent
        agent = Agent.create(
            agent='tensorforce',
            environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
            memory=1000,
            update=dict(unit='timesteps', batch_size=64),
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=20))

        # Train for a single episode (scaled down from the README's 300)
        for _ in range(1):

            # Initialize episode
            states = environment.reset()
            terminal = False

            while not terminal:
                # Episode timestep
                actions = agent.act(states=states)
                states, terminal, reward = environment.execute(actions=actions)
                agent.observe(terminal=terminal, reward=reward)

        agent.close()
        environment.close()

        # ====================

        self.finished_test()
Example #26
def main():
    # Setup
    interactive = 0
    size = 4
    brd = Board(size, graphics=0)
    rand_ag = RandomAgent()

    if interactive == 1:
        brd.start_interactive()

    agent = Agent.create(agent='tensorforce',
                         environment=Board,
                         update=64,
                         objective='policy_gradient',
                         reward_estimation=dict(horizon=20))

    runner = Runner(agent=agent, environment=Board, max_episode_timesteps=500)

    runner.run(num_episodes=200)
Example #27
def training_example(num_episodes: int, max_episode_timesteps: int):
    # Instantiate the environment (run the CARLA simulator before doing this!)
    env = CARLAEnvironment(debug=True)

    # Create your own agent (here is just an example)
    agent = Agent.create(agent='ppo',
                         environment=env,
                         max_episode_timesteps=max_episode_timesteps,
                         batch_size=1)

    # Training loop (a custom env.train loop is used here instead of a Runner)
    # `weights_dir` and `record_dir` are `None` to prevent saving and recording
    env.train(agent=agent,
              num_episodes=num_episodes,
              max_episode_timesteps=max_episode_timesteps,
              weights_dir=None,
              record_dir=None)

    pygame.quit()
Example #28
    def createRLagent(self, load=None):
        states_dict = {'type': 'float', 'shape': self.inputSize}

        if self.binary: outType = 'bool'
        else: outType = 'float'

        actions_dict = {'type': 'bool', 'shape': 1}

        agent = Agent.create(agent='dqn',
                             states=states_dict,
                             actions=actions_dict,
                             max_episode_timesteps=1,
                             exploration=0.05,
                             memory=10000)

        if load is not None:
            agent.restore(directory=load)

        return agent
Example #29
    def initialize_agent(self):
        # Set up information about the boost pads now that the game is active and the info is available
        self.boost_pad_tracker.initialize_boosts(self.get_field_info())
        if MODEL is not None:
            max_time = 10
            frames_per_sec = 20
            max_timesteps = RLEnvironment.get_max_timesteps(max_time, frames_per_sec)
            self.env = Environment.create(
                environment=KickoffEnvironment,
                max_episode_timesteps=max_timesteps,
                max_time=max_time,
                message_throttle=20,
                frames_per_sec=frames_per_sec,
                input_exclude=[
                    InputOptions.BALL_POSITION_REL,
                    InputOptions.BALL_DIRECTION,
                    InputOptions.CAR_POSITION_REL,
                    InputOptions.CAR_VELOCITY_MAG,
                ],
                output_exclude=[
                    OutputOptions.BOOST,
                    OutputOptions.STEER,
                    OutputOptions.E_BRAKE,
                    OutputOptions.THROTTLE,
                    OutputOptions.ROLL,
                ]
            )

            directory = '../learning/training/{0}'.format(MODEL)
            filename = 'agent'
            agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')

            if not os.path.isfile(agent):
                logging_utils.log_warn(os.getcwd(), {})
                raise Exception('Model file doesn\'t exist')
            
            self.agent = Agent.load(
                directory=os.path.abspath(directory),
                environment=self.env,
                format='checkpoint',
            )
            self.env.reset()
Example #30
    def prepare(self, environment=None, states=None, actions=None, **agent):
        """
        Generic unit-test preparation.
        """
        if environment is None:
            environment = self.environment_spec(states=states, actions=actions)
            environment = Environment.create(environment=environment)

        else:
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=self.__class__.max_episode_timesteps)

        agent = self.agent_spec(**agent)

        agent = Agent.create(agent=agent, environment=environment)
        assert agent.__class__.__name__ in ('ConstantAgent', 'RandomAgent') or \
            isinstance(agent.model.get_architecture(), str)

        return agent, environment