Example #1
    def __init__(self,actions):
        preprocessing_config = [
            {
                "type": "grayscale"
            }
        ]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )

        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions = dict(type='int', num_actions=len(actions)),
            states = dict(type='float', shape=(35, 150, 3)),
            network = network_spec,
            actions_exploration = exploration_config,
            states_preprocessing = preprocessing_config
        )
Example #2
    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent
        self.env = env

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            self.agent = PPOAgent(states=dict(
                type='float', shape=env.observation_space.shape),
                                  actions=actions,
                                  network=[
                                      dict(type='dense', size=64),
                                      dict(type='dense', size=64)
                                  ],
                                  batching_capacity=1000,
                                  step_optimizer=dict(type='adam',
                                                      learning_rate=1e-4))

            self.restore_model_if_exists(self.checkpoint)

        return self.agent
Example #3
    def __init__(self, frame_shape=None, game_inputs=None):

        if frame_shape is None:
            raise SerpentError("A 'frame_shape' tuple kwarg is required...")

        states_spec = {"type": "float", "shape": frame_shape}

        if game_inputs is None:
            raise SerpentError("A 'game_inputs' dict kwarg is required...")

        self.game_inputs = game_inputs
        self.game_inputs_mapping = self._generate_game_inputs_mapping()

        actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

        network_spec = [
            {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
            {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
            {"type": "conv2d", "size": 64, "window": 3, "stride": 1},
            {"type": "flatten"},
            {"type": "dense", "size": 512}
        ]

        self.agent = PPOAgent(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=128,
            scope="ppo",
            summary_spec=None,
            network_spec=network_spec,
            device=None,
            session_config=None,
            saver_spec=None,
            distributed_spec=None,
            discount=0.99,
            variable_noise=None,
            states_preprocessing_spec=None,
            explorations_spec=None,
            reward_preprocessing_spec=None,
            distributions_spec=None,
            entropy_regularization=1e-2,
            batch_size=128,
            keep_last_timestep=True,
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            likelihood_ratio_clipping=None,
            step_optimizer=None,
            optimization_steps=10
        )
Example #4
    def initialize(self,
                   env,
                   parallel_interactions=1,
                   summarizer=None,
                   saver=None):
        from gym import spaces
        from tensorforce.agents import PPOAgent

        self.env = env

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_values': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_values=env.action_space.n)

            self.tf_agent = PPOAgent(
                states=dict(type='float', shape=env.observation_space.shape),
                actions=actions,
                max_episode_timesteps=2000,
                network=[
                    dict(type='dense', size=64),
                    dict(type='dense', size=64)
                ],
                # critic_network=[
                #     dict(type='dense', size=64),
                #     dict(type='dense', size=64)
                # ],
                parallel_interactions=parallel_interactions,
                summarizer=summarizer,
                saver=saver,
                execution={
                    'num_parallel': 64,
                    'type': 'single',
                    'session_config': None,
                    'distributed_spec': None
                },
                batch_size=10)
            # batching_capacity=1000,
            # step_optimizer=dict(type='adam', learning_rate=1e-4))

            return self.tf_agent
        return None
Example #5
def get_ppo_agent():
    return PPOAgent(
        states=dict(type='float', shape=(5, )),
        actions=dict(type='int', num_actions=2),
        network=[
            dict(type='dense', size=20, activation='tanh'),
            dict(type='dense', size=20, activation='tanh'),
        ],
        #batch_size=256,
        # BatchAgent
        #keep_last_timestep=True,
        # PPOAgent
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        optimization_steps=10,
        # Model
        scope='ppo',
        discount=0.99,
        # DistributionModel
        #distributions_spec=None,
        entropy_regularization=0.01,
        # PGModel
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        # PGLRModel
        likelihood_ratio_clipping=0.2,
        #summary_spec=None,
        #distributed_spec=None,
    )
Example #6
    def test_quickstart(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        # Create an OpenAI-Gym environment
        environment = OpenAIGym('CartPole-v1')

        # Create the agent
        agent = PPOAgent(
            states=environment.states(),
            actions=environment.actions(),
            # Automatically configured network
            network='auto',
            # Memory sampling the most recent experiences, with a capacity of 6100 timesteps
            # (6100 > [30 batch episodes] * [200 max timesteps per episode])
            memory=6100,
            # Update every 10 episodes, with a batch of 30 episodes
            update_mode=dict(unit='episodes', batch_size=30, frequency=10),
            # PPO optimizer
            step_optimizer=dict(type='adam', learning_rate=1e-3),
            # PPO multi-step optimization: 10 updates, each based on a third of the batch
            subsampling_fraction=0.33,
            optimization_steps=10,
            # MLP baseline
            baseline_mode='states',
            baseline=dict(type='network', network='auto'),
            # Baseline optimizer
            baseline_optimizer=dict(type='multi_step',
                                    optimizer=dict(type='adam',
                                                   learning_rate=1e-4),
                                    num_steps=5),
            # Other parameters
            discount=0.99,
            entropy_regularization=1e-2,
            gae_lambda=None,
            likelihood_ratio_clipping=0.2)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Function handle called after each finished episode
        def callback(r):
            return float(np.mean(r.episode_rewards[-100:])) <= 180.0

        # Start the runner
        runner.run(num_episodes=1000,
                   max_episode_timesteps=200,
                   callback=callback)
        runner.close()

        if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
            sys.stdout.write('Test failed, exceeding {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=False)
        else:
            sys.stdout.write('Test passed after {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=True)
Example #7
def get_ppo_agent(environment, *args, **kwargs):
    with open('config/cnn_network.json', 'r') as infile:
        network = json.load(infile)

    agent = PPOAgent(
        states=environment.states,
        actions=environment.actions,
        network=network,
        memory={
            "type": "latest",
            "capacity": 40000,
            "include_next_states": False,
        },
        actions_exploration={
            "type": "epsilon_anneal",
            "initial_epsilon": 1.0,
            "final_epsilon": 0.05,
            "timesteps": int(1e7),
        },
        saver={
            "directory": "checkpoint/ppo",
            "seconds": 1800,
        },
    )

    return agent
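The network here is loaded from config/cnn_network.json, which the example does not show. As a rough sketch, the file would hold a Tensorforce layer list in the same format as the inline network specs elsewhere on this page; the exact layers and sizes below are assumptions.

# Hypothetical contents of config/cnn_network.json, written out from Python for
# illustration only; the real file is not part of the example above.
import json

cnn_network = [
    {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
    {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
    {"type": "flatten"},
    {"type": "dense", "size": 256}
]

with open('config/cnn_network.json', 'w') as outfile:
    json.dump(cnn_network, outfile, indent=4)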
Example #8
 def ppo(env):
     return PPOAgent(
         states=dict(type='float', shape=env.state_representation.get_shape()),
         actions=dict(type='int', num_actions=env.env.action_space.N_ACTIONS),
         # Automatically configured network
         #network=dict(type='auto', size=32, depth=2, internal_rnn=True),
         network=[
             dict(type='dense', size=128),
             dict(type='dense', size=128),
             dict(type='dense', size=128)
         ],
         # Update every 5 episodes, with a batch of 10 episodes
         update_mode=dict(unit='episodes', batch_size=10, frequency=5),
         # Memory sampling the most recent experiences, with a capacity of 250000 timesteps
         # (far more than [10 episodes] * [200 max timesteps per episode])
         memory=dict(type='latest', include_next_states=False, capacity=250000),
         discount=0.99, entropy_regularization=0.01,
         # MLP baseline
         baseline_mode='states', baseline=dict(type='mlp', sizes=[32, 32]),
         # Baseline optimizer
         baseline_optimizer=dict(
             type='multi_step', optimizer=dict(type='adam', learning_rate=1e-3), num_steps=5
         ),
         gae_lambda=0.97, likelihood_ratio_clipping=0.2,
         # PPO optimizer
         step_optimizer=dict(type='adam', learning_rate=3e-4), # was -4
         # PPO multi-step optimization: 25 updates, each calculated for 20% of the batch
         subsampling_fraction=0.2, optimization_steps=25
     )
Example #9
    def initialize(self, env, lstm=False):
        from gym import spaces
        from tensorforce.agents import PPOAgent

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions = {
                    str(num): {
                        'type': 'int',
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            network = [
                    dict(type='conv2d', size=10, window=1, activation='relu'),
                    dict(type='conv2d', size=32, window=5, activation='relu'),
                    dict(type='conv2d', size=16, window=3, activation='relu'),
                    dict(type='flatten'),
                    dict(type='dense', size=256, activation='relu')
                ]

            if lstm:
                network.append(dict(type='internal_lstm', size=256))

            return PPOAgent(
                states=dict(type='float', shape=env.observation_space.shape),
                actions=actions,
                network=network,
                batching_capacity=1000,
                step_optimizer=dict(type='adam', learning_rate=1e-4))
        return None
Example #10
    def initialize(self, env):
        from gym import spaces
        from tensorforce.agents import PPOAgent

        if self.algorithm == "ppo":
            if isinstance(env.action_space, spaces.Tuple):
                actions_spec = {str(num): {'type': 'int', 'num_actions': space.n}
                                for num, space in enumerate(env.action_space.spaces)}
            else:
                actions_spec = dict(type='int', num_actions=env.action_space.n)

            return PPOAgent(
                states_spec=dict(type='float', shape=env.observation_space.shape),
                actions_spec=actions_spec,
                network_spec=[
                    dict(type='dense', size=64),
                    dict(type='dense', size=64)
                ],
                batch_size=128,
                step_optimizer=dict(
                    type='adam',
                    learning_rate=1e-4
                )
            )
        return None
Example #11
 def createPPO2Agent():
   # based on: https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
   agent = PPOAgent(
     states=env.states,
     actions=env.actions,
     network=[
       dict(type='dense', size=64),
       dict(type='dense', size=32)
     ],
     # Agent
     states_preprocessing=None,
     actions_exploration=None,
     reward_preprocessing=None,
     # MemoryModel
     update_mode=dict(
         unit='episodes',
         # 10 episodes per update
         batch_size=10,
         # Every 10 episodes
         frequency=10
     ),
     memory=dict(
         type='latest',
         include_next_states=False,
         capacity=5000
     ),
     # DistributionModel
     distributions=None,
     entropy_regularization=0.01,
     # PGModel
     baseline_mode='states',
     baseline=dict(
         type='mlp',
         sizes=[32, 32]
     ),
     baseline_optimizer=dict(
         type='multi_step',
         optimizer=dict(
             type='adam',
             learning_rate=1e-3
         ),
         num_steps=5
     ),
     gae_lambda=0.97,
     # PGLRModel
     likelihood_ratio_clipping=0.2,
     # PPOAgent
     step_optimizer=dict(
         type='adam',
         learning_rate=1e-3
     ),
     subsampling_fraction=0.2,
     optimization_steps=25,
     execution=dict(
         type='single',
         session_config=None,
         distributed_spec=None
     )
   )
   return agent
Example #12
    def __init__(self,
                 observation_space,
                 action_space,
                 directory='./TensorforcePPOAgent/'):

        # Create a Proximal Policy Optimization agent:
        # This agent is restricted to discrete 0/1 actions. To enable continuous actions, change the action type to 'float' and remove 'num_actions'.

        self.agent = PPOAgent(
            states=dict(type='float', shape=observation_space.shape),
            actions=dict(type='int',
                         shape=action_space.shape,
                         num_actions=2,
                         min_value=0,
                         max_value=1),
            # This PPO Agent neural network has two dense hidden layers with 256 nodes.
            network=[
                dict(type='dense', size=256),
                dict(type='dense', size=256),
            ],

            # The agent uses an "Adam" optimizer with a learning rate of .0001
            batching_capacity=1000,
            step_optimizer=dict(type='adam', learning_rate=1e-4))
        self.directory = directory
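The constructor above stores a checkpoint directory but does not show the save/restore calls. Below is a minimal sketch of such helpers, reusing the save_model/restore_model agent methods seen in the other examples on this page; the save and restore method names on the wrapper are hypothetical.

    # Hypothetical helpers for the wrapper class above; save_model/restore_model
    # are the Tensorforce agent methods used elsewhere on this page.
    def save(self):
        self.agent.save_model(directory=self.directory)

    def restore(self):
        self.agent.restore_model(directory=self.directory)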
Example #13
    def __init__(self,
                 observation_space,
                 action_space,
                 directory='./TensorforcePPOAgent/'):
        # Create a Proximal Policy Optimization agent

        self.agent = PPOAgent(states=dict(type='float',
                                          shape=observation_space.shape),
                              actions=dict(type='float',
                                           shape=action_space.shape,
                                           min_value=0,
                                           max_value=1),
                              network=[
                                  dict(type='dense',
                                       size=256,
                                       activation='relu'),
                                  dict(type='dense',
                                       size=128,
                                       activation='relu'),
                                  dict(type='dense',
                                       size=64,
                                       activation='relu'),
                                  dict(type='dense',
                                       size=32,
                                       activation='relu'),
                              ],
                              batching_capacity=1000,
                              step_optimizer=dict(type='adam',
                                                  learning_rate=1e-2))
        self.directory = directory
Example #14
def main():
    env = gym.make('CartPole-v0')

    # (4,)
    print(env.observation_space.shape)
    # [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
    print(env.observation_space.high)
    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.observation_space.low)
    # 2
    print(env.action_space.n)

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'

    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()

                states = observation / 4

                action = agent.act(states=states)

                observation, reward, done, info = env.step(action)

                agent.observe(reward=reward, terminal=done)

                ep_reward += reward

                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=f'{model_dir}/agent')
Example #15
    def __init__(self):

        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12, )),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )
Example #16
    def __init__(self):

        actions = {}
        actions_exp = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
            actions_exp[str(i)] = dict(type='ornstein_uhlenbeck',
                                       sigma=0.1,
                                       mu=0.0,
                                       theta=0.1)

        # A standardize preprocessor is defined here, but immediately disabled (overridden with None) below
        preprocessing_config = [{"type": "standardize"}]

        preprocessing_config = None

        customnet = dict(type=CustomNetwork)
        layerSize = 300
        network_spec = [
            dict(type='dense', size=100),
            dict(type='lstm', size=100)
        ]
        '''
        Alternative spec with an internal LSTM layer:
        network_spec = [
            dict(type='dense', size=100),
            dict(type='internal_lstm', size=100)
        ]
        '''

        network_spec = [
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu'),
            dict(type='dense', size=layerSize, activation='selu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12 + 9, )),
            actions=actions,
            batching_capacity=1000,
            network=network_spec,
            states_preprocessing=preprocessing_config,
            actions_exploration=actions_exp,
            step_optimizer=dict(type='adam', learning_rate=1e-5),
        )
Example #17
    def __init__(self, apikey, agent_id, frames_per_state=1, host=None):

        # PPO agent seems to learn that it needs to speed around the environment to collect rewards
        self._agent = PPOAgent(
            states_spec=dict(type='float', shape=(frames_per_state * 25, )),
            actions_spec=dict(type='float',
                              shape=(3, ),
                              min_value=np.float32(-1.0),
                              max_value=np.float32(1.0)),
            network_spec=[
                dict(type='dense', activation='relu', size=128),
                dict(type='dense', activation='relu', size=128),
            ],
            optimization_steps=5,
            # Model
            scope='ppo',
            discount=0.99,
            # DistributionModel
            distributions_spec=None,
            entropy_regularization=0.01,
            # PGModel
            baseline_mode=None,
            baseline=None,
            baseline_optimizer=None,
            gae_lambda=None,
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            summary_spec=None,
            distributed_spec=None,
            batch_size=2048,
            step_optimizer=dict(type='adam', learning_rate=1e-4))

        self._logger = setup_custom_logger("Controller")

        self._frame_count_per_episode = 0
        self._total_frames = 1
        self._frames_per_state = frames_per_state

        self._client = AsyncClient(apikey, agent_id,
                                   self._train_state_callback, host)

        self._state_stack = StateStack(self._frames_per_state)
Example #18
 def createPPOAgent():
   agent = PPOAgent(
     states = env.states,
     actions = env.actions,
     network=[
       dict(type='dense', size=networkFirstLayer),
       dict(type='dense', size=int((networkFirstLayer*networkLastLayer)**0.5)), # geometric mean of the first and last layer sizes
       dict(type='dense', size=networkLastLayer),
     ],
     step_optimizer=dict(type='adam', learning_rate=1e-2)
   )
   return agent
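createPPOAgent() above relies on module-level names (env, networkFirstLayer, networkLastLayer) that the snippet does not define. Below is a minimal sketch of the assumed setup; the environment, the import path (Tensorforce 0.x style), and the layer sizes are placeholders, not part of the original.

# Assumed module-level setup for createPPOAgent(); values are illustrative only.
from tensorforce.contrib.openai_gym import OpenAIGym

env = OpenAIGym('CartPole-v0')
networkFirstLayer = 64
networkLastLayer = 16

agent = createPPOAgent()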
Example #19
    def test_example(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        passed = 0
        for _ in range(3):

            # Create an OpenAI-Gym environment
            environment = OpenAIGym('CartPole-v0')

            # Network specification for the model
            network_spec = [
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ]

            # Create the agent
            agent = PPOAgent(states_spec=environment.states,
                             actions_spec=environment.actions,
                             network_spec=network_spec,
                             batch_size=4000,
                             step_optimizer=dict(type='adam',
                                                 learning_rate=1e-2),
                             optimization_steps=5,
                             discount=0.99,
                             normalize_rewards=False,
                             entropy_regularization=0.01,
                             likelihood_ratio_clipping=0.2)

            # Initialize the runner
            runner = Runner(agent=agent, environment=environment)

            # Function handle called after each finished episode
            def episode_finished(r):
                # Check whether the mean reward over the last 50 episodes indicates that learning took off
                mean_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or mean_reward < 50.0

            # Start the runner
            runner.run(episodes=2000,
                       max_episode_timesteps=200,
                       episode_finished=episode_finished)

            sys.stdout.write('episodes: {}\n'.format(runner.episode))
            sys.stdout.flush()

            # Test passed if episode_finished handle evaluated to False
            if runner.episode < 2000:
                passed += 1

        sys.stdout.write('==> passed: {}\n'.format(passed))
        sys.stdout.flush()
        self.assertTrue(passed >= 2)
Example #20
    def __init__(self, name, light, price, quantity, avg_cost_estimate):
        # initalize product
        self.name = name

        # initialize state
        self.light = light
        self.quantity = quantity
        self.avg_cost_estimate = avg_cost_estimate  # the approximated cost of each item sold

        self.price = price  # what the price is being set at

        self.history_log = []  # history of product over time

        # initalize agent
        self.agent = PPOAgent(
            states=dict(type='float', shape=(4,)),
            actions=dict(type='int', num_actions=len(PRICE_CHANGES)),
            network=[dict(type='dense', size=4),
                     dict(type='dense', size=4)],
            step_optimizer=dict(type='adam', learning_rate=0.01))
        self.agent.initialize_model()
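The pricing agent above is only constructed. One decision step might look like the sketch below; the state layout, reward signal, and the reprice method name are assumptions, with PRICE_CHANGES being the module-level list of price deltas referenced in the constructor.

    # Hypothetical decision step for the pricing agent above; the state layout,
    # reward, and method name are assumptions, not part of the original example.
    def reprice(self, observed_profit):
        state = [self.light, self.quantity, self.avg_cost_estimate, self.price]
        action = self.agent.act(states=state)    # index into PRICE_CHANGES
        self.price += PRICE_CHANGES[action]      # apply the chosen price delta
        self.agent.observe(reward=observed_profit, terminal=False)
        self.history_log.append((state, action, observed_profit))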
Example #21
def getAgent(shapeIn, shapeOut):

    config = Configuration(batch_size=1,
                           step_optimizer=dict(type='adam',
                                               learning_rate=1e-4))

    # Create a Proximal Policy Optimization agent
    agent = PPOAgent(dict(type='float', shape=shapeIn[0]),
                     dict(type='float', shape=shapeOut[0]), [
                         dict(type='dense', size=64),
                     ], config)

    return agent
Example #22
def create_agent(environment, network_spec):
    return PPOAgent(update_mode=dict(unit='episodes',
                                     batch_size=4,
                                     frequency=4),
                    memory=dict(type='latest',
                                include_next_states=False,
                                capacity=100),
                    step_optimizer=dict(type='adam', learning_rate=1e-3),
                    subsampling_fraction=0.3,
                    optimization_steps=20,
                    states=environment.states,
                    actions=environment.actions,
                    network=network_spec)
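create_agent() expects a Tensorforce environment and a network spec. Below is a minimal usage sketch following the Runner pattern from the other examples on this page; the CartPole environment, import paths (Tensorforce 0.x style), layer sizes, and episode counts are assumptions.

# Hypothetical usage of create_agent() defined above.
from tensorforce.contrib.openai_gym import OpenAIGym
from tensorforce.execution import Runner

environment = OpenAIGym('CartPole-v0')
network_spec = [
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]

agent = create_agent(environment, network_spec)
runner = Runner(agent=agent, environment=environment)
runner.run(episodes=300, max_episode_timesteps=200)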
Example #23
class TensorforceAgent:
    def __init__(self,actions):
        preprocessing_config = [
            {
                "type": "grayscale"
            }
        ]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )

        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions = dict(type='int', num_actions=len(actions)),
            states = dict(type='float', shape=(35, 150, 3)),
            network = network_spec,
            actions_exploration = exploration_config,
            states_preprocessing = preprocessing_config
        )

    def act(self, obs):
        #Cut out only the part needed
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)

        #scipy.misc.imsave('outfile.jpg', frame)

        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except Exception:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal=terminal, reward=reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
Example #24
 def initialize(self, env):
     if self.algorithm == "ppo":
         return PPOAgent(states_spec=dict(
             type='float', shape=env.observation_space.shape),
                         actions_spec=dict(type='int',
                                           num_actions=env.action_space.n),
                         network_spec=[
                             dict(type='dense', size=64),
                             dict(type='dense', size=64)
                         ],
                         batch_size=128,
                         step_optimizer=dict(type='adam',
                                             learning_rate=1e-4))
     return None
Example #25
    def test_example(self):
        passed = 0

        for _ in range(3):
            # Create an OpenAIgym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=256,

                memory=dict(
                    type='prioritized_replay',
                ),
                update_frequency=256,
                first_update=512,

                learning_rate=0.0001,
                optimizer_batch_size=64,
                normalize_rewards=False,
                gae_rewards=False,
                baseline=dict(
                    type="mlp",
                    sizes=[32, 32],
                    epochs=1,
                    update_batch_size=64,
                    learning_rate=0.001
                ),
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Check whether the mean reward over the last 50 episodes indicates that learning took off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #26
    def __init__(self, state_size, env=None, is_eval=False):
        self.state_size = state_size
        self.action_size = 3
        self._memory_size = 1000
        self._memory = deque(maxlen=1000)
        self.inventory = pd.DataFrame(columns=['Price', 'POS', 'Order'])
        self.is_eval = is_eval
        self.learning_rate = env.hyperparameters['learning_rate']
        self.gamma = env.hyperparameters['gamma']
        self.env = env

        PPOAgent.__init__(self,
                           states = dict(type='float', shape=self.state_size.shape),
                           actions = dict(type='int', num_actions=self.action_size),
                           network = env.get_network(),
                           discount = self.gamma,
                           batching_capacity = env.batch_size * 100,
                           actions_exploration = env.exploration)
                           #step_optimizer = self.get_optimizer(),
                           #actions_exploration = self.explo)
                           #update_mode = self._update_mode,
                           #batching_capacity = self._memory_size)

        self._load_model()
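The call to self._load_model() above is not shown in the example. Below is a plausible sketch based on the restore pattern used elsewhere on this page; the checkpoint directory is an assumption.

    # Hypothetical _load_model() for the class above (which subclasses PPOAgent);
    # the directory path is an assumption, not part of the original example.
    def _load_model(self, directory='./checkpoints/'):
        import os
        if os.path.isdir(directory):
            self.restore_model(directory=directory)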
Example #27
    def test_readme(self):
        environment = UnittestEnvironment(states=dict(type='float',
                                                      shape=(10, )),
                                          actions=dict(type='int',
                                                       num_values=5))

        def get_current_state():
            return environment.reset()

        def execute_decision(x):
            return environment.execute(actions=x)[2]

        # Instantiate a Tensorforce agent
        agent = PPOAgent(states=dict(type='float', shape=(10, )),
                         actions=dict(type='int', num_values=5),
                         memory=10000,
                         network='auto',
                         update_mode=dict(unit='episodes', batch_size=10),
                         step_optimizer=dict(type='adam', learning_rate=1e-4))

        # Initialize the agent
        agent.initialize()

        # Retrieve the latest (observable) environment state
        state = get_current_state()  # (float array of shape [10])

        # Query the agent for its action decision
        action = agent.act(states=state)  # (scalar between 0 and 4)

        # Execute the decision and retrieve the current performance score
        reward = execute_decision(action)  # (any scalar float)

        # Pass feedback about performance (and termination) to the agent
        agent.observe(reward=reward, terminal=False)

        agent.close()
        environment.close()
        self.assertTrue(expr=True)
Example #28
class ForwardActor:
    def __init__(self):

        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]

        self.agent = PPOAgent(
            states=dict(type='float', shape=(12, )),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])),
                            axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)

        #actiondict = self.agent.act( np.concatenate([jp,jv],axis=1))
        actiondict = self.agent.act(jp)

        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        #print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
Example #29
def main():
    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(),
        actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling the most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33,
        optimization_steps=10,
        # MLP baseline
        baseline_mode='states',
        baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(type='multi_step',
                                optimizer=dict(type='adam',
                                               learning_rate=1e-4),
                                num_steps=5),
        # Other parameters
        discount=0.99,
        entropy_regularization=1e-2,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200)
    runner.close()
Example #30
    def test_example(self):
        passed = 0

        for _ in range(3):
            # Create an OpenAIgym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=4096,
                gae_lambda=0.97,
                learning_rate=0.001,
                entropy_penalty=0.01,
                epochs=5,
                optimizer_batch_size=512,
                loss_clipping=0.2,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Check whether the mean reward over the last 50 episodes indicates that learning took off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000,
                       max_timesteps=200,
                       episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)