Example no. 1
    def test_dqn_agent(self):
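        # Two-state toy task: action 1 always yields reward 1.0 and action 0 yields 0.0;
        # the test succeeds once the last 100 observed rewards are all 1.0.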
        config = {
            'seed': 10,
            'batch_size': 16,
            'state_shape': (2, ),
            'actions': 2,
            'action_shape': (),
            'update_rate': 1,
            'update_repeat': 4,
            'min_replay_size': 50,
            'memory_capacity': 50,
            "exploration": "epsilon_decay",
            "exploration_param": {
                "epsilon": 1,
                "epsilon_final": 0,
                "epsilon_states": 50
            },
            'target_network_update_rate': 1.0,
            'use_target_network': True,
            "alpha": 0.0005,
            "gamma": 0.99,
            "tau": 1.0
        }

        tf.reset_default_graph()
        tf.set_random_seed(10)

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 16
            }, {
                'type': 'linear',
                'num_outputs': 2
            }])
        agent = DQNAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in xrange(10000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = False
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward

            if sum(rewards) == 100.0:
                return

        assert (sum(rewards) == 100.0)
Example no. 2
def main():
    env = OpenAIGym("P3DX-v0")
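    # DQN agent with a small two-layer conv net, epsilon-decay exploration and a
    # replay memory; a saved model is restored if available and then run greedily.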

    agent = DQNAgent(states=dict(type='float', shape=(80, 80, 4)),
                     actions=dict(type='int', num_actions=7),
                     network=[
                         dict(type="conv2d",
                              size=16,
                              window=[8, 8],
                              stride=4,
                              activation="relu"),
                         dict(type="conv2d",
                              size=32,
                              window=[4, 4],
                              stride=2,
                              activation="relu"),
                         dict(type="flatten"),
                         dict(type="dense", size=256)
                     ],
                     actions_exploration=dict(type="epsilon_decay",
                                              initial_epsilon=1.0,
                                              final_epsilon=0.1,
                                              timesteps=1000),
                     memory=dict(type="replay",
                                 capacity=1000,
                                 include_next_states=True),
                     update_mode=dict(unit="timesteps",
                                      batch_size=16,
                                      frequency=4),
                     discount=0.99,
                     entropy_regularization=None,
                     double_q_model=True,
                     optimizer=dict(type="adam", learning_rate=1e-4))

    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    try:
        while True:
            # Get action - no exploration and no observing
            action = agent.act(state, deterministic=True, independent=True)
            print(action)

            # Execute action in the environment
            state, terminal_state, reward = env.execute(action)

            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()
Example no. 3
    def test_lstm(self):
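        # Five independent runs of a DQN with an LSTM layer on the minimal test
        # environment; a run passes if it solves the task before the 1000-episode cap.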
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32),
                                       dict(type='lstm')
                                   ]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN agent (LSTM): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN agent (LSTM) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 4
    def test_multi(self):
        passed = 0

        def network_builder(inputs, **kwargs):
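            # Two dense towers, one per named state input, merged by element-wise product.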
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[False, (False, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 15 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-15:], r.episode_lengths[-15:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 5
    def test_replay(self):
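        # Plain replay memory configured explicitly, with random sampling enabled.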
        environment = MinimalTest(definition=[(False, (1, 2))])
        config = Configuration(batch_size=8,
                               learning_rate=0.001,
                               memory_capacity=50,
                               memory=dict(type='replay',
                                           random_sampling=True),
                               first_update=20,
                               target_update_frequency=10,
                               states=environment.states,
                               actions=environment.actions,
                               network=layered_network_builder([
                                   dict(type='dense', size=32),
                                   dict(type='dense', size=32)
                               ]))
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Replay memory DQN: ' + str(runner.episode))
Example no. 6
    def test_discrete(self):
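        # Discrete-control variant of the minimal test; a run passes when the last
        # 100 episode rewards all reach 1.0 before the 5000-episode cap.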
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   repeat_update=4,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=5000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 5000:
                passed += 1
                print('passed')
            else:
                print('failed')

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 7
    def initialize(self, env):
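        # Builds a DQN agent from a Gym environment; Tuple action spaces are mapped
        # to a dict of named discrete actions.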
        from gym import spaces
        from tensorforce.agents import DQNAgent

        if self.algorithm == "dqn":
            if type(env.action_space) == spaces.Tuple:
                actions = {
                    str(num): {
                        'type': int,
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            return DQNAgent(states=dict(type='float',
                                        shape=env.observation_space.shape),
                            actions=actions,
                            network=[
                                dict(type='dense', size=128),
                                dict(type='dense', size=128)
                            ],
                            batching_capacity=100)
        return None
Example no. 8
def main():
    gym_id = 'CartPole-v0'
    max_episodes = 10000
    max_timesteps = 1000

    env = OpenAIGym(gym_id)
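    # Small two-layer tanh network for CartPole; the episode_finished callback
    # logs progress every 10 episodes.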
    network_spec = [
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = DQNAgent(
        states_spec=env.states,
        actions_spec=env.actions,
        network_spec=network_spec,
        batch_size=64
    )

    runner = Runner(agent, env)
    
    report_episodes = 10

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logging.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logging.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logging.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))

    runner.run(max_episodes, max_timesteps, episode_finished=episode_finished)
    
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
Example no. 9
    def test_introduction_dqnagent(self):
        from tensorforce import Configuration
        from tensorforce.agents import DQNAgent
        from tensorforce.core.networks import layered_network_builder

        # Define a network builder from an ordered list of layers
        layers = [dict(type='dense', size=32), dict(type='dense', size=32)]
        network = layered_network_builder(layers_config=layers)

        # Define a state
        states = dict(shape=(10, ), type='float')

        # Define an action (models internally assert whether
        # they support continuous and/or discrete control)
        actions = dict(continuous=False, num_actions=5)

        # The agent is configured with a single configuration object
        agent_config = Configuration(batch_size=8,
                                     learning_rate=0.001,
                                     memory_capacity=800,
                                     first_update=80,
                                     repeat_update=4,
                                     target_update_frequency=20,
                                     states=states,
                                     actions=actions,
                                     network=network)
        agent = DQNAgent(config=agent_config)
Example no. 10
def load_agent(config, env, network_spec):
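    # Reload a previously saved agent if its JSON spec exists on disk; otherwise
    # construct a fresh DQNAgent from the environment's state/action specs.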
    if isfile(join(config.agent_dir, config.agent_name + ".json")):
        return TensorforceAgent.load(config.agent_dir, config.agent_name,
                                     "checkpoint", env)
    return DQNAgent(states=env.states(),
                    actions=env.actions(),
                    network=network_spec,
                    **config.agent_specs)
Example no. 11
def get_dqn_agent():
    return DQNAgent(
        states=dict(type='float', shape=(5, )),
        actions=dict(type='int', num_actions=2),
        network=[dict(type='dense', size=20),
                 dict(type='dense', size=20)],
        batched_observe=False,
    )
Example no. 12
def get_agent(agentType):
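    # All three agent types share the same state/action specification; only the
    # algorithm (DQN, VPG or TRPO) differs.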

    if agentType == "dqn":
        agent = DQNAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            memory=1000,
            network="auto",
        )
    elif agentType == "vpg":
        agent = VPGAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )
    elif agentType == "trpo":
        agent = TRPOAgent(
            states={
                "type": 'float',
                "shape": (
                    int(args.population),
                    1,
                    int(args.resources),
                )
            },
            actions={
                "type": 'int',
                "shape": (int(args.resources), ),
                "num_values": 3
            },
            network="auto",
            memory=1000,
        )

    return agent
Example no. 13
def overwrite_agent(env, network_spec, config):
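    # Remove any existing checkpoint files for this agent name, then return a
    # freshly constructed DQNAgent.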
    onlyfiles_agent = [
        f for f in listdir(config.agent_dir) if
        isfile(join(config.agent_dir, f)) and f.startswith(config.agent_name)
    ]
    for f in onlyfiles_agent:
        remove(join(config.agent_dir, f))
    return DQNAgent(states=env.states(),
                    actions=env.actions(),
                    network=network_spec,
                    **config.agent_specs)
Example no. 14
    def __init__(self, state_size, env=None, is_eval=False):
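        # Wrapper that initialises the underlying DQNAgent from environment-provided
        # settings (network, discount, exploration) and then loads a saved model.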

        self.state_size = state_size
        self.action_size = 3
        self._memory_size = 1000
        self._memory = deque(maxlen=1000)
        self.is_eval = is_eval
        self.env = env


        DQNAgent.__init__(self,
                           states = dict(type='float', shape=self.state_size.shape),
                           actions = dict(type='int', num_actions=self.action_size),
                           network = env.get_network(),
                           discount = env.hyperparameters['gamma'],
                           batching_capacity = 10000,
                           double_q_model = True,
                           actions_exploration = env.exploration)

        self._load_model()
Example no. 15
    def test_trpo_agent(self):
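        # Despite its name, this test instantiates a DQNAgent on the same
        # two-state toy task as above.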
        config = {
            'batch_size': 8,
            'max_episode_length': 4,
            'continuous': False,
            'state_shape': (2, ),
            'actions': 2,
            'action_shape': ()
        }
        tf.reset_default_graph()

        config = create_config(config)
        network_builder = NeuralNetwork.layered_network(
            layers=[{
                'type': 'dense',
                'num_outputs': 32
            }, {
                'type': 'linear',
                'num_outputs': 2
            }])
        agent = DQNAgent(config=config, network_builder=network_builder)

        state = (1, 0)
        rewards = [0.0] * 100
        for n in range(1000):
            action = agent.get_action(state=state)
            if action == 0:
                state = (1, 0)
                reward = 0.0
                terminal = False
            else:
                state = (0, 1)
                reward = 1.0
                terminal = True
            agent.add_observation(state=state,
                                  action=action,
                                  reward=reward,
                                  terminal=terminal)
            rewards[n % 100] = reward
            if sum(rewards) == 100.0:
                return
Example no. 16
    def agent(self):
        nb_actions = self.env.action_space.n
        obs_dim = len(self.env.observation_space.spaces)
        obs_dim_2 = self.env.observation_space.spaces[0].shape
        return DQNAgent(states=dict(type='float32',
                                    shape=(obs_dim, obs_dim_2)),
                        actions=dict(type='int', num_actions=nb_actions),
                        network=[
                            dict(type=Permute, dims=(0, 2, 3, 1)),
                            dict(type='conv2d',
                                 size=5,
                                 window=(obs_dim, 1),
                                 stride=(obs_dim, 1),
                                 padding='SAME',
                                 l2_regularization=1e-4),
                            dict(type=BatchNormalization),
                            dict(type=Permute, dims=(0, 3, 2, 1)),
                            dict(type='conv2d',
                                 size=5,
                                 window=(5, 1),
                                 stride=(5, 1),
                                 padding='SAME',
                                 l2_regularization=1e-4),
                            dict(type=BatchNormalization),
                            dict(type=Permute, dims=(0, 3, 2, 1)),
                            dict(type='conv2d',
                                 size=5,
                                 window=(5, 1),
                                 stride=(5, 1),
                                 padding='SAME',
                                 l2_regularization=1e-4),
                            dict(type=BatchNormalization),
                            dict(type=Permute, dims=(0, 3, 2, 1)),
                            dict(type='conv2d',
                                 size=5,
                                 window=(5, 1),
                                 stride=(5, 1),
                                 padding='SAME',
                                 l2_regularization=1e-4),
                            dict(type=BatchNormalization),
                            dict(type='flatten'),
                            dict(type='dense',
                                 size=nb_actions,
                                 activation='softmax')
                        ],
                        states_preprocessing=[dict(type=Conv2DPreprocessor)])
Example no. 17
    def test_multi(self):
        """
        This is relatively unstable and highly depends on initialisation - either passes quickly
        or fails no matter what.

        """
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32, scope='state0-1'), size=32, scope='state0-2')
            state1 = layer(x=layer(x=inputs['state1'], size=32, scope='state1-1'), size=32, scope='state1-2')
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[False, (False, 2)])
            config = Configuration(
                batch_size=8,
                learning_rate=0.0001,
                memory_capacity=800,
                first_update=80,
                target_update_frequency=20,
                repeat_update=4,
                memory=dict(
                    type='prioritized_replay',
                ),
                states=environment.states,
                actions=environment.actions,
                network=network_builder
            )
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 15 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-15:],
                                                                                           r.episode_lengths[-15:]))

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('DQN agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('DQN agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Example no. 18
def create_agent(memory, double_model, environment):
    # create the agent
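    # Frames are resized to 128x128, converted to grayscale and scaled to [0, 1]
    # before reaching the network; epsilon decays from 1.0 to 0.05 over 250k steps.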
    agent = DQNAgent(states=environment.states, actions=environment.actions, network=network_spec, double_q_model=double_model, memory=memory,
        update_mode=None,
        optimizer=dict(
            type='adam',
            learning_rate=1e-3
        ),
        states_preprocessing=[dict(type='image_resize', width=128, height=128),
            dict(type='grayscale'),
            dict(type='divide',scale=255)],
        target_sync_frequency=1000,
        # Comment in to test exploration types
        actions_exploration=dict(
            type="epsilon_decay",
            initial_epsilon=1.0,
            final_epsilon=0.05,
            timesteps=250000
        )
    )
    return agent
Example no. 19
def create_agent(memory, double_model, environment):
    # create the agent
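    # Variant that standardises observations online and stacks them into a
    # sequence instead of applying image preprocessing.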
    agent = DQNAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec,
        double_q_model=double_model,
        memory=memory,
        update_mode=None,
        optimizer=dict(type='adam', learning_rate=1e-3),
        states_preprocessing=[
            dict(type='running_standardize'),
            dict(type='sequence')
        ],
        target_sync_frequency=1000,
        # Comment in to test exploration types
        actions_exploration=dict(type="epsilon_decay",
                                 initial_epsilon=1.0,
                                 final_epsilon=0.1,
                                 timesteps=3500000))
    return agent
Example no. 20
    def test_prioritized_replay(self):
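        # Same setup as the plain replay test, but backed by a prioritized replay memory.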
        environment = MinimalTest(definition=[(False, (1, 2))])
        config = Configuration(batch_size=8,
                               learning_rate=0.001,
                               memory_capacity=50,
                               memory='prioritized_replay',
                               first_update=20,
                               target_update_frequency=10,
                               states=environment.states,
                               actions=environment.actions,
                               network=layered_network_builder([
                                   dict(type='dense', size=32),
                                   dict(type='dense', size=32)
                               ]))
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Prioritized replay memory DQN: ' + str(runner.episode))
Example no. 21
    def __init__(self, states, actions):
        self.q_agent = DQNAgent(
            states,
            actions,
            network=[
                dict(type='conv2d', size=3),
                dict(type='flatten'),
                dict(type='dense', size=64),
                dict(type='dense', size=64)
            ],
            update_mode=dict(unit='timesteps', batch_size=8, frequency=8),
            memory=dict(type='replay', include_next_states=True, capacity=200),
            optimizer=dict(type='adam', learning_rate=1e-2),
            states_preprocessing=[
                dict(type='normalize'),
                #dict(type='running_standardize'),
                #dict(type='sequence')
            ],
            target_sync_frequency=10,
            actions_exploration=dict(type="epsilon_decay",
                                     initial_epsilon=1.0,
                                     final_epsilon=0.1,
                                     timesteps=10))
Example no. 22
#---------------------------------------------------------------------

from tensorforce.agents import DQNAgent

# Network is an ordered list of layers.
network_spec = [
    dict(type='dense', size=32, activation='tanh'),
    dict(type='dense', size=32, activation='tanh')
]

# Define a state.
states = dict(shape=(10, ), type='float')
#states = dict(
#	image=dict(shape=(64, 64, 3), type='float'),
#	caption=dict(shape=(20,), type='int')
#)

# Define an action.
actions = dict(type='int', num_actions=5)

# The agent is configured with a single configuration object.
config = dict(memory=dict(type='replay', capacity=1000),
              batch_size=8,
              first_update=100,
              target_sync_frequency=10)

agent = DQNAgent(states_spec=states,
                 actions_spec=actions,
                 network_spec=network_spec,
                 **config)
Example no. 23
        baseline=dict(
            type="cnn",
            conv_sizes=[32, 32],
            dense_sizes=[32]
        ),
        baseline_optimizer=dict(
            type="multi_step",
            optimizer=dict(
                type="adam",
                learning_rate=1e-3
            ),
            num_steps=5
        )
    )

    agent = DQNAgent(**dqn)
    #agent = PPOAgent(**ppo)
    statistics = {}
    actions = [0 for x in range(16)]
    s = time.time()
    skip_steps = 8
    g.flip_player()
    for i in range(100000):
        state = g.reset()

        while not g.is_terminal():
            state = cv2.resize(state, (80, 80))
            # Perform Action
            action = agent.act(state)
            actions[action] += 1
            _, r, t, _ = g.step(action)
Example no. 24
    def test_blogpost_introduction_runner(self):
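        # First the Runner convenience loop, then the same interaction written out
        # manually with agent.act() and agent.observe().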
        from tensorforce.environments.minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(specification=[('int', ())])

        network_spec = [
            dict(type='dense', size=32)
        ]

        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

        ### Code block: next
        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(states=state)
                state, terminal, reward = environment.execute(actions=action)
                agent.observe(terminal=terminal, reward=reward)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes:
                break

        agent.close()
        environment.close()
Example no. 25
    def test_blogpost_introduction(self):
        """
        Test of introduction blog post examples.
        """
        import tensorflow as tf
        import numpy as np

        ### DQN agent example
        from tensorforce.agents import DQNAgent

        # Network is an ordered list of layers
        network_spec = [dict(type='dense', size=32), dict(type='dense', size=32)]

        # Define a state
        states = dict(shape=(10,), type='float')

        # Define an action
        actions = dict(type='int', num_actions=5)

        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=10
        )

        agent.close()

        ### Code block: multiple states
        states = dict(
            image=dict(shape=(64, 64, 3), type='float'),
            caption=dict(shape=(20,), type='int')
        )

        # DQN does not support multiple states. Omit test for now.
        # agent = DQNAgent(config=config)

        ### Code block: DQN observer function

        def observe(self, reward, terminal):
            super(DQNAgent, self).observe(reward, terminal)
            if self.timestep >= self.first_update \
                    and self.timestep % self.target_update_frequency == 0:
                self.model.update_target()

        ### Code block: Network config JSON

        network_json = """
        [
            {
                "type": "conv2d",
                "size": 32,
                "window": 8,
                "stride": 4
            },
            {
                "type": "conv2d",
                "size": 64,
                "window": 4,
                "stride": 2
            },
            {
                "type": "flatten"
            },
            {
                "type": "dense",
                "size": 512
            }
        ]
        """

        ### Test json

        import json
        network_spec = json.loads(network_json)

        ### Code block: Modified dense layer

        modified_dense = """
        [
            {
                "type": "dense",
                "size": 64,
                "bias": false,
                "activation": "selu",
                "l2_regularization": 0.001
            }
        ]
        """

        ### Test json
        network_spec = json.loads(modified_dense)

        ### Code block: Own layer type
        from tensorforce.core.networks import Layer

        class BatchNormalization(Layer):

            def __init__(self, variance_epsilon=1e-6, scope='batchnorm', summary_labels=None):
                super(BatchNormalization, self).__init__(scope=scope, summary_labels=summary_labels)
                self.variance_epsilon = variance_epsilon

            def tf_apply(self, x, update):
                mean, variance = tf.nn.moments(x, axes=tuple(range(x.shape.ndims - 1)))
                return tf.nn.batch_normalization(
                    x=x,
                    mean=mean,
                    variance=variance,
                    offset=None,
                    scale=None,
                    variance_epsilon=self.variance_epsilon
                )

        ### Test own layer

        states = dict(shape=(10,), type='float')
        network_spec = [
            {'type': 'dense', 'size': 32},
            {'type': BatchNormalization, 'variance_epsilon': 1e-9}
        ]

        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8
        )

        agent.close()

        ### Code block: Own network builder
        from tensorforce.core.networks import Network

        class CustomNetwork(Network):

            def tf_apply(self, x, internals, update, return_internals=False):
                image = x['image']  # 64x64x3-dim, float
                caption = x['caption']  # 20-dim, int
                initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01, dtype=tf.float32)

                # CNN
                weights = tf.get_variable(name='W1', shape=(3, 3, 3, 16), initializer=initializer)
                image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME')
                image = tf.nn.relu(image)
                image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME')

                weights = tf.get_variable(name='W2', shape=(3, 3, 16, 32), initializer=initializer)
                image = tf.nn.conv2d(image, filter=weights, strides=(1, 1, 1, 1), padding='SAME')
                image = tf.nn.relu(image)
                image = tf.nn.max_pool(image, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1), padding='SAME')

                image = tf.reshape(image, shape=(-1, 16 * 16, 32))
                image = tf.reduce_mean(image, axis=1)

                # LSTM
                weights = tf.get_variable(name='W3', shape=(30, 32), initializer=initializer)
                caption = tf.nn.embedding_lookup(params=weights, ids=caption)
                lstm = tf.contrib.rnn.LSTMCell(num_units=32)
                caption, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=caption, dtype=tf.float32)
                caption = tf.reduce_mean(caption, axis=1)

                # Combination
                if return_internals:
                    return tf.multiply(image, caption), list()
                else:
                    return tf.multiply(image, caption)

        ### Test own network builder

        states = dict(
            image=dict(shape=(64, 64, 3), type='float'),
            caption=dict(shape=(20,), type='int')
        )

        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=CustomNetwork,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8
        )

        agent.close()

        ### Code block: LSTM function
        from tensorforce.core.networks import Layer

        class Lstm(Layer):

            def __init__(self, size, scope='lstm', summary_labels=()):
                self.size = size
                super(Lstm, self).__init__(num_internals=1, scope=scope, summary_labels=summary_labels)

            def tf_apply(self, x, update, state):
                state = tf.contrib.rnn.LSTMStateTuple(c=state[:, 0, :], h=state[:, 1, :])
                self.lstm_cell = tf.contrib.rnn.LSTMCell(num_units=self.size)

                x, state = self.lstm_cell(inputs=x, state=state)

                internal_output = tf.stack(values=(state.c, state.h), axis=1)
                return x, (internal_output,)

            def internal_inputs(self):
                return super(Lstm, self).internal_inputs() + [tf.placeholder(dtype=tf.float32, shape=(None, 2, self.size))]

            def internal_inits(self):
                return super(Lstm, self).internal_inits() + [np.zeros(shape=(2, self.size))]

        ### Test LSTM
        states = dict(shape=(10,), type='float')
        network_spec = [
            {'type': 'flatten'},
            {'type': Lstm, 'size': 10}
        ]

        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8
        )

        agent.close()

        ### Preprocessing configuration
        states = dict(shape=(84, 84, 3), type='float')
        preprocessing = [
            dict(
                type='image_resize',
                width=84,
                height=84
            ),
            dict(
                type='grayscale'
            ),
            dict(
                type='normalize'
            ),
            dict(
                type='sequence',
                length=4
            )
        ]

        ### Test preprocessing configuration

        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50,
            preprocessing=preprocessing
        )

        agent.close()

        ### Code block: Continuous action exploration

        exploration = dict(
            type='ornstein_uhlenbeck',
            sigma=0.1,
            mu=0,
            theta=0.1
        )

        ### Test continuous action exploration
        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            exploration=exploration
        )

        agent.close()

        ### Code block: Discrete action exploration

        exploration = dict(
            type='epsilon_decay',
            initial_epsilon=1.0,
            final_epsilon=0.01,
            timesteps=1e6
        )

        ### Test discrete action exploration
        agent = DQNAgent(
            states_spec=states,
            actions_spec=actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            exploration=exploration
        )

        agent.close()
Example no. 26
def get_agent(game, agentType):
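    # Restore a saved agent for the given game if a checkpoint exists; otherwise
    # initialise it, run self-play on the iterated game, and save a checkpoint.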
    count = 1

    base_path = '.'
    checkpointPath = base_path + "/games/agents/" + game + "/" + agentType + "/"

    if agentType == "vpg":
        agent = VPGAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "ppo":
        agent = PPOAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )
    elif agentType == "dqn":
        agent = DQNAgent(
            states=config[game]["states"],
            actions=config[game]["actions"],
            memory=1000,
            network="auto",
        )

    if game == "3pd":
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            agent.initialize()
            for x in tqdm(range(1000001)):
                testState = np.full(config[game]["states"]["shape"], None)

                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    moveC = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB, moveC])
                    if i < 9:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                        agent.observe(reward=rewards[2], terminal=True)
                    testState[i] = [[moveA], [moveB], [moveC]]
                if x % 1000 == 0:
                    # checkpointPath = "../games/agents/" + game + "/" + agentType + "/"
                    agent.save(directory=checkpointPath, filename=None)
                    # print("saving successful")
    else:
        try:
            agent.restore(directory=checkpointPath, filename=None)
            print("restoration successful")
        except Exception as e:
            # try:
            # 	checkpointPath = base_path + "/agents/" + game + "/" + agentType + "/"
            # 	agent.restore(directory=checkpointPath, filename=None)
            # 	print("restoration successful after second attempt")
            # except Exception as e:
            # 	a = subprocess.check_output("ls games/", shell=True)
            # 	print(a)
            # 	print(os.getcwd(), "vs", subprocess.check_output("pwd", shell=True))
            # 	checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            # 	print(checkpointPath)
            # 	agent.restore(directory=checkpointPath, filename=None)
            # 	print("restoration successful after third attempt")
            agent.initialize()

            for x in tqdm(range(count)):

                testState = np.full(config[game]["states"]["shape"], 0)

                for i in range(10):
                    moveA = agent.act(testState)
                    moveB = agent.act(testState)
                    rewards = payoffs(game, [moveA, moveB])
                    if i < 9:  # the final round of the episode is observed as terminal below
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=False)
                    else:
                        agent.observe(reward=rewards[0], terminal=False)
                        agent.observe(reward=rewards[1], terminal=True)

                    testState[i] = [[moveA], [moveB]]
            checkpointPath = "./games/agents/" + game + "/" + agentType + "/"
            agent.save(directory=checkpointPath, filename=None)
            print("saving successful")

    return agent
Example no. 27
    def initialize(self, env):
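        # Chooses between a PPO and a DQN agent; both use a four-layer conv stack
        # over an 11x11x18 observation.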
        from gym import spaces
        from tensorforce.agents import PPOAgent
        from tensorforce.agents import DQNAgent

        if self.algorithm == "ppo":
            if type(env.action_space) == spaces.Tuple:
                actions = {
                    str(num): {
                        'type': int,
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            return PPOAgent(
                #states=dict(type='float', shape=env.observation_space.shape),
                states=dict(type='float', shape=[11, 11, 18]),
                actions=actions,
                #network=[
                #    dict(type='dense', size=64),
                #    dict(type='dense', size=64)
                #],
                network=[
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='flatten')
                ],
                batching_capacity=1000,
                step_optimizer=dict(type='adam', learning_rate=1e-4))
            #return PPOAgent(
            #    states=dict(type='float', shape=env.observation_space.shape),
            #    actions=actions,
            #    network=[
            #        dict(type='dense', size=64),
            #        dict(type='dense', size=64)
            #    ],
            #    batching_capacity=1000,
            #    step_optimizer=dict(type='adam', learning_rate=1e-4))
        elif self.algorithm == "dqn":
            if type(env.action_space) == spaces.Tuple:
                actions = {
                    str(num): {
                        'type': int,
                        'num_actions': space.n
                    }
                    for num, space in enumerate(env.action_space.spaces)
                }
            else:
                actions = dict(type='int', num_actions=env.action_space.n)

            return DQNAgent(
                states=dict(type='float', shape=[11, 11, 18]),
                actions=actions,
                discount=0.9,
                double_q_model=False,
                network=[
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='conv2d', size=32),
                    dict(type='flatten')
                ],
                batching_capacity=1000,
                optimizer=dict(type='adam', learning_rate=1e-4))
        return None
Example no. 28
def main(
        mode,  # 'train'  or 'test'
        episode=2000,
        window_size=30,  # number of past timesteps the agent's brain refers to
        init_invest=20000,
        model_path=None,
        addition_train=False,
        selected_learn='dqn',  # 'dqn' or 'ppo'
        selected_trading=[],
        selected_subject=[],
        ui_windows=None,  # the currently open Ui object
):
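    # Trading example: build either a PPO or a DQN agent for a gold-trading
    # environment and drive it with the Tensorforce Runner in train or test mode.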
    global gl_ui_window
    gl_ui_window = ui_windows

    set_model_path(model_path if model_path is not None
                   else os.path.join(os.getcwd(), 'model'))
    if 'model' not in os.listdir(os.getcwd()):
        os.makedirs('model')

    # create environment for train and test
    DATA_PATH = '../daily_data'
    environment = create_gold_env(window_size=window_size,
                                  path=DATA_PATH,
                                  train=True if mode == 'train' else False,
                                  selected_trading=selected_trading,
                                  selected_subject=selected_subject,
                                  init_invest=init_invest)

    network_spec = create_network_spec()
    baseline_spec = create_baseline_spec()

    if selected_learn == 'ppo':
        agent = PPOAgent(
            discount=0.9999,
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
            # Agent
            states_preprocessing=None,
            actions_exploration=None,
            reward_preprocessing=None,
            # MemoryModel
            update_mode=dict(
                unit='timesteps',  #'episodes',
                # 10 episodes per update
                batch_size=32,
                # # Every 10 episodes
                frequency=10),
            memory=dict(type='latest',
                        include_next_states=False,
                        capacity=50000),
            # DistributionModel
            distributions=None,
            entropy_regularization=0.0,  # None
            # PGModel
            baseline_mode='states',
            baseline=dict(type='custom', network=baseline_spec),
            baseline_optimizer=dict(
                type='multi_step',
                optimizer=dict(
                    type='adam',
                    learning_rate=(1e-4)  # 3e-4
                ),
                num_steps=5),
            gae_lambda=0,  # 0
            # PGLRModel
            likelihood_ratio_clipping=0.2,
            # PPOAgent
            step_optimizer=dict(
                type='adam',
                learning_rate=(1e-4)  # 1e-4
            ),
            subsampling_fraction=0.2,  # 0.1
            optimization_steps=10,
            execution=dict(type='single',
                           session_config=None,
                           distributed_spec=None))
    else:  # learn_model=='dqn' or etc.
        agent = DQNAgent(
            states=environment.states,
            actions=environment.actions,
            network=[
                dict(type='flatten'),
                dict(type='dense', size=32, activation='relu'),
                dict(type='dense', size=32, activation='relu'),
            ],
        )

    if mode == 'test' or addition_train == True:
        if len(
            [elem for elem in os.listdir(LOAD_DIR) if 'trading_model' in elem
             ]) >= 3:
            agent.restore_model(LOAD_DIR)
            print('loaded')
        elif mode == 'test':
            ui_windows.setInfo(msg="No trading model to load appears to exist.")
            return

    runner = Runner(agent=agent, environment=environment)
    if mode == 'train':
        kwargs = dict(episodes=episode,
                      max_episode_timesteps=16000,
                      episode_finished=episode_finished)
    else:  # mode=='test'
        kwargs = dict(num_episodes=episode,
                      deterministic=True,
                      testing=True,
                      episode_finished=print_simple_log)
    runner.run(**kwargs)

    # TODO: store per-episode portfolio results in TFTraderEnv and push the data to the UI at every step.
    # setResult(????)
    msg = "{mode} finished. Total episodes: {ep}. \nAverage reward of last 100 episodes: {ar}.".format(
        mode="Training" if mode == 'train' else "Testing",
        ep=runner.episode,
        ar=np.mean(runner.episode_rewards[-100:]))
    print(msg)
    ui_windows.setInfo(msg=msg)
Example no. 29
def main(max_timesteps, learning_rate):
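    # Single hidden tanh layer with epsilon-decay exploration over the full run;
    # the TensorBoard summarizer is defined but commented out in the agent call below.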
    max_episodes = None
    #max_timesteps = 86400000000*days

    network_spec = [
        #dict(type='flatten'),
        dict(type='dense', size=11, activation='tanh'),
        #dict(type='dense', size=20, activation='tanh'),
        #dict(type='dense', size=32, activation='tanh'),
    ]

    exploration = dict(type='epsilon_decay', timesteps=max_timesteps)

    summarizer = dict(
        directory="./models/" + str(datetime.now()).replace(' ', ''),
        steps=10000,
        seconds=None,
        labels=[
            #'rewards',
            #'actions',
            'inputs',
            'gradients',
            'configuration',
        ],
        meta_dict=dict(
            description='July 2: Trying 11 node hidden layer.',
            layers=str(network_spec),
            timesteps=max_timesteps,
            exploration=exploration,
        ),
    )

    agent = DQNAgent(states=env.states,
                     actions=env.actions,
                     network=network_spec,
                     actions_exploration=exploration,
                     optimizer=dict(type='adam', learning_rate=learning_rate)
                     #summarizer=summarizer,
                     #batch_size=64
                     )

    runner = Runner(agent, env)

    report_episodes = 1

    #global prev
    global prev
    prev = 0

    def episode_finished(r):
        global prev
        if r.episode % report_episodes == 0:
            #print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep-prev))
            #print("Episode reward: {}".format(r.episode_rewards[-1]))
            print(r.episode_rewards[-1])
        prev = r.timestep
        #print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent,
                                                            env=env))

    runner.run(num_episodes=max_episodes,
               num_timesteps=max_timesteps,
               max_episode_timesteps=None,
               episode_finished=episode_finished)

    agent.save_model(directory='./results/DeepQ/' +
                     str(datetime.now()).replace(' ', '') + '/model')

    runner.close()

    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
            "up": dict(type="float", min_value=0.0, max_value=1.0),
            "down": dict(type="float", min_value=0.0, max_value=1.0),
            "left": dict(type="float", min_value=0.0, max_value=1.0),
            "right": dict(type="float", min_value=0.0, max_value=1.0),
        },
        network='auto',
        memory=25000,
    )
elif args.agent == "dqn":  # Deep Q-Learning
    agent = DQNAgent(
        states={
            "type": 'float',
            "shape": (1, 613)
        },
        actions={
            "up": dict(type="float", min_value=0.0, max_value=1.0),
            "down": dict(type="float", min_value=0.0, max_value=1.0),
            "left": dict(type="float", min_value=0.0, max_value=1.0),
            "right": dict(type="float", min_value=0.0, max_value=1.0),
        },
        network='auto',
        memory=10000,
    )

elif args.agent == "vpg":  # Vanilla Policy Gradient
    agent = VPGAgent(
        states={
            "type": 'float',
            "shape": (1, 610)
        },
        actions={
            "up": dict(type="float", min_value=0.0, max_value=1.0),