Example no. 1
    def test_introduction_dqnagent(self):
        from tensorforce import Configuration
        from tensorforce.agents import DQNAgent
        from tensorforce.core.networks import layered_network_builder

        # Define a network builder from an ordered list of layers
        layers = [dict(type='dense', size=32), dict(type='dense', size=32)]
        network = layered_network_builder(layers_config=layers)

        # Define a state
        states = dict(shape=(10, ), type='float')

        # Define an action (models internally assert whether
        # they support continuous and/or discrete control)
        actions = dict(continuous=False, num_actions=5)

        # The agent is configured with a single configuration object
        agent_config = Configuration(batch_size=8,
                                     learning_rate=0.001,
                                     memory_capacity=800,
                                     first_update=80,
                                     repeat_update=4,
                                     target_update_frequency=20,
                                     states=states,
                                     actions=actions,
                                     network=network)
        agent = DQNAgent(config=agent_config)
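For context, a configured agent like the one above is normally driven by an act/observe loop. The sketch below is illustrative only: the `environment` object with `reset()` and `execute(action)` is an assumption borrowed from the MinimalTest-based examples further down, not part of the snippet above.

# Interaction loop (sketch); environment.reset()/execute() are assumed helpers
state = environment.reset()
for _ in range(100):
    # Query the agent for an action given the current state
    action = agent.act(state=state)
    # Apply the action; MinimalTest-style environments return (state, reward, terminal)
    state, reward, terminal = environment.execute(action=action)
    # Report the outcome; the agent updates its model according to batch_size
    agent.observe(reward=reward, terminal=terminal)
    if terminal:
        state = environment.reset()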
Example no. 2
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   keep_last=True,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = DQNNstepAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN Nstep agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN Nstep agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 3
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(
                batch_size=8,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:],
                                                                                            r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('TRPO agent (continuous): ' + str(runner.episode))

            if runner.episode < 1000:
                passed += 1

        print('TRPO agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 4
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.0005,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = CategoricalDQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('Categorical DQN agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('Categorical DQN agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
    def test_beta(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            actions = environment.actions
            actions['min_value'] = -0.5
            actions['max_value'] = 1.5

            config = Configuration(batch_size=8,
                                   learning_rate=0.01,
                                   states=environment.states,
                                   actions=actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (beta): ' + str(runner.episode))
            if runner.episode < 1500:
                passed += 1

        print('VPG agent (beta) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 6
    def test_reinforceio_homepage(self):
        """
        Code example from the homepage and README.md.
        """

        from tensorforce import Configuration
        from tensorforce.agents import TRPOAgent
        from tensorforce.core.networks import layered_network_builder

        config = Configuration(
            batch_size=100,
            states=dict(shape=(10,), type='float'),
            actions=dict(continuous=False, num_actions=2),
            network=layered_network_builder([dict(type='dense', size=50), dict(type='dense', size=50)])
        )

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=config)

        # Get new data from somewhere, e.g. a client to a web app
        client = TestTutorialCode.MyClient('http://127.0.0.1', 8080)

        # Poll new state from client
        state = client.get_state()

        # Get prediction from agent, execute
        action = agent.act(state=state)
        reward = client.execute(action)

        # Add experience, agent automatically updates model according to batch size
        agent.observe(reward=reward, terminal=False)
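The homepage snippet assumes a `MyClient` helper exposing `get_state()` and `execute(action)`. A minimal stand-in consistent with how it is called might look like the following sketch; the zero-valued state and reward are placeholders, not the behaviour of a real web app.

import numpy as np

class MyClient(object):
    # Hypothetical stand-in for the web-app client used in the snippet above
    def __init__(self, host, port):
        self.host = host
        self.port = port

    def get_state(self):
        # Placeholder: return a state matching states=dict(shape=(10,), type='float')
        return np.zeros(shape=(10,))

    def execute(self, action):
        # Placeholder: apply the action in the web app and return a scalar reward
        return 0.0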
    def test_baseline(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        discounted_rewards = np.array([
            0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0, 1.0 + 0.75**2,
            0.75, 1.0, 0.0
        ])
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        expected = discounted_rewards - baseline
        print(result)
        print(expected)
        self.assertTrue((result == expected).all())
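The hard-coded `discounted_rewards` above can be reproduced with a few lines of NumPy; the helper below is a standalone sketch (not TensorForce code) that accumulates discounted returns backwards within each episode and checks them against the values used in the test.

import numpy as np

def discounted_returns(rewards, terminals, discount):
    # Walk backwards, resetting the running return at each episode boundary
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
terminals = [False, False, False, False, True, False, False, False, True]
expected = np.array([0.75 + 0.75 ** 4, 1.0 + 0.75 ** 3, 0.75 ** 2, 0.75, 1.0,
                     1.0 + 0.75 ** 2, 0.75, 1.0, 0.0])
assert np.allclose(discounted_returns(rewards, terminals, discount=0.75), expected)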
Example no. 8
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
            )
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 9
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(
                batch_size=20,
                entropy_penalty=0.01,
                loss_clipping=0.1,
                epochs=10,
                optimizer_batch_size=10,
                learning_rate=0.0005,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32),
                    dict(type='dense', size=32)
                ])
            )
            agent = PPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:],
                                                                                            r.episode_lengths[-100:]))

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('PPO agent (continuous): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('PPO agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 10
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (continuous): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO continuous agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 11
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(batch_size=8,
                                   cg_iterations=20,
                                   cg_damping=0.001,
                                   line_search_steps=20,
                                   max_kl_divergence=0.05,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=10000, episode_finished=episode_finished)
            print('TRPO Agent (continuous): ' + str(runner.episode))

            if runner.episode < 10000:
                passed += 1
                print('passed')
            else:
                print('failed')

        print('TRPO continuous agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 12
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   repeat_update=4,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=5000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 5000:
                passed += 1
                print('passed')
            else:
                print('failed')

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 13
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('VPG discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 14
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(
                dtype=tf.float32,
                shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))

            network_builder = layered_network_builder(({
                'type': 'dense',
                'size': self.size
            }, {
                'type': 'dense',
                'size': 1
            }))

            network = NeuralNetwork(network_builder=network_builder,
                                    inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=config.learning_rate)
            self.optimize = optimizer.minimize(loss)
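The class above only constructs the graph; the optimizer op still has to be run against batches of flattened states and observed returns. A plausible update step, assuming an existing `tf.Session` called `session` and a `baseline` instance of the class above, is sketched below (the `fit` helper is illustrative, not part of TensorForce).

def fit(session, baseline, states, returns, updates=10):
    # Repeatedly run one gradient step on the L2 loss between predictions and returns
    for _ in range(updates):
        session.run(baseline.optimize, feed_dict={
            baseline.state: states,    # shape (batch, prod(state.shape))
            baseline.returns: returns  # shape (batch,)
        })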
Example no. 15
    def create_tf_operations(self, state, batch_size, scope='cnn_baseline'):

        with tf.variable_scope(scope):
            self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
            self.updates = int(batch_size / self.update_batch_size) * self.epochs
            self.batch_size = batch_size

            layers = []
            for size in self.sizes:
                layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})

            # First layer has a larger window
            layers[0]['window'] = 5

            # TODO append maxpooling
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                    inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example no. 16
    def create_tf_operations(self, state, scope='cnn_baseline'):
        with tf.variable_scope(scope) as scope:
            self.state = tf.placeholder(dtype=tf.float32, shape=(None,) + tuple(state.shape))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

            layers = []
            for size in self.cnn_sizes:
                layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})

            # First layer has a larger window
            layers[0]['window'] = 5
            layers.append({'type': 'flatten'})
            for size in self.dense_sizes:
                layers.append({'type': 'dense', 'size': size})
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(network_builder=layered_network_builder(layers),
                                    inputs=dict(state=self.state))

            self.prediction = tf.squeeze(input=network.output, axis=1)
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

            variables = tf.contrib.framework.get_variables(scope=scope)
            self.optimize = optimizer.minimize(loss, var_list=variables)
Example no. 17
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(
                dtype=tf.float32,
                shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))
            self.updates = int(
                config.batch_size / self.update_batch_size) * self.epochs
            self.batch_size = config.batch_size

            layers = []
            for _ in xrange(self.hidden_layers):
                layers.append({'type': 'dense', 'size': self.size})
            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=config.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example no. 18
    def create_tf_operations(self, state, batch_size, scope='mlp_baseline'):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(dtype=tf.float32,
                                        shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))
            self.updates = int(
                batch_size / self.update_batch_size) * self.epochs
            self.batch_size = batch_size

            layers = []
            for size in self.sizes:
                layers.append({'type': 'dense', 'size': size})

            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)

            self.optimize = optimizer.minimize(loss)
Example no. 19
    def test_discrete(self):
        passed = 0

        # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.0001,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 20
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('NAF agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('NAF agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('VPG agent (continuous): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('VPG agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 22
    def test_discrete(self):
        passed = 0

        # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.0001,
                                   cg_iterations=20,
                                   cg_damping=0.001,
                                   line_search_steps=20,
                                   max_kl_divergence=0.05,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 23
    def test_replay(self):
        environment = MinimalTest(definition=[(False, (1, 2))])
        config = Configuration(batch_size=8,
                               learning_rate=0.001,
                               memory_capacity=50,
                               memory=dict(type='replay',
                                           random_sampling=True),
                               first_update=20,
                               target_update_frequency=10,
                               states=environment.states,
                               actions=environment.actions,
                               network=layered_network_builder([
                                   dict(type='dense', size=32),
                                   dict(type='dense', size=32)
                               ]))
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Replay memory DQN: ' + str(runner.episode))
Example no. 24
    def test_discrete_baseline(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (discrete): ' + str(runner.episode))

            if runner.episode < 1500:
                passed += 1

        print('VPG agent (discrete) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 25
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   demo_memory_capacity=100,
                                   demo_sampling_ratio=0.2,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 26
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[
                    dict(type='dense', size=32, l2_regularization=0.0001)
                ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 27
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=256,

                memory=dict(
                    type='prioritized_replay',
                ),
                update_frequency=256,
                first_update=512,

                learning_rate=0.0001,
                optimizer_batch_size=64,
                normalize_rewards=False,
                gae_rewards=False,
                baseline=dict(
                    type="mlp",
                    sizes=[32, 32],
                    epochs=1,
                    update_batch_size=64,
                    learning_rate=0.001
                ),
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Mean reward over the last 50 episodes should indicate that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example no. 28
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Example no. 29
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[dict(type='dense', size=32, l2_regularization=0.0001)])
            )
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example no. 30
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                log_level='info',
                batch_size=100,
                baseline=dict(
                    type='mlp',
                    size=32,
                    hidden_layers=1,
                    epochs=20,
                    update_batch_size=32
                ),
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                max_kl_divergence=0.005,
                cg_iterations=20,
                cg_damping=0.01,
                ls_max_backtracks=20,
                ls_override=False,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Mean reward over the last 50 episodes should indicate that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                loglevel='info',
                batch_size=100,
                baseline='mlp',
                baseline_args=None,
                baseline_kwargs=dict(
                    size=32,
                    repeat_update=100
                ),
                override_line_search=False,
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                cg_iterations=20,
                cg_damping=0.01,
                line_search_steps=20,
                max_kl_divergence=0.005,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Mean reward over the last 50 episodes should indicate that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example no. 33
    def create_tf_operations(self, config):
        if len(config.states) > 1:
            raise Exception()

        with tf.variable_scope('mlp_value_function'):
            self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(next(iter(config.states))[1].shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

            network_builder = layered_network_builder((
                {'type': 'dense', 'size': self.size},
                {'type': 'dense', 'size': 1})
            )

            network = NeuralNetwork(network_builder=network_builder, inputs=dict(state=self.state))

            self.prediction = network.output
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
            self.optimize = optimizer.minimize(loss)
    def test_gae(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               gae_rewards=True,
                               gae_lambda=0.5,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline
        td_residuals = np.array([
            0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
            1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
        ])

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        expected = np.array([
            np.sum(
                ((0.5 * 0.75)**np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[1:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[2:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[3:5]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[4:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[5:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[6:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[7:]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[8:])
        ])
        self.assertTrue((result == expected).all())
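The `expected` values above are simply (gamma * lambda)-discounted cumulative sums of the TD residuals within each episode. The standalone sketch below (plain NumPy, not TensorForce code) performs that accumulation with the same numbers; its output matches the `expected` array constructed in the test.

import numpy as np

def gae_from_residuals(td_residuals, terminals, gamma, gae_lambda):
    # Accumulate (gamma * gae_lambda)-discounted sums of TD residuals, per episode
    advantages = np.zeros(len(td_residuals))
    running = 0.0
    for t in reversed(range(len(td_residuals))):
        if terminals[t]:
            running = 0.0
        running = td_residuals[t] + gamma * gae_lambda * running
        advantages[t] = running
    return advantages

terminals = [False, False, False, False, True, False, False, False, True]
td_residuals = np.array([
    0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
    1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
])
advantages = gae_from_residuals(td_residuals, terminals, gamma=0.75, gae_lambda=0.5)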
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=4096,
                gae_lambda=0.97,
                learning_rate=0.001,
                entropy_penalty=0.01,
                epochs=5,
                optimizer_batch_size=512,
                loss_clipping=0.2,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Mean reward over the last 50 episodes should indicate that learning has taken off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000,
                       max_timesteps=200,
                       episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
    def test_prioritized_replay(self):
        environment = MinimalTest(definition=[(False, (1, 2))])
        config = Configuration(batch_size=8,
                               learning_rate=0.001,
                               memory_capacity=50,
                               memory='prioritized_replay',
                               first_update=20,
                               target_update_frequency=10,
                               states=environment.states,
                               actions=environment.actions,
                               network=layered_network_builder([
                                   dict(type='dense', size=32),
                                   dict(type='dense', size=32)
                               ]))
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Prioritized replay memory DQN: ' + str(runner.episode))
Example no. 37
    def create_tf_operations(self, state, scope='mlp_baseline'):
        with tf.variable_scope(scope) as scope:
            self.state = tf.placeholder(dtype=tf.float32,
                                        shape=(None, util.prod(state.shape)))
            self.returns = tf.placeholder(dtype=tf.float32, shape=(None, ))

            layers = []
            for size in self.sizes:
                layers.append({'type': 'dense', 'size': size})

            layers.append({'type': 'linear', 'size': 1})

            network = NeuralNetwork(
                network_builder=layered_network_builder(layers),
                inputs=dict(state=self.state))

            self.prediction = tf.squeeze(input=network.output, axis=1)
            loss = tf.nn.l2_loss(self.prediction - self.returns)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)

            variables = tf.contrib.framework.get_variables(scope=scope)
            self.optimize = optimizer.minimize(loss, var_list=variables)
Example no. 38
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from tensorforce.core.networks import layered_network_builder

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(
    loglevel='info',
    batch_size=100,
    baseline='mlp',
    baseline_args=None,
    baseline_kwargs=dict(
        size=32,
        repeat_update=100
    ),
    override_line_search=False,
    generalized_advantage_estimation=True,
    normalize_advantage=False,
    gae_lambda=0.97,
    cg_iterations=20,
    cg_damping=0.01,
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([dict(type='dense', size=32, activation='tanh'),
                                     dict(type='dense', size=32, activation='tanh')])
))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.timestep,
                                                                                 reward=r.episode_rewards[-1]))
    return True


# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)
Example no. 39
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from tensorforce.core.networks import layered_network_builder

import numpy as np

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=Configuration(batch_size=200,
                                       states=env.states,
                                       actions=env.actions,
                                       network=layered_network_builder(
                                           [dict(type='dense', size=10)])))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.timestep, reward=r.episode_rewards[-1]))
    return True


# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)
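The callback above always returns True, so the runner plays all 3000 episodes. The earlier test snippets instead return False to stop once recent episodes look solved; an optional variant in that style (the 50-episode window and 50.0 threshold are taken from the quick-start tests above, not tuned for CartPole) would be:

def episode_finished(r):
    # Keep training for at least 100 episodes, then stop once the mean
    # reward over the last 50 episodes reaches 50.0
    return r.episode < 100 or np.mean(r.episode_rewards[-50:]) < 50.0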