Code example #1
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(
                batch_size=20,
                entropy_penalty=0.01,
                loss_clipping=0.1,
                epochs=10,
                optimizer_batch_size=10,
                learning_rate=0.0005,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32),
                    dict(type='dense', size=32)
                ])
            )
            agent = PPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

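            # Returning False from episode_finished stops the run early: here, once the
            # last 100 episodes all clear the per-step reward threshold, training halts.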
            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:],
                                                                                            r.episode_lengths[-100:]))

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('PPO agent (continuous): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('PPO agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #2
    def test_multi(self):
        passed = 0

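        # Custom network builder for two state inputs: a two-layer dense tower
        # per state, combined by elementwise multiplication.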
        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[False, (False, 2)])
            config = Configuration(batch_size=8,
                                   keep_last=True,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = DQNNstepAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 15 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-15:], r.episode_lengths[-15:]))

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('DQN Nstep agent (multi-state/action): ' +
                  str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print(
            'DQN Nstep agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Code example #3
File: run.py Project: schlesingerphilipp/trader
def train(config, network_spec=None):
    data_provider = DataProvider(config.db)
    env = StockEnvironment(data_provider, config, 0)
    if config.overwrite_agent:
        agent = overwrite_agent(env, network_spec, config)
    else:
        agent = load_agent(config, env, network_spec)

    mlflow.log_param("agent", "tensorforce.agents.DQNAgent")
    for key in config.agent_specs:
        mlflow.log_param(key, config.agent_specs[key])

    runner = Runner(agent=agent, environment=env)
    offset = 20000
    num_episodes = 20
    step = 0
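    # Slide a training window through the data: run a batch of episodes,
    # advance the offset, checkpoint the agent, and evaluate every 10th window.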
    while data_provider.has_data_key(offset + config.max_step_per_episode):
        runner.run(num_episodes=num_episodes)
        offset = offset + config.max_step_per_episode
        env.offset = offset
        agent.save(config.agent_dir, config.agent_name)
        if step % 10 == 0:
            evaluate(config, data_provider,
                     offset - config.max_step_per_episode, agent)
        step += 1
    return agent, env
Code example #4
    def test_quickstart(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        # Create an OpenAI-Gym environment
        environment = OpenAIGym('CartPole-v1')

        # Create the agent
        agent = PPOAgent(
            states=environment.states(),
            actions=environment.actions(),
            # Automatically configured network
            network='auto',
            # Memory sampling most recent experiences, with a capacity of 6100 timesteps
            # (6100 > [30 batch episodes] * [200 max timesteps per episode])
            memory=6100,
            # Update every 10 episodes, with a batch of 30 episodes
            update_mode=dict(unit='episodes', batch_size=30, frequency=10),
            # PPO optimizer
            step_optimizer=dict(type='adam', learning_rate=1e-3),
            # PPO multi-step optimization: 10 updates, each based on a third of the batch
            subsampling_fraction=0.33,
            optimization_steps=10,
            # MLP baseline
            baseline_mode='states',
            baseline=dict(type='network', network='auto'),
            # Baseline optimizer
            baseline_optimizer=dict(type='multi_step',
                                    optimizer=dict(type='adam',
                                                   learning_rate=1e-4),
                                    num_steps=5),
            # Other parameters
            discount=0.99,
            entropy_regularization=1e-2,
            gae_lambda=None,
            likelihood_ratio_clipping=0.2)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Callback invoked after each finished episode; the run stops once it returns False
        def callback(r):
            return float(np.mean(r.episode_rewards[-100:])) <= 180.0

        # Start the runner
        runner.run(num_episodes=1000,
                   max_episode_timesteps=200,
                   callback=callback)
        runner.close()

        if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
            sys.stdout.write('Test failed, exceeding {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=False)
        else:
            sys.stdout.write('Test passed after {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=True)
Code example #5
File: test_vpg_agent.py Project: et0803/tensorforce
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('VPG discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #6
    def test_multi(self):
        passed = 0

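        # One two-layer dense tower per state input; the four towers are merged
        # by elementwise multiplication into a single embedding.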
        def network_builder(inputs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
            state3 = layer(x=layer(x=inputs['state3'], size=32), size=32)
            return state0 * state1 * state2 * state3

        for _ in xrange(5):
            environment = MinimalTest(definition=[
                False, (False, 2), (False, (1, 2)), (True, (1, 2))
            ])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 50 or not all(
                    x >= 1.0 for x in r.episode_rewards[-50:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('VPG agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #7
    def test_save_restore(self):
        environment_spec = {"float": ()}
        environment = create_environment(environment_spec)
        network_spec = [dict(type='dense', size=32)]
        agent = create_agent(environment, network_spec)
        runner = Runner(agent=agent, environment=environment)

        runner.run(episodes=100)
        model_values = agent.model.session.run(
            agent.model.get_variables(include_submodules=True,
                                      include_nontrainable=False))
        save_path = agent.model.save(directory=self._tmp_dir_path + "/model")
        print("Saved at: %s" % (save_path, ))
        runner.close()

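        # Re-create an identical agent, restore the saved weights, and verify
        # that every variable round-trips exactly.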
        agent = create_agent(environment, network_spec)
        agent.model.restore(directory="", file=save_path)
        restored_model_values = agent.model.session.run(
            agent.model.get_variables(include_submodules=True,
                                      include_nontrainable=False))
        assert len(model_values) == len(restored_model_values)
        assert all([
            np.array_equal(v1, v2)
            for v1, v2 in zip(model_values, restored_model_values)
        ])

        agent.close()
Code example #8
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(batch_size=8,
                                   cg_iterations=20,
                                   cg_damping=0.001,
                                   line_search_steps=20,
                                   max_kl_divergence=0.05,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=10000, episode_finished=episode_finished)
            print('TRPO Agent (continuous): ' + str(runner.episode))

            if runner.episode < 10000:
                passed += 1
                print('passed')
            else:
                print('failed')

        print('TRPO continuous agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #9
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.0005,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = CategoricalDQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('Categorical DQN agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('Categorical DQN agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #10
def main():
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    agent.close()
    env.close()
Code example #11
    def train_and_test(self, agent, early_stop=-1, n_tests=15):
        n_train = TIMESTEPS // n_tests
        i = 0
        runner = Runner(agent=agent, environment=self)

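        # Alternate between a training run on the train split and a
        # deterministic evaluation pass on the test split.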
        try:
            while i <= n_tests:
                self.use_dataset(Mode.TRAIN)
                runner.run(timesteps=n_train, max_episode_timesteps=n_train)
                self.use_dataset(Mode.TEST)
                self.run_deterministic(runner, print_results=True)
                if early_stop > 0:
                    advantages = np.array(
                        self.acc.episode.advantages[-early_stop:])
                    if i >= early_stop and np.all(advantages > 0):
                        i = n_tests
                i += 1
        except KeyboardInterrupt:
            # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're
            # keeping an eye on terminal and see "there! right there, stop you found it!" (where early_stop & n_tests
            # are the more methodical approaches)
            pass

        # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
        print('Running no-kill test-set')
        self.use_dataset(Mode.TEST, no_kill=True)
        self.run_deterministic(runner, print_results=True)
Code example #12
File: test_trpo_agent.py Project: et0803/tensorforce
    def test_discrete(self):
        passed = 0

        # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.0001,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #13
    def test_replay(self):
        environment = MinimalTest(definition=[(False, (1, 2))])
        config = Configuration(batch_size=8,
                               learning_rate=0.001,
                               memory_capacity=50,
                               memory=dict(type='replay',
                                           random_sampling=True),
                               first_update=20,
                               target_update_frequency=10,
                               states=environment.states,
                               actions=environment.actions,
                               network=layered_network_builder([
                                   dict(type='dense', size=32),
                                   dict(type='dense', size=32)
                               ]))
        agent = DQNAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x / l >= reward_threshold for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Replay memory DQN: ' + str(runner.episode))
Code example #14
def main():
    bad_seeds_environment, agent = set_up()
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    bad_seeds_environment.close()
    agent.close()
Code example #15
File: test_vpg_agent.py Project: ddfan/tensorforce
    def test_discrete_baseline(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (discrete): ' + str(runner.episode))

            if runner.episode < 1500:
                passed += 1

        print('VPG agent (discrete) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #16
    def test_multi(self):
        passed = 0

        def network_builder(inputs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[False, (False, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = CategoricalDQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 15 or not all(
                    x >= 1.0 for x in r.episode_rewards[-15:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('Categorical DQN agent (multi-state/action): ' +
                  str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('Categorical DQN agent (multi-state/action) passed = {}'.format(
            passed))
        self.assertTrue(passed >= 2)
Code example #17
    def test_beta(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            actions = environment.actions
            actions['min_value'] = -0.5
            actions['max_value'] = 1.5

            config = Configuration(batch_size=8,
                                   learning_rate=0.01,
                                   states=environment.states,
                                   actions=actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                    r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1500, episode_finished=episode_finished)
            print('VPG agent (beta): ' + str(runner.episode))
            if runner.episode < 1500:
                passed += 1

        print('VPG agent (beta) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #18
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('VPG agent (continuous): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('VPG agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #19
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('NAF agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('NAF agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #20
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   keep_last=True,
                                   learning_rate=0.001,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = DQNNstepAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN Nstep agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN Nstep agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #21
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(
                batch_size=8,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x / l >= reward_threshold for x, l in zip(r.episode_rewards[-100:],
                                                                                            r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('TRPO agent (continuous): ' + str(runner.episode))

            if runner.episode < 1000:
                passed += 1

        print('TRPO agent (continuous) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #22
def main(
        time_limit=None,
        scoring="default",
        batch_size=16,
        gpu_idx=0,
        env_version=1,
        seed_count=9,
        max_count=10,
        out_path=None,
        num_episodes=int(3 * 10**3),
):
    env, agent = set_up(
        time_limit=time_limit,
        scoring=scoring,
        batch_size=batch_size,
        gpu_idx=gpu_idx,
        env_version=env_version,
        seed_count=seed_count,
        max_count=max_count,
        out_path=out_path,
    )

    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=num_episodes)
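    # Default to the current working directory when no output path is given.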
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.save(directory=str(out_path / "saved_models"))
    agent.close()
    env.close()
Code example #23
File: test_dqn_agent.py Project: et0803/tensorforce
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
            )
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #24
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
Code example #25
def load_agent(
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=1,
    seed_count=9,
    max_count=10,
    out_path=None,
):
    env, agent = set_up(
        time_limit=time_limit,
        scoring=scoring,
        batch_size=batch_size,
        gpu_idx=gpu_idx,
        env_version=env_version,
        seed_count=seed_count,
        max_count=max_count,
        out_path=out_path,
    )
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.restore(directory=str(out_path / "saved_models"))
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=20)
    return agent
Code example #26
class Player:
    """Mandatory class with the player methods"""
    def __init__(self, name='ppo_agent', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.ppo_agent = None
        self.poker_env = Environment.create(environment=env,
                                            max_episode_timesteps=100)
        self.runner = None

        if load_model:
            self.load(load_model)

    def load(self, model_name):
        print("Loading model...")
        self.ppo_agent = Agent.load(directory=model_name, format='hdf5')

    def start_step_policy(self, observation):
        log.info("Random action")
        _ = observation
        action = self.poker_env.action_space.sample()
        return action

    def train(self, model_name, num_ep=500):

        print('Training...')
        self.runner = Runner(agent='ppo.json',
                             environment=dict(type=self.poker_env),
                             num_parallel=5,
                             remote='multiprocessing')
        self.runner.run(num_episodes=num_ep)
        self.runner.agent.save(directory=model_name, format='hdf5')
        self.runner.close()

    def play(self, model_name, num_ep=5):
        self.load(model_name)

        print('Evaluating...')
        self.runner = Runner(agent=self.ppo_agent,
                             environment=dict(type=self.poker_env))
        self.runner.run(num_episodes=num_ep, evaluation=True)
        self.runner.close()

    def action(self, action_space, observation, info):
        _ = observation
        _ = info

        this_player_action_space = {
            Action.FOLD, Action.CHECK, Action.CALL, Action.RAISE_POT,
            Action.RAISE_HALF_POT, Action.RAISE_2POT
        }
        action = this_player_action_space.intersection(set(action_space))

        return action
Code example #27
    def test_runner_evaluation(self):
        states = dict(type='float', shape=(1,))

        actions = dict(type='int', shape=(), num_values=3)

        agent, environment = self.prepare(name='runner-evaluation', states=states, actions=actions)

        runner = Runner(agent=agent, environment=environment)

        self.num_evaluations = 0
        evaluation_frequency = 3
        max_evaluation_timesteps = 2
        num_evaluation_iterations = 2

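        # Called after each evaluation (every evaluation_frequency episodes); checks
        # that the evaluation ran num_evaluation_iterations episodes of at most
        # max_evaluation_timesteps timesteps each.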
        def evaluation_callback(r):
            self.num_evaluations += 1
            self.assertEqual(r.episode, self.num_evaluations * evaluation_frequency)
            self.assertEqual(len(r.evaluation_timesteps), num_evaluation_iterations)
            for num_timesteps in r.evaluation_timesteps:
                self.assertLessEqual(num_timesteps, max_evaluation_timesteps)

        runner.run(
            num_episodes=10, evaluation_callback=evaluation_callback,
            evaluation_frequency=evaluation_frequency,
            max_evaluation_timesteps=max_evaluation_timesteps,
            num_evaluation_iterations=num_evaluation_iterations
        )

        runner.close()
        sys.stdout.flush()
        self.assertTrue(expr=True)
Code example #28
File: test_trpo_agent.py Project: et0803/tensorforce
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (continuous): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO continuous agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #29
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   repeat_update=4,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=5000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 5000:
                passed += 1
                print('passed')
            else:
                print('failed')

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #30
    def test_discrete(self):
        passed = 0

        # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.0001,
                                   cg_iterations=20,
                                   cg_damping=0.001,
                                   line_search_steps=20,
                                   max_kl_divergence=0.05,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #31
def main():

    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,     # changed from 20 to 100 for agent_03
        exploration=0.05,  # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

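    # Interleave training and evaluation: ten rounds of 10000 training
    # episodes, each followed by 1000 evaluation episodes.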
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
Code example #32
    def train_and_test(self, agent, n_steps, n_tests, early_stop):
        test_acc = self.acc.tests
        n_steps = n_steps * 10000
        test_acc.n_tests = n_tests
        test_acc.i = 0
        timesteps_each = n_steps // n_tests
        runner = Runner(agent=agent, environment=self)

        try:
            while test_acc.i <= n_tests:
                self.use_dataset(Mode.TRAIN)
                # max_episode_timesteps not required, since we kill on (cash|value)<0 or max_repeats
                runner.run(timesteps=timesteps_each)
                self.use_dataset(Mode.TEST)
                self.run_deterministic(runner, print_results=True)
                if early_stop > 0:
                    sharpes = np.array(self.acc.episode.sharpes[-early_stop:])
                    if test_acc.i >= early_stop and np.all(sharpes > 0):
                        test_acc.i = n_tests
                test_acc.i += 1
        except KeyboardInterrupt:
            # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're
            # keeping an eye on terminal and see "there! right there, stop you found it!" (where early_stop & n_steps
            # are the more methodical approaches)
            pass

        # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
        print('Running no-kill test-set')
        self.use_dataset(Mode.TEST, full_set=True)
        self.run_deterministic(runner, print_results=True)
Code example #33
    def test_restore_from_checkpoint(self):
        saver_steps = 15
        steps_per_episode = 20
        train_episodes = 2

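        # Ensure the run does not end exactly on a checkpoint boundary, so the
        # restored timestep is strictly older than the final state.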
        assert ((steps_per_episode + 1) * train_episodes % saver_steps) > 0

        environment = DummyEnv()
        network_spec = [
            dict(type='dense', size=4)
        ]
        model_path = self._tmp_dir_path + "/model_auto_save"

        saver_spec = dict(
            directory=model_path,
            steps=saver_steps,
            load=False
        )
        agent = create_agent(environment, network_spec, saver_spec)
        runner = Runner(agent=agent, environment=environment)

        runner.run(max_episode_timesteps=steps_per_episode, episodes=train_episodes)
        # Deliberately avoid closing the runner/agent to simulate unexpected shutdown

        agent = create_agent(environment, network_spec)
        agent.restore_model(directory=model_path)
        agent.reset()
        expected_timestep = train_episodes * (steps_per_episode + 1) // saver_steps * saver_steps
        assert agent.episode == train_episodes - 1
        assert agent.timestep == expected_timestep

        runner = Runner(agent=agent, environment=environment)
        runner.run(max_episode_timesteps=steps_per_episode, episodes=train_episodes)
        assert agent.episode == 2 * train_episodes - 1
        runner.close()
Code example #34
def main():
    parser = argparse.ArgumentParser(description="Train an IBM agent")
    parser.add_argument("--render",
                        default=False,
                        action='store_true',
                        help="Whether to render or not. Defaults to False.")
    args = parser.parse_args()

    for n_simple in [3]:  #[1, 2, 3]:

        agent, environment = make_agent_env(1, n_simple, args.render)
        agent = restore_agent(agent)

        # Run
        runner = Runner(agent=agent, environment=environment)
        while True:
            runner.run(episodes=100, max_episode_timesteps=2000)
            ave_reward = np.mean(runner.episode_rewards)
            print("Average reward: %f with %d SimpleAgents" %
                  (ave_reward, n_simple))

            directory = os.path.join(os.getcwd(), "log", "agent")
            runner.agent.save_model(directory=directory)

            if ave_reward > 0 and n_simple < 3:
                break
            if ave_reward > 0.9:
                break

        try:
            runner.close()
        except AttributeError:
            pass
Code example #35
File: test_naf_agent.py Project: et0803/tensorforce
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Code example #36
File: test_dqfd_agent.py Project: et0803/tensorforce
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[dict(type='dense', size=32, l2_regularization=0.0001)])
            )
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Code example #37
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                loglevel='info',
                batch_size=100,
                baseline='mlp',
                baseline_args=None,
                baseline_kwargs=dict(
                    size=32,
                    repeat_update=100
                ),
                override_line_search=False,
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                cg_iterations=20,
                cg_damping=0.01,
                line_search_steps=20,
                max_kl_divergence=0.005,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # Stop once the mean reward over the last 50 episodes exceeds 50, i.e. learning took off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Code example #38
File: openai_universe.py Project: et0803/tensorforce
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()

    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.loglevel])

    stack = None

    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    # Note: the --monitor flag is commented out above, so guard the lookup
    if getattr(args, 'monitor', None):
        env.gym.monitor.close()
    env.close()
Code example #39
File: openai_gym.py Project: et0803/tensorforce
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # configurable!!!

    environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        agent_config = Configuration()
        logger.info("No agent configuration provided.")
    if args.network_config:
        network = from_json(args.network_config)
    else:
        network = None
        logger.info("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions, network=network))
    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if args.monitor:
        environment.gym.monitor.close()
    environment.close()
Code example #40
File: quickstart.py Project: et0803/tensorforce
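# Hedged reconstruction: the excerpt begins mid-Configuration, so the imports,
# environment setup, and the opening of the TRPOAgent construction are assumed
# here from the same-era quickstart; the module paths and batch_size value are
# best guesses, not part of the original excerpt.
import numpy as np

from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.core.networks import layered_network_builder
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner

# Create an OpenAI Gym environment
env = OpenAIGym('CartPole-v0')

# Create a Trust Region Policy Optimization agent; the remaining keyword
# arguments continue below from the original excerpt
agent = TRPOAgent(config=Configuration(
    batch_size=100,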
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([dict(type='dense', size=32, activation='tanh'),
                                     dict(type='dense', size=32, activation='tanh')])
))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.timestep,
                                                                                 reward=r.episode_rewards[-1]))
    return True


# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)

# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(ep=runner.episode,
                                                                                                   ar=np.mean(
                                                                                                       runner.episode_rewards[
                                                                                                       -100:])))
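
The callback above always returns True, so the runner only stops at the episode or timestep limit. The same hook can also end training early by returning False. Below is a minimal sketch of such a variant; the 100-episode window and the 195.0 reward threshold are arbitrary illustrative values, not part of the original script.

    # Hypothetical early-stopping variant: returning False from the callback
    # makes runner.run() terminate before the episode limit is reached.
    def episode_finished_early_stop(r):
        if len(r.episode_rewards) >= 100 and np.mean(r.episode_rewards[-100:]) >= 195.0:
            print("Stopping early after {ep} episodes.".format(ep=r.episode))
            return False
        return True
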
Code example #41
def main():
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game",
                        default="pommerman",
                        help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFA-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=True,
                        help="Whether to render or not. Defaults to True.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id+1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if isinstance(agent, TensorForceAgent):
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps,
          runner.episode_times)

    try:
        runner.close()
    except AttributeError:
        # Some Runner versions may not implement close(); ignore if absent.
        pass
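
WrappedEnv and clean_up_agents above are defined elsewhere in the project. For orientation only, here is a rough sketch of what a TensorForce-style environment wrapper of this kind generally looks like, assuming the classic Environment interface (reset/execute/close) and omitting the multi-agent bookkeeping the real wrapper performs; none of the names below come from the original source.

    # Hypothetical minimal wrapper: adapts a Gym-style environment to the
    # reset/execute/close interface that Runner drives.
    class MinimalWrappedEnv(object):

        def __init__(self, gym_env, visualize=False):
            self.gym_env = gym_env
            self.visualize = visualize

        def reset(self):
            return self.gym_env.reset()

        def execute(self, actions):
            if self.visualize:
                self.gym_env.render()
            state, reward, terminal, _ = self.gym_env.step(actions)
            return state, terminal, reward

        def close(self):
            self.gym_env.close()
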
Code example #42
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true', help="Internal flag: run as a spawned worker process")
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = ['mkdir -p {}'.format(args.logdir),
                    'rm -f {}/kill.sh'.format(args.logdir),
                    'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                    'chmod +x {}/kill.sh'.format(args.logdir)]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(
        states=environment.states,
        actions=environment.actions,
        network=from_json(args.network_config)
    ))

    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index))
    ))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.loglevel])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        cluster_spec=cluster_spec,
        task_index=args.task_index
    )

    report_episodes = max(1, args.episodes // 1000)  # guard against modulo-by-zero for short runs
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
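
For concreteness, with --num-workers set to 2 the loop above produces the following cluster layout (a worked illustration of the script's own logic, not captured output):

    # Equivalent to what the script builds for two workers: one parameter
    # server on port 12222 and one worker per consecutive port after it.
    cluster = {
        'ps': ['127.0.0.1:12222'],
        'worker': ['127.0.0.1:12223', '127.0.0.1:12224'],
    }
    cluster_spec = tf.train.ClusterSpec(cluster)
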
Code example #43
File: lab_main.py Project: et0803/tensorforce
def main():
    parser = argparse.ArgumentParser()

    # N.b. if ran from within lab, the working directory is something like lab/bazel-out/../../tensorforce
    # Hence, relative paths will not work without first fetching the path of this run file
    parser.add_argument('-id', '--level-id', default='tests/demo_map', help="DeepMind Lab level id")
    parser.add_argument('-a', '--agent', default='VPGAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs")

    # Redirect output to file
    sys.stdout = open('lab_output.txt', 'w')

    args = parser.parse_args()

    environment = DeepMindLab(args.level_id)

    path = os.path.dirname(__file__)
    if args.agent_config:
        # Note: the path is concatenated directly, so the config argument is
        # expected to start with a path separator relative to this file
        agent_config = Configuration.from_json(path + args.agent_config, True)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions,
                              network=from_json(path + args.network_config, True)))

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # TODO: make the log level configurable

    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError("Cannot save agent to dir {}: {}".format(save_dir, e))

    report_episodes = max(1, args.episodes // 1000)  # guard against modulo-by-zero for short runs

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))

    environment.close()
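
One quirk of the script above: sys.stdout is reassigned for the entire process and never restored. A sketch of a more contained alternative (not from the original source) that works under both Python 2 and 3:

    import sys

    # Hypothetical variant: scope the redirection and restore the original
    # stream once main() returns, even on error.
    old_stdout = sys.stdout
    sys.stdout = open('lab_output.txt', 'w')
    try:
        main()
    finally:
        sys.stdout.close()
        sys.stdout = old_stdout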