Example #1
def main():

    # maximum reward is approximately max_episode_length / seed_count = 10
    bad_seeds_environment = Environment.create(
        environment=BadSeeds01, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_01_env_01/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
Example #2
    def __init__(self,
                 environment: 'TradingEnvironment',
                 agent_spec: any,
                 save_best_agent: bool = True,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner should automatically save the best agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(agent=agent_spec,
                                   environment=self._environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=save_best_agent)
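
This constructor wraps a gym-compatible trading environment, builds the Tensorforce agent from a specification, and wires both into a Runner. A minimal usage sketch, assuming a hypothetical TensorforceTradingStrategy class defining the __init__ above and an existing TradingEnvironment instance named env:

# Hypothetical usage sketch; the class name, `env` instance, and agent settings are assumptions.
agent_spec = {
    "agent": "ppo",           # Tensorforce agent type
    "batch_size": 10,
    "learning_rate": 1e-3,
}
strategy = TensorforceTradingStrategy(
    environment=env,               # a TradingEnvironment instance
    agent_spec=agent_spec,
    save_best_agent=True,
    max_episode_timesteps=2000,    # forwarded through **kwargs
)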
Example #3
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('VPG Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('VPG discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #4
    def test_multi(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            return state0 * state1

        for _ in xrange(5):
            environment = MinimalTest(definition=[True, (True, 2)])
            config = Configuration(batch_size=16,
                                   learning_rate=0.00025,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 20 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-20:], r.episode_lengths[-20:]))

            runner.run(episodes=10000, episode_finished=episode_finished)
            print('NAF agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 10000:
                passed += 1

        print('NAF agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 0)
Example #5
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=50,
                                   memory='replay',
                                   first_update=20,
                                   repeat_update=4,
                                   target_update_frequency=10,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder(
                                       [dict(type='dense', size=32)]))
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('Replay DQN: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('Replay DQN passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #6
    def test_discrete(self):
        passed = 0

        # TRPO can occasionally have numerical issues so we allow for 1 in 5 to fail on Travis
        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.0001,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (discrete): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO discrete agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #7
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(definition=True)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   exploration=dict(type='ornstein_uhlenbeck'),
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('NAF agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('NAF agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #8
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
            )
            agent = DQNAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQN Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQN Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #9
def main():
    gym_id = 'CartPole-v0'
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    max_episodes = 10000
    max_timesteps = 1000

    env = OpenAIGymEnvironment(gym_id, monitor=False, monitor_video=False)

    config = Config({
        'repeat_actions': 1,
        'actions': env.actions,
        'action_shape': env.action_shape,
        'state_shape': env.state_shape,
        'exploration': 'constant',
        'exploration_args': [0.1]
    })

    agent = SimpleQAgent(config, "simpleq")

    runner = Runner(agent, env)

    def episode_finished(r):
        if r.episode % 10 == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 10 rewards: {}".format(np.mean(r.episode_rewards[-10:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(max_episodes, max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))
Example #10
    def train_and_test(self, agent, early_stop=-1, n_tests=15):
        n_train = TIMESTEPS // n_tests
        i = 0
        runner = Runner(agent=agent, environment=self)

        try:
            while i <= n_tests:
                self.use_dataset(Mode.TRAIN)
                runner.run(timesteps=n_train, max_episode_timesteps=n_train)
                self.use_dataset(Mode.TEST)
                self.run_deterministic(runner, print_results=True)
                if early_stop > 0:
                    advantages = np.array(
                        self.acc.episode.advantages[-early_stop:])
                    if i >= early_stop and np.all(advantages > 0):
                        i = n_tests
                i += 1
        except KeyboardInterrupt:
            # Lets us kill training with Ctrl-C and skip straight to the final test. This is useful in case you're
            # keeping an eye on the terminal and see "there! right there, stop, you found it!" (where early_stop &
            # n_tests are the more methodical approaches)
            pass

        # On last "how would it have done IRL?" run, without getting in the way (no killing on repeats, 0-balance)
        print('Running no-kill test-set')
        self.use_dataset(Mode.TEST, no_kill=True)
        self.run_deterministic(runner, print_results=True)
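
The loop above alternates training on the train split with a deterministic pass over the test split, and stops early once the last early_stop evaluations all show positive advantages. A hedged sketch of how this method might be called (the agent specification is an assumption):

# Hypothetical usage; `env` is the environment instance that defines train_and_test.
agent = Agent.create(agent="ppo", environment=env, batch_size=10)
env.train_and_test(agent, early_stop=3, n_tests=15)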
Example #11
def main():
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=10000)
    agent.save(directory="saved_models")
    agent.close()
    env.close()
Example #12
    def test_continuous(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                cg_iterations=20,
                cg_damping=0.001,
                line_search_steps=20,
                max_kl_divergence=0.05,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            )
            agent = TRPOAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('TRPO Agent (continuous): ' + str(runner.episode))

            if runner.episode < 2000:
                passed += 1

        print('TRPO continuous agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #13
    def play(self, model_name, num_ep=5):
        self.load(model_name)

        print('Evaluating...')
        self.runner = Runner(agent=self.ppo_agent,
                             environment=dict(type=self.poker_env))
        self.runner.run(num_episodes=num_ep, evaluation=True)
        self.runner.close()
Example #14
    def test_multi(self):
        passed = 0

        def network_builder(inputs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'], size=32), size=32)
            state1 = layer(x=layer(x=inputs['state1'], size=32), size=32)
            state2 = layer(x=layer(x=inputs['state2'], size=32), size=32)
            return state0 * state1 * state2

        for _ in xrange(5):
            environment = MinimalTest(
                definition=[False, (False, 2), (False, (1, 2))])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   demo_memory_capacity=100,
                                   demo_sampling_ratio=0.2,
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = dict(action0=1, action1=(1, 1), action2=((1, 1), ))
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 50 or not all(
                    x >= 1.0 for x in r.episode_rewards[-50:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #15
    def unittest(self,
                 num_updates=None,
                 num_episodes=None,
                 num_timesteps=None,
                 environment=None,
                 min_timesteps=None,
                 states=None,
                 actions=None,
                 exclude_bool_action=False,
                 exclude_int_action=False,
                 exclude_float_action=False,
                 exclude_bounded_action=False,
                 require_observe=False,
                 require_all=False,
                 **agent):
        """
        Generic unit-test.
        """
        agent, environment = self.prepare(
            environment=environment,
            min_timesteps=min_timesteps,
            states=states,
            actions=actions,
            exclude_bool_action=exclude_bool_action,
            exclude_int_action=exclude_int_action,
            exclude_float_action=exclude_float_action,
            exclude_bounded_action=exclude_bounded_action,
            require_observe=require_observe,
            require_all=require_all,
            **agent)

        self.runner = Runner(agent=agent, environment=environment)

        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) <= 1
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = self.__class__.num_updates
            num_episodes = self.__class__.num_episodes
            num_timesteps = self.__class__.num_timesteps
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = 2
        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) == 1

        evaluation = not any([
            require_all, require_observe, self.__class__.require_all,
            self.__class__.require_observe
        ])
        self.runner.run(num_episodes=num_episodes,
                        num_timesteps=num_timesteps,
                        num_updates=num_updates,
                        use_tqdm=False,
                        evaluation=evaluation)
        self.runner.close()
        agent.close()
        environment.close()

        self.finished_test()
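
The assertions above enforce that at most one run budget (num_updates, num_episodes, or num_timesteps) is passed explicitly and that exactly one remains set after falling back to the class-level defaults. A hedged sketch of a concrete test built on this helper (the forwarded agent keyword arguments are assumptions, not required settings):

    # Hypothetical test method in a subclass; the agent kwargs are illustrative only.
    def test_agent_smoke(self):
        self.unittest(
            num_updates=2,                # run exactly two agent updates
            exclude_bounded_action=True,  # skip bounded-float actions
            memory=100, batch_size=4      # forwarded to the agent via **agent
        )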
Example #16
    def environment(self, environment: 'TradingEnvironment'):
        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=self._save_best_agent)
Example #17
def main():
    #tensorforce
    env = OpenAIGym('JacoArm-v0')

    agent = TRPOAgent(states_spec=env.states,
                      actions_spec=env.actions,
                      network_spec=network_spec,
                      batch_size=512)

    # agent = PPOAgent(
    # 	states_spec=env.states,
    # 	actions_spec=env.actions,
    # 	network_spec=network_spec,
    # 	batch_size=512,
    # 	step_optimizer=dict(
    # 		type='adam',
    # 		learning_rate=1e-4
    # 	)
    # )

    runner = Runner(agent=agent, environment=env)

    raw_input("hit enter when gazebo is loaded...")
    print()
    env.gym.unpause()
    env.gym.hold_init_robot_pos([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])
    runner.run(episodes=1500,
               max_episode_timesteps=1000,
               episode_finished=episode_finished)

    #old-fashioned way
    # env = gym.make('JacoArm-v0')
    # print "launching the world..."
    # #gz loaing issues, let user start the learning
    # raw_input("hit enter when gazebo is loaded...")
    # env.set_physics_update(0.0001, 10000)
    # raw_input("hit enter when gazebo is loaded...")

    # # env.set_goal([0.167840578046, 0.297489331432, 0.857454500127])

    # total_episodes = 100
    # action = [1,1,1,1,1,1,1,1,1,1]
    # x = 0
    # # for x in range(total_episodes):
    # while True:
    # 	# if x % 10 is 0:
    # 	action = numpy.random.rand(1, 10)[0]
    # 		# print 'new action is', action

    # 	state, reward, done, _ = env.step(action)
    # 	print reward
    # 	time.sleep(0.2)
    # 	x += 1

    write_to_csv(train_data, 'test.csv')
    env.close()
Example #18
    def test_discrete(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(definition=False)
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   memory_capacity=800,
                                   first_update=80,
                                   target_update_frequency=20,
                                   demo_memory_capacity=100,
                                   demo_sampling_ratio=0.2,
                                   memory=dict(type='replay',
                                               random_sampling=True),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=layered_network_builder([
                                       dict(type='dense', size=32),
                                       dict(type='dense', size=32)
                                   ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #19
    def train(self, model_name, num_ep=500):

        print('Training...')
        self.runner = Runner(agent='ppo.json',
                             environment=dict(type=self.poker_env),
                             num_parallel=5,
                             remote='multiprocessing')
        self.runner.run(num_episodes=num_ep)
        self.runner.agent.save(directory=model_name, format='hdf5')
        self.runner.close()
Example #20
def main(
    *,
    time_limit=None,
    scoring="default",
    batch_size=16,
    gpu_idx=0,
    env_version=2,
    out_path=None,
    num_episodes=int(3 * 10 ** 3),
):
    """
    A self-contained setup of the environment and run.
    Can be used to create all of the figures in the associated reference for variable batch size and
    variable time limit. All experiments use 10 'seeds'.

    Parameters
    ----------
    time_limit : int, None
        Turn time limit for episode
    scoring : str in {'t22', 'tt5', 'monotonic', 'linear', 'square', 'default'}
        Name of reward function
    batch_size : int
        Batch size for training
    gpu_idx : int
        Optional index for GPU
    env_version : int in {1, 2}
        Environment version. 1 being ideal time, 2 being time limited
    out_path : path
        Top-level directory for output of models and checkpoints
    num_episodes : int
        Number of episodes to learn over

    Returns
    -------
    None

    """
    env, agent = set_up(
        time_limit=time_limit,
        scoring=scoring,
        batch_size=batch_size,
        gpu_idx=gpu_idx,
        env_version=env_version,
        out_path=out_path,
    )
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=num_episodes)
    if out_path is None:
        out_path = Path()
    else:
        out_path = Path(out_path).expanduser()
    agent.save(directory=str(out_path / "saved_models"))
    agent.close()
    env.close()
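
Given the parameter documentation above, a typical invocation might look like the following; the argument values are illustrative only, not the reference settings:

# Illustrative call with assumed values.
main(
    time_limit=200,          # cap each episode at 200 turns
    scoring="linear",        # one of the documented reward functions
    batch_size=32,
    env_version=2,           # time-limited environment
    out_path="~/bad_seeds_runs",
    num_episodes=3000,
)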
Example #21
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[
                    dict(type='dense', size=32, l2_regularization=0.0001)
                ]))
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state,
                                     action=action,
                                     reward=reward,
                                     terminal=terminal,
                                     internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #22
    def restore_agent(self, directory: str, filename: str = None):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            directory: The `str` path of the directory the agent checkpoint is stored in.
            filename (optional): The `str` path of the file the agent specification is stored in.
                The `.json` file extension will be automatically appended if not provided.
        """
        self._agent = Agent.load(directory, filename=filename)

        self._runner = Runner(agent=self._agent, environment=self._environment)
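
restore_agent reloads a previously saved agent from disk and rebuilds the Runner around the existing environment. A hedged usage sketch (the strategy variable and directory layout are assumptions):

# Hypothetical usage; the directory and filename follow Agent.save conventions.
strategy.restore_agent(directory="saved_models", filename="best_agent")
# The strategy can now continue training or evaluating through the rebuilt Runner.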
Example #23
    def test_example(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        passed = 0
        for _ in xrange(3):

            # Create an OpenAI-Gym environment
            environment = OpenAIGym('CartPole-v0')

            # Network specification for the model
            network_spec = [
                dict(type='dense', size=32),
                dict(type='dense', size=32)
            ]

            # Create the agent
            agent = PPOAgent(states_spec=environment.states,
                             actions_spec=environment.actions,
                             network_spec=network_spec,
                             batch_size=4000,
                             step_optimizer=dict(type='adam',
                                                 learning_rate=1e-2),
                             optimization_steps=5,
                             discount=0.99,
                             normalize_rewards=False,
                             entropy_regularization=0.01,
                             likelihood_ratio_clipping=0.2)

            # Initialize the runner
            runner = Runner(agent=agent, environment=environment)

            # Function handle called after each finished episode
            def episode_finished(r):
                # A mean reward of at least 50 over the last 50 episodes indicates that learning took off
                mean_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or mean_reward < 50.0

            # Start the runner
            runner.run(episodes=2000,
                       max_episode_timesteps=200,
                       episode_finished=episode_finished)

            sys.stdout.write('episodes: {}\n'.format(runner.episode))
            sys.stdout.flush()

            # Test passed if episode_finished handle evaluated to False
            if runner.episode < 2000:
                passed += 1

        sys.stdout.write('==> passed: {}\n'.format(passed))
        sys.stdout.flush()
        self.assertTrue(passed >= 2)
Example #24
    def test_runner_callback(self):
        states = dict(type='float', shape=(1,))

        actions = dict(type='int', shape=(), num_values=3)

        agent, environment = self.prepare(name='runner-callback', states=states, actions=actions)
        environment.timestep_range = (6, 10)

        runner = Runner(agent=agent, environment=environment)

        callback_episode_frequency = 2
        self.num_callbacks = 0

        def callback(r):
            self.num_callbacks += 1
            self.assertEqual(r.episode, self.num_callbacks * callback_episode_frequency)

        runner.run(
            num_episodes=10, callback=callback,
            callback_episode_frequency=callback_episode_frequency
        )

        callback_timestep_frequency = 3
        self.num_callbacks = 0

        def callback(r):
            self.num_callbacks += 1
            self.assertEqual(r.episode_timestep, self.num_callbacks * callback_timestep_frequency)

        runner.run(
            num_episodes=11, callback=callback,
            callback_timestep_frequency=callback_timestep_frequency
        )

        self.is_callback1 = False
        self.is_callback2 = False

        def callback1(r):
            self.is_callback1 = True

        def callback2(r):
            self.is_callback2 = True

        runner.run(
            num_episodes=12, callback=[callback1, callback2],
            callback_timestep_frequency=callback_timestep_frequency
        )

        self.assertTrue(expr=(self.is_callback1 and self.is_callback2))

        runner.close()
        sys.stdout.flush()
        self.assertTrue(expr=True)
Example #25
def main():

    bad_seeds_environment = Environment.create(environment=Bollux,
                                               seed_count=10,
                                               bad_seed_count=3,
                                               max_episode_length=100)

    # 20200820-223031
    # 20200820-233243

    # batch_size 1000 does not get smarter or dumber
    # batch_size 100 20200821-095410 gets dumber
    # try batch size 10000 !

    agent = Agent.create(
        agent="a2c",
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        #exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        #entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_04_bollux_1000000/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_04_bollux_1000000/checkpoints',
            frequency=6000  # save checkpoint every 6000 seconds (100 minutes)
        ),
    )

    # this is the batch_size = 10000 version
    # I hope it is the last env 04
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000000)
    #for i in range(100):
    #    print("running 10000 episodes")
    #    runner.run(num_episodes=10000)
    #    print("saving the agent")
    #    directory = Path(f"saved_models/agent_04_env_04_1000000/10000_{i}/checkpoints")
    #    if directory.exists():
    #        directory.rmdir()
    #    directory.mkdir(parents=True, exist_ok=True)
    #    agent.save(directory=str(directory), format="numpy")

    bad_seeds_environment.close()
    agent.close()
Example #26
    def test_continuous(self):
        environment = MinimalTest(definition=True)
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(
                x >= 1.0 for x in r.episode_rewards[-100:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (continuous): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #27
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Proximal Policy Optimization agent
            agent = PPOAgent(config=Configuration(
                log_level='info',
                batch_size=256,

                memory=dict(
                    type='prioritized_replay',
                ),
                update_frequency=256,
                first_update=512,

                learning_rate=0.0001,
                optimizer_batch_size=64,
                normalize_rewards=False,
                gae_rewards=False,
                baseline=dict(
                    type="mlp",
                    sizes=[32, 32],
                    epochs=1,
                    update_batch_size=64,
                    learning_rate=0.001
                ),
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # A mean reward of at least 50 over the last 50 episodes indicates that learning took off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #28
class Player:
    """Mandatory class with the player methods"""
    def __init__(self, name='ppo_agent', load_model=None, env=None):
        """Initialization of an agent"""
        self.equity_alive = 0
        self.actions = []
        self.last_action_in_stage = ''
        self.temp_stack = []
        self.name = name
        self.autoplay = True

        self.ppo_agent = None
        self.poker_env = Environment.create(environment=env,
                                            max_episode_timesteps=100)
        self.runner = None

        if load_model:
            self.load(load_model)

    def load(self, model_name):
        print("Loading model...")
        self.ppo_agent = Agent.load(directory=model_name, format='hdf5')

    def start_step_policy(self, observation):
        log.info("Random action")
        _ = observation
        action = self.poker_env.action_space.sample()
        return action

    def train(self, model_name, num_ep=500):

        print('Training...')
        self.runner = Runner(agent='ppo.json',
                             environment=dict(type=self.poker_env),
                             num_parallel=5,
                             remote='multiprocessing')
        self.runner.run(num_episodes=num_ep)
        self.runner.agent.save(directory=model_name, format='hdf5')
        self.runner.close()

    def play(self, model_name, num_ep=5):
        self.load(model_name)

        print('Evaluating...')
        self.runner = Runner(agent=self.ppo_agent,
                             environment=dict(type=self.poker_env))
        self.runner.run(num_episodes=num_ep, evaluation=True)
        self.runner.close()

    def action(self, action_space, observation, info):
        _ = observation
        _ = info

        this_player_action_space = {
            Action.FOLD, Action.CHECK, Action.CALL, Action.RAISE_POT,
            Action.RAISE_HALF_POT, Action.RAISE_2POT
        }
        action = this_player_action_space.intersection(set(action_space))

        return action
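
The Player class ties together the poker environment, a PPO agent loaded from disk, and Tensorforce runners for training and evaluation. A short hedged driver sketch (the environment object and model directory are assumptions):

# Hypothetical driver; `holdem_env` and the model directory are assumptions.
player = Player(name="ppo_agent", env=holdem_env)
player.train(model_name="ppo_poker", num_ep=500)   # 5 parallel runners, saved as HDF5
player.play(model_name="ppo_poker", num_ep=5)      # evaluation-only episodes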
Example #29
    def test_multi_baseline(self):
        passed = 0

        def network_builder(inputs, **kwargs):
            layer = layers['dense']
            state0 = layer(x=layer(x=inputs['state0'],
                                   size=32,
                                   scope='state0-1'),
                           size=32,
                           scope='state0-2')
            state1 = layer(x=layer(x=inputs['state1'],
                                   size=32,
                                   scope='state1-1'),
                           size=32,
                           scope='state1-2')
            state2 = layer(x=layer(x=inputs['state2'],
                                   size=32,
                                   scope='state2-1'),
                           size=32,
                           scope='state2-2')
            return state0 * state1 * state2

        for _ in xrange(5):
            environment = MinimalTest(
                definition=[False, (False, 2), (True, 2)])
            config = Configuration(batch_size=8,
                                   learning_rate=0.001,
                                   baseline=dict(type="mlp",
                                                 sizes=[32, 32],
                                                 epochs=5,
                                                 update_batch_size=8,
                                                 learning_rate=0.01),
                                   states=environment.states,
                                   actions=environment.actions,
                                   network=network_builder)
            agent = VPGAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x / l >= reward_threshold for x, l in zip(
                        r.episode_rewards[-100:], r.episode_lengths[-100:]))

            runner.run(episodes=4000, episode_finished=episode_finished)
            print('VPG agent (multi-state/action): ' + str(runner.episode))
            if runner.episode < 4000:
                passed += 1

        print('VPG agent (multi-state/action) passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #30
    def test_discrete(self):
        environment = MinimalTest(definition=False)
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 100 or not all(x / l >= 0.9 for x, l in zip(
                r.episode_rewards[-100:], r.episode_lengths[-100:]))

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (discrete): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #31
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(
                    x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Example #32
    def base_test_pass(self, name, environment, network_spec, **kwargs):
        """
        Basic test loop, requires an Agent to achieve a certain performance on an environment.
        """

        sys.stdout.write('\n{} ({}):'.format(self.__class__.agent.__name__, name))
        sys.stdout.flush()

        passed = 0
        for _ in xrange(3):

            if self.__class__.requires_network:
                agent = self.__class__.agent(
                    states_spec=environment.states,
                    actions_spec=environment.actions,
                    network_spec=network_spec,
                    **kwargs
                )
            else:
                agent = self.__class__.agent(
                    states_spec=environment.states,
                    actions_spec=environment.actions,
                    **kwargs
                )

            runner = Runner(agent=agent, environment=environment)

            self.pre_run(agent=agent, environment=environment)

            def episode_finished(r):
                episodes_passed = [
                    rw / ln >= self.__class__.pass_threshold
                    for rw, ln in zip(r.episode_rewards[-100:], r.episode_timesteps[-100:])
                ]
                return r.episode < 100 or not all(episodes_passed)

            runner.run(episodes=3000, deterministic=self.__class__.deterministic, episode_finished=episode_finished)

            sys.stdout.write(' ' + str(runner.episode))
            sys.stdout.flush()
            if all(rw / ln >= self.__class__.pass_threshold
                    for rw, ln in zip(runner.episode_rewards[-100:], runner.episode_timesteps[-100:])):
                passed += 1
            if passed == 2:
                break

        sys.stdout.write(' ==> {} passed\n'.format(passed))
        sys.stdout.flush()
        self.assertTrue(passed >= 2)
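
base_test_pass counts an attempt as passed once the last 100 episodes all reach the class-level pass_threshold on a per-timestep basis, and gives each configuration up to three attempts. A hedged sketch of a concrete test built on it (the MinimalTest environment and the two-layer network are assumptions):

    # Hypothetical subclass usage; environment and network choices are illustrative only.
    def test_discrete(self):
        environment = MinimalTest(definition=False)
        network_spec = [dict(type='dense', size=32), dict(type='dense', size=32)]
        self.base_test_pass(name='discrete', environment=environment,
                            network_spec=network_spec, batch_size=8)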
Example #33
    def test_multi(self):
        environment = MinimalTest(
            definition=[False, (False, 2), (False, (1, 2)), (True, (1, 2))])
        config = Configuration(states=environment.states,
                               actions=environment.actions)
        agent = RandomAgent(config=config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(r):
            return r.episode < 20 or not all(x >= 1.0
                                             for x in r.episode_rewards[-20:])

        runner.run(episodes=1000, episode_finished=episode_finished)
        print('Random agent (multi-state/action): ' + str(runner.episode))
        self.assertTrue(runner.episode == 1000)
Example #34
    def test_dqfd_agent(self):
        passed = 0

        for _ in xrange(5):
            environment = MinimalTest(continuous=False)
            config = Configuration(
                batch_size=16,
                learning_rate=0.001,
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                demo_memory_capacity=100,
                demo_sampling_ratio=0.1,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder(layers_config=[dict(type='dense', size=32, l2_regularization=0.0001)])
            )
            agent = DQFDAgent(config=config)

            # First generate demonstration data and pretrain
            demonstrations = list()
            terminal = True

            for n in xrange(50):
                if terminal:
                    state = environment.reset()
                action = 1
                state, reward, terminal = environment.execute(action=action)
                demonstration = dict(state=state, action=action, reward=reward, terminal=terminal, internal=[])
                demonstrations.append(demonstration)

            agent.import_demonstrations(demonstrations)
            agent.pretrain(steps=1000)

            # Normal training
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=1000, episode_finished=episode_finished)
            print('DQFD Agent: ' + str(runner.episode))
            if runner.episode < 1000:
                passed += 1

        print('DQFD Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 4)
Example #35
    def test_naf_agent(self):

        passed = 0
        for _ in xrange(5):
            environment = MinimalTest(continuous=True)
            config = Configuration(
                batch_size=8,
                learning_rate=0.001,
                exploration=dict(type='ornstein_uhlenbeck'),
                memory_capacity=800,
                first_update=80,
                repeat_update=4,
                target_update_frequency=20,
                clip_gradients=1.0,
                states=environment.states,
                actions=environment.actions,
                network=layered_network_builder([dict(type='dense', size=32)])
                # batch_size=8,
                # learning_rate=0.0025,
                # # exploration="OrnsteinUhlenbeckProcess",
                # # exploration_kwargs=dict(
                # #     sigma=0.1,
                # #     mu=0,
                # #     theta=0.1
                # # ),
                # discount=0.99,
                # memory_capacity=800,
                # first_update=80,
                # repeat_update=4,
                # target_update_frequency=20,
                # states=environment.states,
                # actions=environment.actions,
                # clip_gradients=5.0,
                # network=layered_network_builder([dict(type='dense', size=32), dict(type='dense', size=32)])
            )
            agent = NAFAgent(config=config)
            runner = Runner(agent=agent, environment=environment)

            def episode_finished(r):
                return r.episode < 100 or not all(x >= 1.0 for x in r.episode_rewards[-100:])

            runner.run(episodes=2000, episode_finished=episode_finished)
            print('NAF Agent: ' + str(runner.episode))
            if runner.episode < 2000:
                passed += 1

        print('NAF Agent passed = {}'.format(passed))
        self.assertTrue(passed >= 3)
Example #36
    def run_experiment(self, environment, experiment_num=0):
        config = copy(self.config)

        max_episodes = config.pop('max_episodes')
        max_episode_timesteps = config.pop('max_episode_timesteps')

        network_spec = config.pop('network')

        agent = Agent.from_spec(
            spec=config,
            kwargs=dict(
                states_spec=environment.states,
                actions_spec=environment.actions,
                network_spec=network_spec
            )
        )

        if experiment_num == 0 and self.history_data:
            logging.info("Attaching history data to runner")
            history_data = self.history_data
        else:
            history_data = None

        if experiment_num == 0 and self.load_model_file:
            logging.info("Loading model data from file: {}".format(self.load_model))
            agent.load_model(self.load_model_file)

        runner = Runner(
            agent=agent,
            environment=environment,
            repeat_actions=1,
            history=history_data
            # save_path=args.model,
            # save_episodes=args.save_model
        )

        environment.reset()
        agent.reset()

        runner.run(episodes=max_episodes, max_episode_timesteps=max_episode_timesteps,
                   episode_finished=self.episode_finished)

        return dict(
            initial_reset_time=0,
            episode_rewards=runner.episode_rewards,
            episode_timesteps=runner.episode_timesteps,
            episode_end_times=runner.episode_times
        )
Example #37
    def test_example(self):
        passed = 0

        for _ in xrange(3):
            # Create an OpenAI Gym environment
            env = OpenAIGym('CartPole-v0')

            # Create a Trust Region Policy Optimization agent
            agent = TRPOAgent(config=Configuration(
                loglevel='info',
                batch_size=100,
                baseline='mlp',
                baseline_args=None,
                baseline_kwargs=dict(
                    size=32,
                    repeat_update=100
                ),
                override_line_search=False,
                generalized_advantage_estimation=True,
                normalize_advantage=False,
                gae_lambda=0.97,
                cg_iterations=20,
                cg_damping=0.01,
                line_search_steps=20,
                max_kl_divergence=0.005,
                states=env.states,
                actions=env.actions,
                network=layered_network_builder([
                    dict(type='dense', size=32, activation='tanh'),
                    dict(type='dense', size=32, activation='tanh')
                ])
            ))
            runner = Runner(agent=agent, environment=env)

            def episode_finished(r):
                # A mean reward of at least 50 over the last 50 episodes indicates that learning took off
                avg_reward = np.mean(r.episode_rewards[-50:])
                return r.episode < 100 or avg_reward < 50.0

            runner.run(episodes=2000, max_timesteps=200, episode_finished=episode_finished)

            if runner.episode < 2000:
                passed += 1

        print('Quick start example passed = {}'.format(passed))
        self.assertTrue(passed >= 2)
Example #38
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux', help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = ['mkdir -p {}'.format(args.logdir),
                    'rm -f {}/kill.sh'.format(args.logdir),
                    'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                    'chmod +x {}/kill.sh'.format(args.logdir)]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(
        states=environment.states,
        actions=environment.actions,
        network=from_json(args.network_config)
    ))

    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1 else '/job:worker/task:{}/cpu:0'.format(args.task_index))
    ))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.loglevel])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        cluster_spec=cluster_spec,
        task_index=args.task_index
    )

    report_episodes = max(1, args.episodes // 1000)  # avoid modulo-by-zero for small episode counts
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
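# --- Usage sketch: assumed contents of the --agent-config / --network-config files -
# The script above requires JSON files for --agent-config and --network-config and
# reads agent_config.loglevel. The keys below are an assumption rather than the
# canonical schema (it depends on the Tensorforce version); the layer list mirrors
# the dense/tanh network used elsewhere in this listing.
import json

agent_config = {
    "loglevel": "info",
    "batch_size": 32,        # hypothetical value
    "learning_rate": 0.001,  # hypothetical value
}
network_config = [
    {"type": "dense", "size": 32, "activation": "tanh"},
    {"type": "dense", "size": 32, "activation": "tanh"},
]

with open("agent_config.json", "w") as f:
    json.dump(agent_config, f, indent=2)
with open("network_config.json", "w") as f:
    json.dump(network_config, f, indent=2)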
Ejemplo n.º 39
0
def main():
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game",
                        default="pommerman",
                        help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFA-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=True,
                        help="Whether to render or not. Defaults to True.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id+1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if isinstance(agent, TensorForceAgent):
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps,
          runner.episode_times)

    try:
        runner.close()
    except AttributeError:
        # Some Runner versions do not expose close(); nothing to clean up then.
        pass
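# --- Sketch of the WrappedEnv adapter used above (an assumption, not the -----------
# --- playground's verbatim implementation) ------------------------------------------
# WrappedEnv adapts the multi-agent Pommerman env to Tensorforce's single-agent
# interface: opponents act through the env, the learner supplies the training agent's
# action, and execute() returns (state, terminal, reward) for that agent only.
# featurize() is a hypothetical helper for turning an observation dict into a vector.
import numpy as np
from tensorforce.contrib.openai_gym import OpenAIGym  # import path varies by version


def featurize(obs):
    # Hypothetical helper: use only the board grid as a flat float vector.
    return np.asarray(obs['board'], dtype=np.float32).flatten()


class WrappedEnv(OpenAIGym):
    def __init__(self, gym, visualize=False):
        self.gym = gym
        self.visualize = visualize

    def execute(self, actions):
        if self.visualize:
            self.gym.render()
        obs = self.gym.get_observations()
        all_actions = self.gym.act(obs)                       # opponents pick their actions
        all_actions.insert(self.gym.training_agent, actions)  # slot in the learner's action
        state, reward, terminal, _ = self.gym.step(all_actions)
        agent_state = featurize(state[self.gym.training_agent])
        agent_reward = reward[self.gym.training_agent]
        return agent_state, terminal, agent_reward

    def reset(self):
        obs = self.gym.reset()
        return featurize(obs[self.gym.training_agent])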
Ejemplo n.º 40
0
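# The snippet below is truncated in the source: it starts in the middle of an agent
# configuration. A plausible preamble is sketched here as an assumption (imports and
# a hypothetical CartPole environment); the reconstructed agent line follows.
import numpy as np
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.core.networks import layered_network_builder
from tensorforce.execution import Runner
from tensorforce.environments.openai_gym import OpenAIGym  # import path varies by version

env = OpenAIGym('CartPole-v0')  # hypothetical gym id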
agent = TRPOAgent(config=Configuration(  # agent construction reconstructed (see note above)
    gae_lambda=0.97,
    cg_iterations=20,
    cg_damping=0.01,
    line_search_steps=20,
    max_kl_divergence=0.005,
    gamma=0.97,
    continuous=False,
    preprocessing=None,
    states=env.states,
    actions=env.actions,
    network=layered_network_builder([dict(type='dense', size=32, activation='tanh'),
                                     dict(type='dense', size=32, activation='tanh')])
))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.timestep,
                                                                                 reward=r.episode_rewards[-1]))
    return True


# Start learning
runner.run(episodes=3000, max_timesteps=200, episode_finished=episode_finished)

# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(ep=runner.episode,
                                                                                                   ar=np.mean(
Ejemplo n.º 41
0
def main():
    parser = argparse.ArgumentParser()

    # N.B. if run from within lab, the working directory is something like lab/bazel-out/../../tensorforce.
    # Hence, relative paths will not work without first fetching the path of this run file.
    parser.add_argument('-id', '--level-id', default='tests/demo_map', help="DeepMind Lab level id")
    parser.add_argument('-a', '--agent', default='VPGAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs")

    # Redirect output to file
    sys.stdout = open('lab_output.txt', 'w')

    args = parser.parse_args()

    environment = DeepMindLab(args.level_id)

    path = os.path.dirname(__file__)
    if args.agent_config:
        # Resolve the config path relative to this file's directory
        agent_config = Configuration.from_json(os.path.join(path, args.agent_config), True)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions,
                              network=from_json(os.path.join(path, args.network_config), True)))

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # configurable!!!

    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError("Cannot save agent to dir {} ({})".format(save_dir, e))

    report_episodes = max(1, args.episodes // 1000)

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))

    environment.close()
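# --- Variation (a sketch, not from the source) --------------------------------------
# The Lab script above redirects sys.stdout for the whole process before even parsing
# arguments. A more contained alternative with the same intent (capture the run's
# prints into lab_output.txt) can use contextlib.redirect_stdout:
import contextlib

def run_with_logged_stdout(runner, episodes, max_timesteps, episode_finished):
    with open('lab_output.txt', 'w') as log_file, contextlib.redirect_stdout(log_file):
        runner.run(episodes, max_timesteps, episode_finished=episode_finished)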
Ejemplo n.º 42
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000*60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()

    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.loglevel])

    stack = None

    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError("Cannot save agent to dir {} ({})".format(save_dir, e))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if getattr(args, 'monitor', None):  # the --monitor flag is commented out above
        env.gym.monitor.close()
    env.close()
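# --- Usage sketch: assumed --network-config contents for this script ----------------
# Unlike the earlier distributed gym script, this one reads the network spec via
# Configuration.from_json(...).network_layers, so the JSON needs a top-level
# "network_layers" key. The layer schema below is an assumption.
import json

network_config = {
    "network_layers": [
        {"type": "dense", "size": 32, "activation": "tanh"},
        {"type": "dense", "size": 32, "activation": "tanh"},
    ]
}
with open("network_config.json", "w") as f:
    json.dump(network_config, f, indent=2)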
Ejemplo n.º 43
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # configurable!!!

    environment = OpenAIGym(args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        agent_config = Configuration()
        logger.info("No agent configuration provided.")
    if args.network_config:
        network = from_json(args.network_config)
    else:
        network = None
        logger.info("No network configuration provided.")
    agent_config.default(dict(states=environment.states, actions=environment.actions, network=network))
    agent = agents[args.agent](config=agent_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError("Cannot save agent to dir {} ({})".format(save_dir, e))

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1,
        save_path=args.save,
        save_episodes=args.save_episodes
    )

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logger.info("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    if args.monitor:
        environment.gym.monitor.close()
    environment.close()
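# --- Variation (a sketch, not from the source) ---------------------------------------
# Every script in this listing keeps training as long as episode_finished returns
# True; returning False stops the Runner. The same hook can therefore implement
# early stopping (the threshold and window below are hypothetical):
def make_early_stop(target_reward=195.0, window=100):
    def episode_finished(r):
        rewards = r.episode_rewards
        if len(rewards) < window:
            return True
        return sum(rewards[-window:]) / window < target_reward
    return episode_finished

# runner.run(args.episodes, args.max_timesteps, episode_finished=make_early_stop())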