Example no. 1
    def __init__(self, fruit_environment, **kwargs):
        super().__init__()
        if isinstance(fruit_environment, BaseEnvironment):
            self.environment = fruit_environment
            if self.environment.get_processor() is not None:
                raise ValueError('Do not use a state processor with Tensorforce!')
        else:
            raise ValueError('Environment must be from FruitAPI!')

        self.__max_episode_timesteps = False

        # Map the FruitAPI state space to a Tensorforce float states spec
        state_space = self.environment.get_state_space()
        self.states_spec = dict(type='float', shape=tuple(state_space.get_shape()))

        # Discrete action spec sized by the number of available actions
        action_ranges, _ = self.environment.get_action_space().get_range()
        self.actions_spec = dict(type='int', num_values=len(action_ranges))

        self.__timesteps = 0

        # For wrapped Gym environments, defer to Tensorforce's OpenAIGym helpers
        if isinstance(fruit_environment, GymEnvironment):
            self.__max_episode_timesteps = None
            _, self.__max_episode_timesteps = OpenAIGym.create_level(level=self.environment.env_name,
                                                                     max_episode_timesteps=self.__max_episode_timesteps,
                                                                     reward_threshold=None, tags=None, **kwargs)

            self.states_spec = OpenAIGym.specs_from_gym_space(
                space=self.environment.env.observation_space, ignore_value_bounds=True)

            self.actions_spec = OpenAIGym.specs_from_gym_space(
                space=self.environment.env.action_space, ignore_value_bounds=False)
Example no. 2
    def test_quickstart(self):
        sys.stdout.write('\nQuickstart:\n')
        sys.stdout.flush()

        # Create an OpenAI-Gym environment
        environment = OpenAIGym('CartPole-v1')

        # Create the agent
        agent = PPOAgent(
            states=environment.states(),
            actions=environment.actions(),
            # Automatically configured network
            network='auto',
            # Memory sampling most recent experiences, with a capacity of 6100 timesteps
            # (6100 > [30 batch episodes] * [200 max timesteps per episode])
            memory=6100,
            # Update every 10 episodes, with a batch of 30 episodes
            update_mode=dict(unit='episodes', batch_size=30, frequency=10),
            # PPO optimizer
            step_optimizer=dict(type='adam', learning_rate=1e-3),
            # PPO multi-step optimization: 10 updates, each based on a third of the batch
            subsampling_fraction=0.33,
            optimization_steps=10,
            # MLP baseline
            baseline_mode='states',
            baseline=dict(type='network', network='auto'),
            # Baseline optimizer
            baseline_optimizer=dict(type='multi_step',
                                    optimizer=dict(type='adam',
                                                   learning_rate=1e-4),
                                    num_steps=5),
            # Other parameters
            discount=0.99,
            entropy_regularization=1e-2,
            gae_lambda=None,
            likelihood_ratio_clipping=0.2)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Callback invoked after each finished episode; returning False stops the run
        # once the mean reward over the last 100 episodes exceeds 180
        def callback(r):
            return float(np.mean(r.episode_rewards[-100:])) <= 180.0

        # Start the runner
        runner.run(num_episodes=1000,
                   max_episode_timesteps=200,
                   callback=callback)
        runner.close()

        if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
            sys.stdout.write('Test failed, exceeding {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=False)
        else:
            sys.stdout.write('Test passed after {} episodes\n'.format(
                runner.episode))
            sys.stdout.flush()
            self.assertTrue(expr=True)
Example no. 3
    def get_states(self):
        state = self.environment.get_state()
        if isinstance(self.environment, GymEnvironment):
            state = OpenAIGym.flatten_state(state=state, states_spec=self.states_spec)
        else:
            # Normalize byte-valued states to the range [0, 1]
            state = state.astype(dtype=np.float32) / 255.0
        return state
def main():
    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(),
        actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33,
        optimization_steps=10,
        # MLP baseline
        baseline_mode='states',
        baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(type='multi_step',
                                optimizer=dict(type='adam',
                                               learning_rate=1e-4),
                                num_steps=5),
        # Other parameters
        discount=0.99,
        entropy_regularization=1e-2,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200)
    runner.close()
Example no. 5
    def __init__(self,
                 level,
                 visualize=False,
                 monitor_directory=None,
                 **kwargs):
        import retro

        self._max_episode_timesteps = False

        self.level = level
        self.visualize = visualize

        # Create the Gym Retro environment for the given game level
        self.environment = retro.make(game=self.level, **kwargs)

        # Optionally record episodes to the given monitor directory
        if monitor_directory is not None:
            self.environment = gym.wrappers.Monitor(
                env=self.environment, directory=monitor_directory)

        self.states_spec = OpenAIGym.specs_from_gym_space(
            space=self.environment.observation_space,
            ignore_value_bounds=True  # TODO: not ignore?
        )
        self.actions_spec = OpenAIGym.specs_from_gym_space(
            space=self.environment.action_space, ignore_value_bounds=False)
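
The class that owns this constructor is not shown in the excerpt; assuming it is a Gym Retro wrapper along the lines of Tensorforce's OpenAIRetro, a minimal usage sketch could look as follows (the class name and the 'Airstriker-Genesis' game, the ROM bundled with gym-retro, are illustrative assumptions):

environment = OpenAIRetro(level='Airstriker-Genesis', visualize=False)
print(environment.states_spec)
print(environment.actions_spec)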
Example no. 6
    def execute(self, actions):
        if isinstance(self.environment, GymEnvironment):
            actions = OpenAIGym.unflatten_action(action=actions)

        reward = self.environment.step(actions)
        terminal = self.environment.is_terminal()
        self.__timesteps += 1
        if self.__max_episode_timesteps is not None:
            if self.__timesteps > self.__max_episode_timesteps:
                # Tensorforce convention: terminal == 2 marks an episode aborted
                # by the timestep limit rather than a true terminal state
                terminal = 2
            elif terminal:
                terminal = 1
            else:
                terminal = 0

        states = self.get_states()

        return states, terminal, reward
Example no. 7
def main():
    # Optionally enable debug logging for MineRL
    # logging.basicConfig(level=logging.DEBUG)

    # Create the environment
    ENV_NAME = "MineRLTreechop-v0"

    # Pre-defined or custom environment
    env = gym.make(ENV_NAME)

    environment = OpenAIGym(env)

    agent = Agent.create(agent='ac',
                         environment=environment,
                         max_episode_timesteps=8000,
                         exploration=0.03,
                         critic_optimizer='evolutionary')

    sum_rewards = 0.0
    rewards_by_episode = []
    for episode in range(200):
        states = environment.reset()
        terminal = False
        print("Starting episode " + str(episode))
        while not terminal:
            # evaluation=True: the agent acts greedily and is not updated
            actions = agent.act(states=states, evaluation=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
            # print(actions)
        print("Sum reward so far: " + str(sum_rewards))
        rewards_by_episode.append((episode, sum_rewards))
        print("Ending episode ", episode)
    print(rewards_by_episode)
    print('Mean episode reward:', sum_rewards / 200)

    agent.close()
    environment.close()
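
This snippet omits its module-level imports; a plausible header, assuming the packages the code refers to (note that importing minerl is what registers the MineRLTreechop-v0 id with Gym), would be:

import logging

import gym
import minerl  # registers the MineRL environment ids with Gym
from tensorforce import Agent
from tensorforce.environments import OpenAIGym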
def main():
    parser = argparse.ArgumentParser()
    # Gym arguments
    parser.add_argument('-g', '--gym', help="Gym environment id")
    parser.add_argument('-i',
                        '--import-modules',
                        help="Import module(s) required for gym environment")
    parser.add_argument('--monitor',
                        type=str,
                        default=None,
                        help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize',
                        action='store_true',
                        default=False,
                        help="Enable OpenAI Gym's visualization")
    # Agent arguments
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    # Runner arguments
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(gym_id=args.gym,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video,
                            visualize=args.visualize)

    agent = Agent.from_spec(spec=args.agent,
                            states=environment.states(),
                            actions=environment.actions(),
                            network=args.network)

    runner = Runner(agent=agent, environment=environment)

    def callback(r):
        if r.episode % 100 == 0:
            print("================================================\n"
                  "Average secs/episode over 100 episodes: {time:0.2f}\n"
                  "Average steps/sec over 100 episodes:    {timestep:0.2f}\n"
                  "Average reward over 100 episodes:       {reward100:0.2f}\n"
                  "Average reward over 500 episodes:       {reward500:0.2f}".
                  format(time=(sum(r.episode_times[-100:]) / 100.0),
                         timestep=(sum(r.episode_timesteps[-100:]) /
                                   sum(r.episode_times[-100:])),
                         reward100=(sum(r.episode_rewards[-100:]) /
                                    min(100.0, r.episode)),
                         reward500=(sum(r.episode_rewards[-500:]) /
                                    min(500.0, r.episode))))
        return True

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               callback=callback)

    runner.close()
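
Given the flags defined above, a typical invocation of this runner script (the script and configuration file names are assumptions) might be:

python run.py --gym CartPole-v1 -a configs/ppo.json -n configs/mlp2_network.json -e 1000 -m 200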
Example no. 9
def conversion_2():
    env = OpenAIGym(level='FrozenLake8x8-v0', visualize=False)
    create_random_agent(env)
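
create_random_agent is not defined in this excerpt; a minimal sketch of such a helper, written against the Tensorforce environment API used elsewhere in these examples (the body is an assumption, not the original implementation), could be:

def create_random_agent(env, num_episodes=10):
    # Drive the environment with Tensorforce's built-in 'random' agent
    agent = Agent.create(agent='random', environment=env)
    for _ in range(num_episodes):
        states = env.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = env.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
    agent.close()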
Example no. 10
def compatible_2():
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    fruit_env = GymEnvironment(env_name='CartPole-v1')
    state = fruit_env.get_state_space()
    print(state.get_range())
    print(tuple(state.get_shape()))
    print(fruit_env.get_action_space().get_range())
    print(fruit_env.reset())
    print(fruit_env.get_state())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = OpenAIGym(level='CartPole-v1')
    state = env.states()
    print(state)
    print(env.actions())
    print(env.reset())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')

    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
    env = TFEnvironment(fruit_environment=fruit_env)
    print(env.states())
    print(env.actions())
    print(env.get_states())
    print(env.execute(0))
    print(env.max_episode_timesteps())
    print('+++++++++++++++++++++++++++++++++++++++++++++++++')
Example no. 11
from tensorforce import Agent, Environment
from tensorforce.agents import PPOAgent
from tensorforce.environments import OpenAIGym

# Pre-defined or custom environment
# environment = Environment.create(
#     environment='gym', level='CartPole', max_episode_timesteps=500
# )


# environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)
environment = OpenAIGym('LunarLanderContinuous-v2', visualize=True, max_episode_steps=500)
# environment = OpenAIGym('BipedalWalker-v3', visualize=False, max_episode_steps=500)


agent = Agent.create(
    agent='ppo', environment=environment, batch_size=10,
    network=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    learning_rate=1e-3,
    name='agent_loader'
)
# import ipdb;ipdb.set_trace()
agent = agent.load()

running_score = 0.0
# Train for up to 50000 episodes
for i_epoch in range(50000):
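    # --- The original listing is cut off at the loop header above. The body below is
    # --- a minimal sketch of a typical Tensorforce act/execute/observe episode,
    # --- an illustrative assumption rather than the original code.
    states = environment.reset()
    terminal = False
    episode_score = 0.0
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        episode_score += reward
    running_score = 0.99 * running_score + 0.01 * episode_score
    print('Episode {}: score {:.2f} (running {:.2f})'.format(i_epoch, episode_score, running_score))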
Example no. 12
from tensorforce import Agent, Environment
from tensorforce.agents import PPOAgent
from tensorforce.environments import OpenAIGym

# Pre-defined or custom environment
# environment = Environment.create(
#     environment='gym', level='CartPole', max_episode_timesteps=500
# )

# Network as list of layers
network_spec = [
    dict(type='dense', size=32, activation='tanh'),
    dict(type='dense', size=32, activation='tanh')
]

environment = OpenAIGym('CartPole-v0', visualize=True, max_episode_steps=500)

# Instantiate a Tensorforce agent
# agent = Agent.create(
#     agent='tensorforce',
#     environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
#     memory=10000,
#     update=dict(unit='timesteps', batch_size=64),
#     optimizer=dict(type='adam', learning_rate=3e-4),
#     policy=dict(network='auto'),
#     objective='policy_gradient',
#     reward_estimation=dict(horizon=20)
# )

agent = Agent.create(agent='ppo',
                     environment=environment,
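                     # --- The original listing is cut off here; the remaining arguments
                     # --- and the run loop below are a plausible completion
                     # --- (an assumption, not the original code).
                     network=network_spec,
                     batch_size=10,
                     learning_rate=1e-3)

# Minimal sketch of training with a Runner (assumes
# `from tensorforce.execution import Runner` at module level)
runner = Runner(agent=agent, environment=environment)
runner.run(num_episodes=300)
runner.close()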
Example no. 13
import gym
from tensorforce import Agent, Runner
from tensorforce.environments import OpenAIGym, Environment
from gym.wrappers import Monitor
from matplotlib import animation
import matplotlib.pyplot as plt
import random
import time

random.seed(time.time_ns())

EPISODES = 1

environment = OpenAIGym(
    level='CartPole-v1',
    visualize=True,
    visualize_directory=f'visualization_{random.randint(0, 1000)}',
)

agent = Agent.load(directory='model', environment=environment)

# runner = Runner(agent=agent, environment=environment, max_episode_timesteps=500)
# runner.run(num_episodes=EPISODES, evaluation=True)
# runner.close()

sum_rewards = 0.0
for _ in range(EPISODES):
    states = environment.reset()
    internals = agent.initial_internals()
    terminal = False
    while not terminal:
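        # --- The original listing ends at the loop header above. The body below is a
        # --- minimal sketch of Tensorforce's standard evaluation loop (an assumption,
        # --- not the original code).
        actions, internals = agent.act(
            states=states, internals=internals, independent=True, deterministic=True)
        states, terminal, reward = environment.execute(actions=actions)
        sum_rewards += reward

# Closing sketch (also an assumption): report the mean evaluation reward and clean up
print('Mean evaluation reward:', sum_rewards / EPISODES)
agent.close()
environment.close()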