def initialize(self, env):
    from gym import spaces
    from tensorforce.agents import PPOAgent

    self.env = env
    if self.algorithm == "ppo":
        if isinstance(env.action_space, spaces.Tuple):
            # Note: the action type must be the string 'int', not the Python
            # built-in int (a bug in the original snippet)
            actions = {
                str(num): {'type': 'int', 'num_actions': space.n}
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_actions=env.action_space.n)
        self.agent = PPOAgent(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=actions,
            network=[
                dict(type='dense', size=64),
                dict(type='dense', size=64)
            ],
            batching_capacity=1000,
            step_optimizer=dict(type='adam', learning_rate=1e-4))
        self.restore_model_if_exists(self.checkpoint)
        return self.agent
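# Hedged sketch of the restore_model_if_exists helper the method above calls;
# the original implementation is not shown here. It mirrors the checkpoint
# check used in the CartPole main() snippet further down: restore only when a
# TensorFlow checkpoint file already exists in the given directory.
def restore_model_if_exists(self, checkpoint):
    import os
    if checkpoint and os.path.exists(os.path.join(checkpoint, 'checkpoint')):
        self.agent.restore_model(directory=checkpoint)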
def __init__(self, frame_shape=None, game_inputs=None):
    if frame_shape is None:
        raise SerpentError("A 'frame_shape' tuple kwarg is required...")

    states_spec = {"type": "float", "shape": frame_shape}

    if game_inputs is None:
        raise SerpentError("A 'game_inputs' dict kwarg is required...")

    self.game_inputs = game_inputs
    self.game_inputs_mapping = self._generate_game_inputs_mapping()

    actions_spec = {"type": "int", "num_actions": len(self.game_inputs)}

    network_spec = [
        {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
        {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
        {"type": "conv2d", "size": 64, "window": 3, "stride": 1},
        {"type": "flatten"},
        {"type": "dense", "size": 512}
    ]

    self.agent = PPOAgent(
        states_spec=states_spec,
        actions_spec=actions_spec,
        batched_observe=128,
        scope="ppo",
        summary_spec=None,
        network_spec=network_spec,
        device=None,
        session_config=None,
        saver_spec=None,
        distributed_spec=None,
        discount=0.99,
        variable_noise=None,
        states_preprocessing_spec=None,
        explorations_spec=None,
        reward_preprocessing_spec=None,
        distributions_spec=None,
        entropy_regularization=1e-2,
        batch_size=128,
        keep_last_timestep=True,
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        likelihood_ratio_clipping=None,
        step_optimizer=None,
        optimization_steps=10
    )
def initialize(self, env, parallel_interactions=1, summarizer=None, saver=None):
    from gym import spaces
    from tensorforce.agents import PPOAgent

    self.env = env
    if self.algorithm == "ppo":
        if isinstance(env.action_space, spaces.Tuple):
            # The action type must be the string 'int', not the built-in int
            actions = {
                str(num): {'type': 'int', 'num_values': space.n}
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_values=env.action_space.n)
        self.tf_agent = PPOAgent(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=actions,
            max_episode_timesteps=2000,
            network=[
                dict(type='dense', size=64),
                dict(type='dense', size=64)
            ],
            # critic_network=[
            #     dict(type='dense', size=64),
            #     dict(type='dense', size=64)
            # ],
            parallel_interactions=parallel_interactions,
            summarizer=summarizer,
            saver=saver,
            execution={
                'num_parallel': 64,
                'type': 'single',
                'session_config': None,
                'distributed_spec': None
            },
            batch_size=10)
            # batching_capacity=1000,
            # step_optimizer=dict(type='adam', learning_rate=1e-4))
        return self.tf_agent
    return None
def get_ppo_agent():
    return PPOAgent(
        states=dict(type='float', shape=(5,)),
        actions=dict(type='int', num_actions=2),
        network=[
            dict(type='dense', size=20, activation='tanh'),
            dict(type='dense', size=20, activation='tanh'),
        ],
        # batch_size=256,          # BatchAgent
        # keep_last_timestep=True, # PPOAgent
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        optimization_steps=10,
        # Model
        scope='ppo',
        discount=0.99,
        # DistributionModel
        # distributions_spec=None,
        entropy_regularization=0.01,
        # PGModel
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        # PGLRModel
        likelihood_ratio_clipping=0.2,
        # summary_spec=None,
        # distributed_spec=None,
    )
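# Minimal usage sketch for get_ppo_agent() above (hedged: `env` stands for a
# hypothetical gym-style environment with reset()/step(); the act/observe
# calls follow the same Tensorforce 0.x API as the other snippets here).
def run_ppo_agent(env, episodes=10):
    agent = get_ppo_agent()
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(states=state)             # query the policy
            state, reward, done, _ = env.step(action)    # advance the environment
            agent.observe(reward=reward, terminal=done)  # feed back the result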
def test_quickstart(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()

    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(),
        actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33,
        optimization_steps=10,
        # MLP baseline
        baseline_mode='states',
        baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(type='adam', learning_rate=1e-4),
            num_steps=5
        ),
        # Other parameters
        discount=0.99,
        entropy_regularization=1e-2,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Callback, called after each finished episode: keep running while the
    # mean reward over the last 100 episodes is at most 180
    def callback(r):
        return float(np.mean(r.episode_rewards[-100:])) <= 180.0

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200, callback=callback)
    runner.close()

    if float(np.mean(runner.episode_rewards[-100:])) <= 180.0:
        sys.stdout.write('Test failed, exceeding {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=False)
    else:
        sys.stdout.write('Test passed after {} episodes\n'.format(runner.episode))
        sys.stdout.flush()
        self.assertTrue(expr=True)
def get_ppo_agent(environment, *args, **kwargs):
    with open('config/cnn_network.json', 'r') as infile:
        network = json.load(infile)

    agent = PPOAgent(
        states=environment.states,
        actions=environment.actions,
        network=network,
        memory={
            "type": "latest",
            "capacity": 40000,
            "include_next_states": False,
        },
        actions_exploration={
            "type": "epsilon_anneal",
            "initial_epsilon": 1.0,
            "final_epsilon": 0.05,
            "timesteps": int(1e7),
        },
        saver={
            "directory": "checkpoint/ppo",
            "seconds": 1800,
        },
    )
    return agent
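# Hedged illustration of what config/cnn_network.json might contain; the real
# file is not shown, so the layer sizes below are only an example, written in
# the same conv2d/flatten/dense style as the other network specs in this file:
# [
#     {"type": "conv2d", "size": 32, "window": 8, "stride": 4},
#     {"type": "conv2d", "size": 64, "window": 4, "stride": 2},
#     {"type": "flatten"},
#     {"type": "dense", "size": 512}
# ]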
def ppo(env):
    return PPOAgent(
        states=dict(type='float', shape=env.state_representation.get_shape()),
        actions=dict(type='int', num_actions=env.env.action_space.N_ACTIONS),
        # Automatically configured network
        # network=dict(type='auto', size=32, depth=2, internal_rnn=True),
        network=[
            dict(type='dense', size=128),
            dict(type='dense', size=128),
            dict(type='dense', size=128)
        ],
        # Update every 5 episodes, with a batch of 10 episodes
        update_mode=dict(unit='episodes', batch_size=10, frequency=5),
        # Memory sampling most recent experiences, with a capacity of 250000 timesteps
        # (250000 > [10 episodes] * [200 max timesteps per episode])
        memory=dict(type='latest', include_next_states=False, capacity=250000),
        discount=0.99,
        entropy_regularization=0.01,
        # MLP baseline
        baseline_mode='states',
        baseline=dict(type='mlp', sizes=[32, 32]),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(type='adam', learning_rate=1e-3),
            num_steps=5
        ),
        gae_lambda=0.97,
        likelihood_ratio_clipping=0.2,
        # PPO optimizer (learning rate was previously 1e-4)
        step_optimizer=dict(type='adam', learning_rate=3e-4),
        # PPO multi-step optimization: 25 updates, each calculated for 20% of the batch
        subsampling_fraction=0.2,
        optimization_steps=25
    )
def initialize(self, env, lstm=False):
    from gym import spaces
    from tensorforce.agents import PPOAgent

    if self.algorithm == "ppo":
        if isinstance(env.action_space, spaces.Tuple):
            # The action type must be the string 'int', not the built-in int
            actions = {
                str(num): {'type': 'int', 'num_actions': space.n}
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions = dict(type='int', num_actions=env.action_space.n)
        network = [
            dict(type='conv2d', size=10, window=1, activation='relu'),
            dict(type='conv2d', size=32, window=5, activation='relu'),
            dict(type='conv2d', size=16, window=3, activation='relu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='relu')
        ]
        if lstm:
            network.append(dict(type='internal_lstm', size=256))
        return PPOAgent(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=actions,
            network=network,
            batching_capacity=1000,
            step_optimizer=dict(type='adam', learning_rate=1e-4))
    return None
def initialize(self, env):
    from gym import spaces
    from tensorforce.agents import PPOAgent

    if self.algorithm == "ppo":
        if isinstance(env.action_space, spaces.Tuple):
            # The action type must be the string 'int', not the built-in int
            actions_spec = {
                str(num): {'type': 'int', 'num_actions': space.n}
                for num, space in enumerate(env.action_space.spaces)
            }
        else:
            actions_spec = dict(type='int', num_actions=env.action_space.n)
        return PPOAgent(
            states_spec=dict(type='float', shape=env.observation_space.shape),
            actions_spec=actions_spec,
            network_spec=[
                dict(type='dense', size=64),
                dict(type='dense', size=64)
            ],
            batch_size=128,
            step_optimizer=dict(
                type='adam',
                learning_rate=1e-4
            )
        )
    return None
def createPPO2Agent():
    # based on: https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
    agent = PPOAgent(
        states=env.states,
        actions=env.actions,
        network=[
            dict(type='dense', size=64),
            dict(type='dense', size=32)
        ],
        # Agent
        states_preprocessing=None,
        actions_exploration=None,
        reward_preprocessing=None,
        # MemoryModel
        update_mode=dict(
            unit='episodes',
            # 10 episodes per update
            batch_size=10,
            # Every 10 episodes
            frequency=10
        ),
        memory=dict(
            type='latest',
            include_next_states=False,
            capacity=5000
        ),
        # DistributionModel
        distributions=None,
        entropy_regularization=0.01,
        # PGModel
        baseline_mode='states',
        baseline=dict(
            type='mlp',
            sizes=[32, 32]
        ),
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(
                type='adam',
                learning_rate=1e-3
            ),
            num_steps=5
        ),
        gae_lambda=0.97,
        # PGLRModel
        likelihood_ratio_clipping=0.2,
        # PPOAgent
        step_optimizer=dict(
            type='adam',
            learning_rate=1e-3
        ),
        subsampling_fraction=0.2,
        optimization_steps=25,
        execution=dict(
            type='single',
            session_config=None,
            distributed_spec=None
        )
    )
    return agent
def __init__(self, observation_space, action_space, directory='./TensorforcePPOAgent/'):
    # Create a Proximal Policy Optimization agent.
    # This agent is restricted to a 0 or 1 activation. To enable continuous
    # activations, change the action type to "float" and delete "num_actions".
    self.agent = PPOAgent(
        states=dict(type='float', shape=observation_space.shape),
        actions=dict(type='int', shape=action_space.shape,
                     num_actions=2, min_value=0, max_value=1),
        # The network has two dense hidden layers with 256 nodes each
        network=[
            dict(type='dense', size=256),
            dict(type='dense', size=256),
        ],
        # Adam optimizer with a learning rate of 1e-4
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))
    self.directory = directory
def __init__(self, observation_space, action_space, directory='./TensorforcePPOAgent/'):
    # Create a Proximal Policy Optimization agent
    self.agent = PPOAgent(
        states=dict(type='float', shape=observation_space.shape),
        actions=dict(type='float', shape=action_space.shape, min_value=0, max_value=1),
        network=[
            dict(type='dense', size=256, activation='relu'),
            dict(type='dense', size=128, activation='relu'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-2))
    self.directory = directory
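# The two classes above store `self.directory` without using it; below is a
# plausible (hedged) checkpoint helper, reusing the save_model/restore_model
# calls that the ForwardActor snippet further down relies on.
import os

def checkpoint_agent(agent, directory, save=True):
    if save:
        if not os.path.isdir(directory):
            os.makedirs(directory)
        agent.save_model(directory=directory)
    elif os.path.isdir(directory):
        agent.restore_model(directory=directory)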
def main():
    env = gym.make('CartPole-v0')
    print(env.observation_space.shape)  # (4,)
    print(env.observation_space.high)   # [ 4.8000002e+00  3.4028235e+38  4.1887903e-01  3.4028235e+38]
    print(env.observation_space.low)    # [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
    print(env.action_space.n)           # 2

    agent = PPOAgent(
        states=dict(type='float', shape=env.observation_space.shape),
        network=[
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        actions=dict(type='int', num_actions=env.action_space.n),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    model_dir = 'models/cartpole'
    if os.path.exists(f'{model_dir}/checkpoint'):
        agent.restore_model(directory=model_dir)

    try:
        for ep in range(2000):
            observation = env.reset()
            done = False
            ep_reward = 0
            while not done:
                # env.render()
                states = observation / 4  # rough normalization of the observation
                action = agent.act(states=states)
                observation, reward, done, info = env.step(action)
                agent.observe(reward=reward, terminal=done)
                ep_reward += reward
                if done:
                    print(f'ep = {ep}, ep_reward = {ep_reward}')
    finally:
        # Always persist the model, even if training is interrupted
        agent.save_model(directory=f'{model_dir}/agent')
def __init__(self):
    actions = {}
    actions_exp = {}
    for i in range(12):
        actions[str(i)] = {'type': 'float'}  # 'num_actions': 10
        actions_exp[str(i)] = dict(type='ornstein_uhlenbeck', sigma=0.1, mu=0.0, theta=0.1)

    preprocessing_config = [{"type": "standardize"}]
    preprocessing_config = None  # preprocessing currently disabled

    customnet = dict(type=CustomNetwork)  # unused alternative network
    layerSize = 300
    network_spec = [
        dict(type='dense', size=100),
        dict(type='lstm', size=100)
    ]
    '''
    network_spec = [
        dict(type='dense', size=100),
        dict(type='internal_lstm', size=100)
    ]
    '''
    network_spec = [
        dict(type='dense', size=layerSize, activation='selu'),
        dict(type='dense', size=layerSize, activation='selu'),
        dict(type='dense', size=layerSize, activation='selu')
    ]

    self.agent = PPOAgent(
        states=dict(type='float', shape=(12 + 9,)),
        actions=actions,
        batching_capacity=1000,
        network=network_spec,
        states_preprocessing=preprocessing_config,
        actions_exploration=actions_exp,
        step_optimizer=dict(type='adam', learning_rate=1e-5),
    )
def __init__(self, apikey, agent_id, frames_per_state=1, host=None):
    # The PPO agent seems to learn that it needs to speed around the
    # environment to collect rewards
    self._agent = PPOAgent(
        states_spec=dict(type='float', shape=(frames_per_state * 25,)),
        actions_spec=dict(type='float', shape=(3,),
                          min_value=np.float32(-1.0), max_value=np.float32(1.0)),
        network_spec=[
            dict(type='dense', activation='relu', size=128),
            dict(type='dense', activation='relu', size=128),
        ],
        optimization_steps=5,
        # Model
        scope='ppo',
        discount=0.99,
        # DistributionModel
        distributions_spec=None,
        entropy_regularization=0.01,
        # PGModel
        baseline_mode=None,
        baseline=None,
        baseline_optimizer=None,
        gae_lambda=None,
        # PGLRModel
        likelihood_ratio_clipping=0.2,
        summary_spec=None,
        distributed_spec=None,
        batch_size=2048,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    self._logger = setup_custom_logger("Controller")
    self._frame_count_per_episode = 0
    self._total_frames = 1
    self._frames_per_state = frames_per_state
    self._client = AsyncClient(apikey, agent_id, self._train_state_callback, host)
    self._state_stack = StateStack(self._frames_per_state)
def createPPOAgent():
    agent = PPOAgent(
        states=env.states,
        actions=env.actions,
        network=[
            dict(type='dense', size=networkFirstLayer),
            # geometric average of first and last
            dict(type='dense', size=int((networkFirstLayer * networkLastLayer) ** 0.5)),
            dict(type='dense', size=networkLastLayer),
        ],
        step_optimizer=dict(type='adam', learning_rate=1e-2)
    )
    return agent
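# Hedged training sketch for createPPOAgent() above, driving the agent with
# Tensorforce's Runner the same way the test snippets below do; `env` is the
# module-level environment the original assumes to be defined elsewhere, and
# the episode counts are illustrative only.
from tensorforce.execution import Runner

def train_ppo(episodes=1000):
    runner = Runner(agent=createPPOAgent(), environment=env)
    runner.run(episodes=episodes, max_episode_timesteps=200)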
def test_example(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()
    passed = 0

    for _ in range(3):
        # Create an OpenAI-Gym environment
        environment = OpenAIGym('CartPole-v0')

        # Network specification for the model
        network_spec = [
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ]

        # Create the agent
        agent = PPOAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            batch_size=4000,
            step_optimizer=dict(type='adam', learning_rate=1e-2),
            optimization_steps=5,
            discount=0.99,
            normalize_rewards=False,
            entropy_regularization=0.01,
            likelihood_ratio_clipping=0.2)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Callback, called after each finished episode: stop once the mean
        # reward over the last 50 episodes indicates that learning took off
        def episode_finished(r):
            mean_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or mean_reward < 50.0

        # Start the runner
        runner.run(episodes=2000, max_episode_timesteps=200,
                   episode_finished=episode_finished)
        sys.stdout.write('episodes: {}\n'.format(runner.episode))
        sys.stdout.flush()

        # Test passed if the episode_finished handle evaluated to False
        if runner.episode < 2000:
            passed += 1

    sys.stdout.write('==> passed: {}\n'.format(passed))
    sys.stdout.flush()
    self.assertTrue(passed >= 2)
def __init__(self, name, light, price, quantity, avg_cost_estimate):
    # initialize product
    self.name = name

    # initialize state
    self.light = light
    self.quantity = quantity
    self.avg_cost_estimate = avg_cost_estimate  # approximate cost of each item sold
    self.price = price                          # the price currently being set
    self.history_log = []                       # history of the product over time

    # initialize agent
    self.agent = PPOAgent(
        states=dict(type='float', shape=(4,)),  # shape must be a tuple, not (4)
        actions=dict(type='int', num_actions=len(PRICE_CHANGES)),
        network=[dict(type='dense', size=4),
                 dict(type='dense', size=4)],
        step_optimizer=dict(type='adam', learning_rate=0.01))
    self.agent.initialize_model()
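# Hedged sketch of one pricing decision with the agent above; PRICE_CHANGES is
# the module-level table of price deltas the original indexes into but does
# not show here, and `reprice` itself is an illustrative helper, not part of
# the original class.
def reprice(product, state, reward, done=False):
    action = product.agent.act(states=state)  # action is an index into PRICE_CHANGES
    product.price += PRICE_CHANGES[action]    # apply the chosen price delta
    product.agent.observe(reward=reward, terminal=done)
    return product.price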
def getAgent(shapeIn, shapeOut):
    config = Configuration(
        batch_size=1,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    # Create a Proximal Policy Optimization agent
    agent = PPOAgent(
        dict(type='float', shape=shapeIn[0]),
        dict(type='float', shape=shapeOut[0]),
        [
            dict(type='dense', size=64),
        ],
        config)
    return agent
def create_agent(environment, network_spec):
    return PPOAgent(
        update_mode=dict(unit='episodes', batch_size=4, frequency=4),
        memory=dict(type='latest', include_next_states=False, capacity=100),
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        subsampling_fraction=0.3,
        optimization_steps=20,
        states=environment.states,
        actions=environment.actions,
        network=network_spec)
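# Hedged example call for create_agent() above: a two-layer dense spec in the
# same style as the other snippets; `environment` is assumed to expose the
# Tensorforce-style .states/.actions properties the function reads.
agent = create_agent(
    environment,
    network_spec=[
        dict(type='dense', size=64),
        dict(type='dense', size=64),
    ],
)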
class TensorforceAgent:

    def __init__(self, actions):
        preprocessing_config = [{"type": "grayscale"}]
        exploration_config = dict(
            type="epsilon_anneal",
            initial_epsilon=0.25,
            final_epsilon=0.01,
            timesteps=1000000
        )
        network_spec = [
            dict(type='conv2d', size=16, window=8, stride=4, activation='lrelu'),
            dict(type='conv2d', size=32, window=4, stride=2, activation='lrelu'),
            dict(type='flatten'),
            dict(type='dense', size=256, activation='lrelu')
        ]
        self.network_path = "network/"
        self.agent = PPOAgent(
            actions=dict(type='int', num_actions=len(actions)),
            states=dict(type='float', shape=(35, 150, 3)),
            network=network_spec,
            actions_exploration=exploration_config,
            states_preprocessing=preprocessing_config
        )

    def act(self, obs):
        # Cut out only the part of the frame that is needed
        partly = np.delete(obs, np.s_[96:], 0)
        partly = np.delete(partly, np.s_[0:26], 0)
        partly = np.delete(partly, np.s_[35:45], 0)
        partly = np.delete(partly, np.s_[38:53], 0)
        partly = np.delete(partly, np.s_[31:35], 0)
        partly = np.delete(partly, np.s_[10:16], 0)
        frame = np.delete(partly, np.s_[150:], 1)
        # scipy.misc.imsave('outfile.jpg', frame)
        return self.agent.act(frame)

    def load(self):
        import os
        if os.path.isdir(self.network_path):
            try:
                self.agent.restore_model(self.network_path)
            except Exception:
                print("Failed to load model")

    def observe(self, terminal=False, reward=0):
        return self.agent.observe(terminal, reward)

    def save_model(self):
        import os
        if not os.path.isdir(self.network_path):
            os.makedirs(self.network_path)
        self.agent.save_model(self.network_path)
def initialize(self, env):
    if self.algorithm == "ppo":
        return PPOAgent(
            states_spec=dict(type='float', shape=env.observation_space.shape),
            actions_spec=dict(type='int', num_actions=env.action_space.n),
            network_spec=[
                dict(type='dense', size=64),
                dict(type='dense', size=64)
            ],
            batch_size=128,
            step_optimizer=dict(type='adam', learning_rate=1e-4))
    return None
def test_example(self):
    passed = 0

    for _ in range(3):
        # Create an OpenAI-Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=256,
            memory=dict(
                type='prioritized_replay',
            ),
            update_frequency=256,
            first_update=512,
            learning_rate=0.0001,
            optimizer_batch_size=64,
            normalize_rewards=False,
            gae_rewards=False,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=1,
                update_batch_size=64,
                learning_rate=0.001
            ),
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        # Stop once the mean reward over the last 50 episodes indicates that
        # learning took off
        def episode_finished(r):
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200,
                   episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def __init__(self, state_size, env=None, is_eval=False):
    self.state_size = state_size
    self.action_size = 3
    self._memory_size = 1000
    self._memory = deque(maxlen=1000)
    self.inventory = pd.DataFrame(columns=['Price', 'POS', 'Order'])
    self.is_eval = is_eval
    self.learning_rate = env.hyperparameters['learning_rate']
    self.gamma = env.hyperparameters['gamma']
    self.env = env

    PPOAgent.__init__(
        self,
        states=dict(type='float', shape=self.state_size.shape),
        actions=dict(type='int', num_actions=self.action_size),
        network=env.get_network(),
        discount=self.gamma,
        batching_capacity=env.batch_size * 100,
        actions_exploration=env.exploration)
        # step_optimizer=self.get_optimizer(),
        # actions_exploration=self.explo)
        # update_mode=self._update_mode,
        # batching_capacity=self._memory_size)

    self._load_model()
def test_readme(self):
    environment = UnittestEnvironment(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5)
    )

    def get_current_state():
        return environment.reset()

    def execute_decision(x):
        return environment.execute(actions=x)[2]

    # Instantiate a Tensorforce agent
    agent = PPOAgent(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5),
        memory=10000,
        network='auto',
        update_mode=dict(unit='episodes', batch_size=10),
        step_optimizer=dict(type='adam', learning_rate=1e-4)
    )

    # Initialize the agent
    agent.initialize()

    # Retrieve the latest (observable) environment state
    state = get_current_state()  # (float array of shape [10])

    # Query the agent for its action decision
    action = agent.act(states=state)  # (scalar between 0 and 4)

    # Execute the decision and retrieve the current performance score
    reward = execute_decision(action)  # (any scalar float)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(reward=reward, terminal=False)

    agent.close()
    environment.close()
    self.assertTrue(expr=True)
class ForwardActor:

    def __init__(self):
        actions = {}
        for i in range(12):
            actions[str(i)] = {'type': 'float'}  # 'num_actions': 10

        network_spec = [
            dict(type='dense', size=100, activation='relu'),
            dict(type='dense', size=100, activation='relu')
        ]
        self.agent = PPOAgent(
            states=dict(type='float', shape=(12,)),
            actions=actions,
            batching_capacity=2000,
            network=network_spec,
            step_optimizer=dict(type='adam', learning_rate=1e-4),
        )

    def act(self, state):
        jp = np.expand_dims(np.nan_to_num(np.array(state["JointPosition"])), axis=0)
        jv = np.expand_dims(np.array(state["JointVelocity"]), axis=0)
        # actiondict = self.agent.act(np.concatenate([jp, jv], axis=1))
        actiondict = self.agent.act(jp)
        action = np.zeros(12)
        for i in range(12):
            action[i] = actiondict[str(i)][0]
        action = np.nan_to_num(action)
        # print(action)
        return np.clip(action, -1.0, 1.0)

    def observe(self, reward, terminal):
        self.agent.observe(reward=reward, terminal=terminal)

    def save(self, directory):
        self.agent.save_model(directory=directory)

    def restore(self, directory):
        self.agent.restore_model(directory=directory)
def main():
    # Create an OpenAI-Gym environment
    environment = OpenAIGym('CartPole-v1')

    # Create the agent
    agent = PPOAgent(
        states=environment.states(),
        actions=environment.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33,
        optimization_steps=10,
        # MLP baseline
        baseline_mode='states',
        baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step',
            optimizer=dict(type='adam', learning_rate=1e-4),
            num_steps=5
        ),
        # Other parameters
        discount=0.99,
        entropy_regularization=1e-2,
        gae_lambda=None,
        likelihood_ratio_clipping=0.2)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=1000, max_episode_timesteps=200)
    runner.close()
def test_example(self):
    passed = 0

    for _ in range(3):
        # Create an OpenAI-Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=4096,
            gae_lambda=0.97,
            learning_rate=0.001,
            entropy_penalty=0.01,
            epochs=5,
            optimizer_batch_size=512,
            loss_clipping=0.2,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])))
        runner = Runner(agent=agent, environment=env)

        # Stop once the mean reward over the last 50 episodes indicates that
        # learning took off
        def episode_finished(r):
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200,
                   episode_finished=episode_finished)
        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)