def get_cartpole_agent(agent_name, cartpole_device):
    """
    Build a new agent for the specified cartpole device.

    It would probably make more sense to pass agent_parameters as a
    parameter to this function.

    Parameters
    ----------
    agent_name: str
        an identifier this function recognizes: "a2c" or "ppo"
    cartpole_device:
        the device whose cartpole_env attribute supplies the training environment

    Returns
    -------
    a tensorforce Agent and the dict of parameters used to create it
    """
    if agent_name == "a2c":
        agent_parameters = dict(
            agent=agent_name,
            batch_size=11,
            variable_noise=0.1,
            l2_regularization=0.05,  # does this help with catastrophic forgetting?
            horizon=10,  # 10 is good, 1 is bad, 5 is bad, 20 is ok, 15 is bad
            summarizer=dict(
                directory="data/summaries/" + agent_name,
                # list of labels, or 'all'
                labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
                frequency=10,  # store values every 10 timesteps
            ),
        )
        agent = Agent.create(
            # agent="a2c",
            environment=cartpole_device.cartpole_env,
            # the cartpole environment will supply argument max_episode_timesteps
            # max_episode_timesteps=max_turns,
            **agent_parameters,
        )
    elif agent_name == "ppo":
        agent_parameters = dict(
            batch_size=10,
            variable_noise=0.1,
        )
        agent = Agent.create(
            agent="ppo",
            environment=cartpole_device.cartpole_env,
            **agent_parameters,
        )
    else:
        raise ValueError(f"agent_name '{agent_name}' is not recognized")

    return agent, agent_parameters
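# A minimal usage sketch for get_cartpole_agent; `CartpoleDevice` and the way it
# exposes a `cartpole_env` attribute are assumptions here, since the device class
# is not part of this snippet.
def demo_get_cartpole_agent():
    cartpole_device = CartpoleDevice()  # hypothetical device wrapper
    agent, agent_parameters = get_cartpole_agent(
        agent_name="ppo", cartpole_device=cartpole_device
    )
    print(agent_parameters)
    return agent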
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_01_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
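# A minimal sketch, assuming tensorforce 0.5-style interfaces, of the custom
# Environment contract that a class like BadSeeds03 implements: states(),
# actions(), reset(), and execute() returning (states, terminal, reward).
# The shapes, reward, and constructor arguments below are illustrative
# assumptions, not the real BadSeeds03 definition.
import numpy as np

from tensorforce.environments import Environment


class SketchSeedEnvironment(Environment):

    def __init__(self, seed_count=10, bad_seed_count=3, max_episode_length=100):
        super().__init__()
        self.seed_count = seed_count
        self.bad_seed_count = bad_seed_count
        self.max_episode_length = max_episode_length
        self.turn = 0

    def states(self):
        # one float observation per seed
        return dict(type="float", shape=(self.seed_count,))

    def actions(self):
        # choose one of the seeds to measure next
        return dict(type="int", num_values=self.seed_count)

    def reset(self):
        self.turn = 0
        return np.zeros(self.seed_count, dtype=np.float32)

    def execute(self, actions):
        self.turn += 1
        next_state = np.random.normal(size=self.seed_count).astype(np.float32)
        terminal = self.turn >= self.max_episode_length
        reward = 0.0  # placeholder reward; the real environment scores measurements
        return next_state, terminal, reward

# e.g. Environment.create(environment=SketchSeedEnvironment, seed_count=10, bad_seed_count=3)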
def agent(self, agent: any):
    self._agent = Agent.create(agent=agent, environment=self._tensorforce_environment)
    self._runner = Runner(
        agent=self._agent,
        environment=self._tensorforce_environment,
        save_best_agent=self._save_best_agent,
    )
def __init__(self, in_dim, n_action, rl, train):
    super().__init__()
    self.make_in_port('observation', in_dim)
    self.make_in_port('reward', 1)
    self.make_in_port('done', 1)
    self.make_out_port('action', 1)
    self.make_in_port('token_in', 1)
    self.make_out_port('token_out', 1)
    self.n_action = n_action  # number of action choices
    self.results['action'] = np.array([np.random.randint(n_action)])
    self.model = None
    self.env_type = "MotorEnv"
    self.token = 0
    self.prev_actions = 0
    self.init = True
    self.in_dim = in_dim
    self.rl = rl
    if rl:
        self.env = Environment.create(
            environment=MotorComponent.MotorEnv,
            max_episode_timesteps=train["episode_count"] * train["max_steps"],
            n_action=n_action,
            obs_dim=in_dim,
            parent=self,
        )
        self.env.reset()
        self.agent = Agent.create(agent=train['rl_agent'], environment=self.env)
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, save_best_agent: bool = False, **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        save_best_agent (optional): The runner will automatically save the best agent.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)

    self._environment = Environment.create(
        environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps
    )

    self._agent = Agent.create(agent=agent_spec, environment=self._environment)
    self._runner = Runner(
        agent=self._agent, environment=self._environment, save_best_agent=save_best_agent
    )
def agent(self, agent_spec: any):
    self._agent = Agent.create(agent=agent_spec, environment=self._environment)
    self._runner = Runner(
        agent=self._agent, environment=self._environment, save_best_agent=self._save_best_agent
    )
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,  # changed from 20 to 100 for agent_03
        exploration=0.05,  # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600,  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
def train_implementation(self, train_context: easyagents.core.StepsTrainContext):
    """Tensorforce Dqn Implementation of the train loop.

    The implementation follows
    https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
    """
    tc = train_context
    train_env = self._create_env()
    network = self._create_network_specification()

    agent_type = 'dqn'
    self.log_api(
        'Agent.create',
        f'(agent="{agent_type}", ' +
        f'network={network}, ' +
        f'memory={tc.max_steps_in_buffer}, ' +
        f'start_updating={tc.num_steps_buffer_preload}, ' +
        f'learning_rate={tc.learning_rate}, ' +
        f'batch_size={tc.num_steps_sampled_from_buffer}, ' +
        f'update_frequency={tc.num_steps_per_iteration}, ' +
        f'discount={tc.reward_discount_gamma})')
    self._agent = Agent.create(
        agent=agent_type,
        environment=train_env,
        network=network,
        memory=tc.max_steps_in_buffer,
        start_updating=tc.num_steps_buffer_preload,
        learning_rate=tc.learning_rate,
        batch_size=tc.num_steps_sampled_from_buffer,
        update_frequency=tc.num_steps_per_iteration,
        discount=tc.reward_discount_gamma,
    )
    self._train_with_runner(train_env, tc)
def set_up():
    tensorflow_settings()
    env = Environment.create(
        environment=CartSeed01, seed_count=10, bad_seed_count=3, max_count=20
    )
    agent = Agent.create(
        agent="a2c",
        batch_size=10000,
        horizon=50,
        discount=0.97,
        l2_regularization=0.1,
        variable_noise=0.5,
        environment=env,
        summarizer=dict(
            directory="training_data/a2c_cartseed/summaries",
            labels="all",
            frequency=10,
        ),
        # saver=dict(
        #     directory='saved_models/agent_04_env_04_1000/checkpoints',
        #     frequency=600  # save checkpoint every 600 seconds (10 minutes)
        # ),
    )
    return env, agent
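# A minimal sketch of driving the (env, agent) pair returned by set_up() with
# tensorforce's Runner, mirroring the pattern used elsewhere in this file; the
# episode count is an arbitrary choice, not taken from the original code.
from tensorforce.execution import Runner


def train(num_episodes=10000):
    env, agent = set_up()
    runner = Runner(agent=agent, environment=env)
    runner.run(num_episodes=num_episodes)
    env.close()
    agent.close()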
def __init__(
    self, agent, environment, max_episode_timesteps=None, evaluation_environment=None,
    save_best_agent=None
):
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(
        environment=environment, max_episode_timesteps=max_episode_timesteps
    )

    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
        self.evaluation_environment = Environment.create(
            environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
        )
        assert self.evaluation_environment.states() == self.environment.states()
        assert self.evaluation_environment.actions() == self.environment.actions()

    self.is_agent_external = isinstance(agent, Agent)
    self.agent = Agent.create(agent=agent, environment=self.environment)

    self.save_best_agent = save_best_agent

    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=500,
    )

    agent = Agent.create(
        agent="dqn",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='tanh'),
            dict(type='dense', size=32, activation='tanh'),
        ],
        environment=bad_seeds_environment,
        batch_size=256,
        memory=int(10**7),
        exploration=0.15,
        summarizer=dict(
            directory="training_data/agent_02_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )
    return bad_seeds_environment, agent
def prepare(
    self, environment=None, timestep_range=None, states=None, actions=None,
    exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False,
    exclude_bounded_action=False, require_observe=False, require_all=False, **agent
):
    """
    Generic unit-test preparation.
    """
    Layer.layers = None

    if environment is None:
        if states is None:
            states = deepcopy(self.__class__.states)

        if actions is None:
            actions = deepcopy(self.__class__.actions)
            if exclude_bool_action or self.__class__.exclude_bool_action:
                actions.pop('bool_action')
            if exclude_int_action or self.__class__.exclude_int_action:
                actions.pop('int_action')
            if exclude_float_action or self.__class__.exclude_float_action:
                actions.pop('float_action')
            if exclude_bounded_action or self.__class__.exclude_bounded_action:
                actions.pop('bounded_action')

        if timestep_range is None:
            timestep_range = self.__class__.timestep_range

        environment = UnittestEnvironment(
            states=states, actions=actions, timestep_range=timestep_range
        )

    elif timestep_range is not None:
        raise TensorforceError.unexpected()

    environment = Environment.create(environment=environment)

    for key, value in self.__class__.agent.items():
        if key not in agent:
            agent[key] = value

    if self.__class__.require_all or require_all:
        config = None
    elif self.__class__.require_observe or require_observe:
        config = dict(api_functions=['reset', 'act', 'observe'])
    else:
        config = dict(api_functions=['reset', 'act'])

    agent = Agent.create(agent=agent, environment=environment, config=config)

    return agent, environment
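# A minimal sketch of how prepare() might be called from a test method,
# assuming the surrounding unittest-base conventions; the act/observe loop is
# illustrative rather than taken from the original test suite.
def test_act_observe_sketch(self):
    agent, environment = self.prepare(require_observe=True)

    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    agent.close()
    environment.close()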
def train_implementation(self, train_context: easyagents.core.PpoTrainContext):
    """Tensorforce Ppo Implementation of the train loop.

    The implementation follows
    https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
    """
    tc = train_context
    train_env = self._create_env()
    network = self._create_network_specification()

    self.log_api(
        'Agent.create',
        f'(agent="ppo", environment=..., ' +
        f'network={network}, ' +
        f'learning_rate={tc.learning_rate}, ' +
        f'batch_size={tc.num_episodes_per_iteration}, ' +
        f'optimization_steps={tc.num_epochs_per_iteration}, ' +
        f'discount={tc.reward_discount_gamma})')
    self._agent = Agent.create(
        agent='ppo',
        environment=train_env,
        network=network,
        learning_rate=tc.learning_rate,
        batch_size=tc.num_episodes_per_iteration,
        optimization_steps=tc.num_epochs_per_iteration,
        discount=tc.reward_discount_gamma,
    )
    self._train_with_runner(train_env, tc)
def __init__(self, agent, environment, evaluation_environment=None, save_best_agent=False):
    # save_best overwrites saver...
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(environment=environment)

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(environment=evaluation_environment)

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)
    kwargs = dict()
    if self.save_best_agent is True:
        # Disable periodic saving
        assert not self.is_agent_external
        kwargs = dict(saver=dict(seconds=None, steps=None))
    self.agent = Agent.create(agent=agent, environment=self.environment, **kwargs)
    # self.global_episodes = self.agent.episodes
    # self.global_timesteps = self.agent.timesteps
    # self.global_updates = self.agent.updates

    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeedsSkinny,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="a2c",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu'),
        ],
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        # exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/a2c_dense_skinny/summaries",
            # list of labels, or 'all'
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )
    return bad_seeds_environment, agent
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        save_best_agent (optional): The runner will automatically save the best agent.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)
    self._save_best_agent = kwargs.get('save_best_agent', False)

    self._environment = Environment.create(
        environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps
    )

    self._agent = Agent.create(
        agent=agent_spec,
        environment=self._environment,
        summarizer=dict(
            directory='data/summaries',
            labels=['graph', 'losses', 'rewards'],  # list of labels, or 'all'
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
    )

    self._runner = Runner(
        agent=self._agent, environment=self._environment, save_best_agent=self._save_best_agent
    )
def base_test(env):
    batch_size = 24
    agent = Agent.create(
        agent='ppo',
        environment=env[0],
        batch_size=batch_size,
        learning_rate=1e-3,
        network=actor_network,
        discount=1.0,
        entropy_regularization=None,
        critic_network=critic_network,
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        max_episode_timesteps=n_step,
        parallel_interactions=n_env,
        # saver=dict(directory=os.path.join(os.getcwd(), 'saver_data'), frequency=30)
    )
    agent.initialize()

    # Initialize the runner
    runner = ParallelRunner(agent=agent, environments=env)

    # Start the runner
    runner.run(num_episodes=48)
    runner.close()
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a DQN agent
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        # memory=100,
        #
        # Optimization
        #
        batch_size=10,
        update_frequency=2,
        learning_rate=1e-3,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
        recorder=None,
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=10000)
    runner.close()
def runEnv():
    environment = Environment.create(
        environment=CustomEnvironment, max_episode_timesteps=500
    )
    agent = Agent.create(
        agent='a2c', environment=environment, batch_size=10, learning_rate=1e-3
    )

    # Train for 2000 episodes
    for _ in range(2000):
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 1000 episodes
    sum_rewards = 0.0
    for _ in range(1000):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
def run_no_runner(environment, nplayers):
    # with open("rl-regenwormen/agent.json", 'r') as fp:
    #     agent = json.load(fp=fp)
    agents = [
        Agent.create(
            agent='ppo',
            batch_size=100,
            learning_rate=1e-3,
            exploration=0.2,
            environment=environment,
            summarizer=dict(directory='summaries', summaries='all'),
        )
        for i in range(nplayers)
    ]

    print("starting training...")
    i = 10000000
    bar = Bar('Training', max=i)
    rewards = {i: 0 for i in range(nplayers)}
    rewards_total = {i: [] for i in range(nplayers)}

    for episode in range(30000):
        for agent in agents:
            agent.reset()
        states = environment.reset()
        terminal = False
        while not terminal:
            try:
                agent = agents[environment.current_player]
                current_player = environment.current_player
                actions = agent.act(states=states)
                # print(actions)
                states, terminal, reward = environment.execute(actions=actions)
                rewards[environment.current_player] += reward
                rewards_total[environment.current_player] += [reward]
                rewards_total[environment.current_player] = rewards_total[
                    environment.current_player][-300:]
                end_of_roll = environment.current_player != current_player
                agent.observe(terminal=end_of_roll, reward=reward)
                if terminal:
                    for agent2 in agents:
                        if agent2 != agent:
                            actions = agent2.act(states=states)
                            states, terminal, reward = environment.execute(actions=actions)
                            agent2.observe(terminal=True, reward=reward)
            except:
                print(f"ENV {environment.state}")
                print(f"ACT {actions}")
                print(states)
                raise

        names = ["lola", "henry de muis", "pykel", "flo"]
        print({
            names[k]: (int(v * 100) / 100, int(np.mean(rewards_total[k]) * 100) / 100)
            for k, v in rewards.items()
        })
        rewards = {i: 0 for i in range(nplayers)}
        bar.next()

    bar.finish()
def test_quickstart(self):
    self.start_tests(name='quickstart')

    # ====================

    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo', environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=50, use_tqdm=False)
    runner.close()

    # ====================

    self.finished_test()
def create_deepcrawl_agent(net, baseline, states, actions, args,
                           size_agent=14, size_target=4, num_action=8, num_embs=128):
    agent = Agent.create(
        # Agent type
        agent='ppo',
        # Inputs structure
        states=states,
        # Actions structure
        actions=actions,
        network=net,
        # MemoryModel
        # 10 episodes per update
        batch_size=int(args.update_number),
        # Every 10 episodes
        update_frequency=int(args.update_number),
        max_episode_timesteps=int(args.num_timesteps),
        # DistributionModel
        discount=0.9,
        entropy_regularization=0.00,
        likelihood_ratio_clipping=0.2,
        critic_network=baseline,
        critic_optimizer=dict(
            type='multi_step',
            optimizer=dict(
                type='subsampling_step',
                fraction=0.33,
                optimizer=dict(type='adam', learning_rate=5e-4),
            ),
            num_steps=10,
        ),
        # PPOAgent
        learning_rate=5e-5,
        subsampling_fraction=0.33,
        optimization_steps=20,
        execution=None,
        # TensorFlow etc
        name='agent',
        device=None,
        parallel_interactions=1,
        seed=None,
        saver=None,
        summarizer=None,
        recorder=None,
    )
    return agent
def __init__(self, agent, environments, evaluation_environment=None, save_best_agent=False):
    # save_best overwrites saver...
    if not util.is_iterable(x=environments):
        raise TensorforceError.type(
            name='parallel-runner', argument='environments', value=environments
        )
    elif len(environments) == 0:
        raise TensorforceError.value(
            name='parallel-runner', argument='environments', value=environments
        )

    self.is_environment_external = tuple(
        isinstance(environment, Environment) for environment in environments
    )
    self.environments = tuple(
        Environment.create(environment=environment) for environment in environments
    )

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(environment=evaluation_environment)

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)
    kwargs = dict(parallel_interactions=len(environments))
    # warning: save_best_agent
    if not self.is_agent_external and self.save_best_agent:
        # Disable periodic saving, but keep parallel_interactions
        kwargs['saver'] = dict(seconds=None, steps=None)
    self.agent = Agent.create(agent=agent, environment=self.environments[0], **kwargs)
    if not self.agent.model.is_initialized:
        self.agent.initialize()
    # self.global_episodes = self.agent.episodes
    # self.global_timesteps = self.agent.timesteps
    # self.global_updates = self.agent.updates

    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
    self.evaluation_rewards = list()
    self.evaluation_timesteps = list()
    self.evaluation_seconds = list()
    self.evaluation_agent_seconds = list()
def runEnv():
    environment = Environment.create(environment=CustomEnvironment, max_episode_timesteps=500)
    agent = Agent.create(
        agent='a2c',
        environment=environment,
        batch_size=10,
        learning_rate=1e-3,
        exploration=0.01,  # tried without this at first
        variable_noise=0.05,  # variable_noise=0.01 bad?
        l2_regularization=0.1,
        entropy_regularization=0.2,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    # Train for CustomEnvironment.trainingEps episodes
    for _ in range(CustomEnvironment.trainingEps):
        print("Episode: ", _)
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    # Evaluate for CustomEnvironment.testingEps episodes
    sum_rewards = 0.0
    for _ in range(CustomEnvironment.testingEps):
        print("Episode: ", _ + CustomEnvironment.trainingEps)
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != CustomEnvironment.trials:
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward
        print("bad seeds: ", CustomEnvironment.badseedsFinal)

    # print('Mean episode reward:', sum_rewards / 100)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')
    network = _create_network_specification((100, ))

    # Create a dueling DQN agent
    agent = Agent.create(agent='dueling_dqn', environment=environment, network=network)

    runner = Runner(agent=agent, environment=environment)
    runner.run(num_episodes=10000)
    runner.close()
def test_getting_started(self):
    from tensorforce.agents import Agent
    from tensorforce.environments import Environment

    # Setup environment
    # (Tensorforce or custom implementation, ideally using the Environment interface)
    environment = Environment.create(environment='test/data/environment.json')

    # Create and initialize agent
    agent = Agent.create(agent='test/data/agent.json', environment=environment)
    agent.initialize()

    # Reset agent and environment at the beginning of a new episode
    agent.reset()
    states = environment.reset()
    terminal = False

    # Agent-environment interaction training loop
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    # ====================

    # Agent-environment interaction evaluation loop
    while not terminal:
        actions = agent.act(states=states, evaluation=True)
        states, terminal, reward = environment.execute(actions=actions)

    # ====================

    # Close agent and environment
    agent.close()
    environment.close()

    # ====================

    from tensorforce.execution import Runner

    # Tensorforce runner utility
    runner = Runner(agent='test/data/agent.json', environment='test/data/environment.json')

    # Run training
    runner.run(num_episodes=50, use_tqdm=False)

    # Close runner
    runner.close()

    self.finished_test()
def main():
    bad_seeds_environment = Environment.create(
        environment=Bollux, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    # 20200820-223031
    # 20200820-233243
    # batch_size 1000 does not get smarter or dumber
    # batch_size 100 20200821-095410 gets dumber
    # try batch size 10000 !
    agent = Agent.create(
        agent="a2c",
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        # exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_04_bollux_1000000/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_04_bollux_1000000/checkpoints',
            frequency=6000,  # save checkpoint every 6000 seconds (100 minutes)
        ),
    )

    # this is the batch_size = 10000 version
    # I hope it is the last env 04
    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=1000000)
    # for i in range(100):
    #     print("running 10000 episodes")
    #     runner.run(num_episodes=10000)
    #     print("saving the agent")
    #     directory = Path(f"saved_models/agent_04_env_04_1000000/10000_{i}/checkpoints")
    #     if directory.exists():
    #         directory.rmdir()
    #     directory.mkdir(parents=True, exist_ok=True)
    #     agent.save(directory=str(directory), format="numpy")

    bad_seeds_environment.close()
    agent.close()
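# A minimal sketch of restoring the agent from the checkpoints written by the
# saver above, assuming tensorforce's Agent.load; the directory must already
# contain at least one checkpoint for this to succeed.
from tensorforce.agents import Agent


def restore_agent(bad_seeds_environment):
    return Agent.load(
        directory='saved_models/agent_04_bollux_1000000/checkpoints',
        environment=bad_seeds_environment,
    )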
def test_readme(self):
    self.start_tests(name='readme')

    environment = UnittestEnvironment(
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', shape=(), num_values=5),
        timestep_range=(1, 5)
    )

    def get_current_state():
        return environment.reset()

    def execute_decision(x):
        return environment.execute(actions=x)[2]

    # ==========

    from tensorforce.agents import Agent

    # Instantiate a Tensorforce agent
    agent = Agent.create(
        agent='tensorforce',
        states=dict(type='float', shape=(10,)),
        actions=dict(type='int', num_values=5),
        max_episode_timesteps=100,
        memory=10000,
        update=dict(unit='timesteps', batch_size=64),
        optimizer=dict(type='adam', learning_rate=3e-4),
        policy=dict(network='auto'),
        objective='policy_gradient',
        reward_estimation=dict(horizon=20)
    )

    # Initialize the agent
    agent.initialize()

    # Retrieve the latest (observable) environment state
    state = get_current_state()  # (float array of shape [10])

    # Query the agent for its action decision
    action = agent.act(states=state)  # (scalar between 0 and 4)

    # Execute the decision and retrieve the current performance score
    reward = execute_decision(action)  # (any scalar float)

    # Pass feedback about performance (and termination) to the agent
    agent.observe(reward=reward, terminal=False)

    # ==========

    agent.close()
    environment.close()

    self.finished_test()
def main():
    # Create a Gym environment in FruitAPI and convert it to a Tensorforce environment
    fruit_env = GymEnvironment(env_name='CartPole-v1')
    environment = TensorForcePlugin.convert(fruit_env)

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo', environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None
    )

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=300)
    runner.close()