def get_cartpole_agent(agent_name, cartpole_device):
    """
    Build a new agent for the specified cartpole device.

    It would probably make more sense to pass agent_parameters as a
    parameter to this function.

    Parameters
    ----------
    agent_name: str
        an identifier this function recognizes: "a2c" or "ppo"
    cartpole_device:
        an object exposing the target environment as `cartpole_env`

    Returns
    -------
    a tensorforce Agent and its parameter dict
    """
    if agent_name == "a2c":
        agent_parameters = dict(
            agent=agent_name,
            batch_size=11,
            variable_noise=0.1,
            l2_regularization=0.05,  # does this help with catastrophic forgetting?
            horizon=10,  # 10 is good, 1 is bad, 5 is bad, 20 is ok, 15 is bad
            summarizer=dict(
                directory="data/summaries/" + agent_name,
                # list of labels, or 'all'
                labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
                frequency=10,  # store values every 10 timesteps
            ),
        )
        agent = Agent.create(
            # agent="a2c",
            environment=cartpole_device.cartpole_env,
            # the cartpole environment will supply argument max_episode_timesteps
            # max_episode_timesteps=max_turns,
            **agent_parameters,
        )
    elif agent_name == "ppo":
        agent_parameters = dict(
            batch_size=10,
            variable_noise=0.1,
        )
        agent = Agent.create(
            agent="ppo",
            environment=cartpole_device.cartpole_env,
            **agent_parameters,
        )
    else:
        raise ValueError(f"agent_name '{agent_name}' is not recognized")
    return agent, agent_parameters
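# A minimal usage sketch for get_cartpole_agent, assuming the standard
# Tensorforce Gym wrapper is installed. The SimpleNamespace stand-in for the
# device object is an illustrative assumption; any object exposing the target
# environment as `.cartpole_env` would work the same way.
from types import SimpleNamespace

from tensorforce.environments import Environment

cartpole_env = Environment.create(environment="gym", level="CartPole-v1")
device = SimpleNamespace(cartpole_env=cartpole_env)  # hypothetical device stand-in
agent, agent_parameters = get_cartpole_agent("ppo", device)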
def __init__(self, agent_spec, agent_additional_parameters, agents_count):
    self.agents = []
    first_agent = Agent.from_spec(spec=agent_spec, kwargs=agent_additional_parameters)
    self.agents.append(first_agent)
    self.model = first_agent.model
    self.stop = False
    for _ in range(agents_count - 1):
        agent = Agent.from_spec(spec=agent_spec, kwargs=agent_additional_parameters)
        # discard each copy's own model and share the first agent's model instead
        agent.model.close()
        agent.model = self.model
        self.agents.append(agent)
def __init__(self, in_dim, n_action, rl, train):
    super().__init__()
    self.make_in_port('observation', in_dim)
    self.make_in_port('reward', 1)
    self.make_in_port('done', 1)
    self.make_out_port('action', 1)
    self.make_in_port('token_in', 1)
    self.make_out_port('token_out', 1)
    self.n_action = n_action  # number of action choices
    self.results['action'] = np.array([np.random.randint(n_action)])
    self.model = None
    self.env_type = "MotorEnv"
    self.token = 0
    self.prev_actions = 0
    self.init = True
    self.in_dim = in_dim
    self.rl = rl
    if rl:
        self.env = Environment.create(
            environment=MotorComponent.MotorEnv,
            max_episode_timesteps=train["episode_count"] * train["max_steps"],
            n_action=n_action,
            obs_dim=in_dim,
            parent=self)
        self.env.reset()
        self.agent = Agent.create(agent=train['rl_agent'], environment=self.env)
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100)

    agent = Agent.create(
        agent="a2c",
        batch_size=100,  # this seems to help a2c
        horizon=20,  # does this help a2c?
        exploration=0.01,  # tried without this at first
        l2_regularization=0.1,
        entropy_regularization=0.2,
        variable_noise=0.05,
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_01_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=100000)
    agent.save(directory="saved_models")
def agent(self, agent: any):
    self._agent = Agent.create(agent=agent, environment=self._tensorforce_environment)
    self._runner = Runner(agent=self._agent,
                          environment=self._tensorforce_environment,
                          save_best_agent=self._save_best_agent)
def __init__(self, agent, environments):
    if not util.is_iterable(x=environments):
        raise TensorforceError.type(
            name='parallel-runner', argument='environments', value=environments)
    elif len(environments) == 0:
        raise TensorforceError.value(
            name='parallel-runner', argument='environments', value=environments)

    if not isinstance(agent, Agent):
        agent = Agent.from_spec(
            spec=agent, states=environments[0].states(),
            actions=environments[0].actions(), parallel_interactions=len(environments))
    if len(environments) > agent.parallel_interactions:
        raise TensorforceError(message="Too many environments.")

    self.agent = agent
    self.environments = tuple(environments)

    self.agent.initialize()
    self.global_episode = self.agent.episode
    self.global_timestep = self.agent.timestep
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_times = list()
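# Hedged usage sketch for the parallel runner above. The class name
# `ParallelRunner` is an assumption (only its __init__ is shown here), and the
# exact agent-spec format depends on the Tensorforce version in use. The
# sketch builds several copies of a Gym environment and passes a spec dict,
# which the __init__ turns into an Agent whose parallel_interactions matches
# the environment count.
from tensorforce.environments import Environment

environments = [
    Environment.create(environment='gym', level='CartPole-v1') for _ in range(4)
]
runner = ParallelRunner(agent=dict(agent='ppo', batch_size=10), environments=environments)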
def train_implementation(self, train_context: easyagents.core.PpoTrainContext):
    """Tensorforce Ppo implementation of the train loop.

    The implementation follows
    https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
    """
    tc = train_context
    train_env = self._create_env()
    network = self._create_network_specification()

    self.log_api('Agent.create',
                 f'(agent="ppo", environment=..., ' +
                 f'network={network}, ' +
                 f'learning_rate={tc.learning_rate}, ' +
                 f'batch_size={tc.num_episodes_per_iteration}, ' +
                 f'optimization_steps={tc.num_epochs_per_iteration}, ' +
                 f'discount={tc.reward_discount_gamma})')
    self._agent = Agent.create(
        agent='ppo',
        environment=train_env,
        network=network,
        learning_rate=tc.learning_rate,
        batch_size=tc.num_episodes_per_iteration,
        optimization_steps=tc.num_epochs_per_iteration,
        discount=tc.reward_discount_gamma,
    )
    self._train_with_runner(train_env, tc)
def __init__(self, agent, environment, evaluation_environment=None, save_best_agent=False):
    # save_best overwrites saver...
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(environment=environment)

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(
            environment=evaluation_environment)

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)
    kwargs = dict()
    if self.save_best_agent is True:
        # Disable periodic saving
        assert not self.is_agent_external
        kwargs = dict(saver=dict(seconds=None, steps=None))
    self.agent = Agent.create(agent=agent, environment=self.environment, **kwargs)

    # self.global_episodes = self.agent.episodes
    # self.global_timesteps = self.agent.timesteps
    # self.global_updates = self.agent.updates
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
def agent(self, agent_spec: any):
    self._agent = Agent.create(agent=agent_spec, environment=self._environment)
    self._runner = Runner(agent=self._agent,
                          environment=self._environment,
                          save_best_agent=self._save_best_agent)
def __init__(self, environment: TradingEnvironment, agent_spec: Dict = None,
             network_spec: Dict = None, **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A specification dictionary for the `Tensorforce` agent.
        network_spec: A specification dictionary for the `Tensorforce` agent's model network.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._environment = environment
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', None)

    if agent_spec and network_spec:
        self._agent_spec = agent_spec
        self._network_spec = network_spec
        self._agent = Agent.from_spec(spec=agent_spec,
                                      kwargs=dict(network=network_spec,
                                                  states=environment.states,
                                                  actions=environment.actions))
        self._runner = Runner(agent=self._agent, environment=environment)
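# Hedged construction sketch for the strategy above. The enclosing class name
# `TensorforceTradingStrategy` is assumed (only the __init__ is shown), the
# `environment` instance is assumed to be an already-constructed
# TradingEnvironment, and the spec dictionaries follow the older Tensorforce
# `from_spec` style; exact keys depend on the Tensorforce version, so treat
# them as illustrative placeholders rather than a verified configuration.
agent_spec = {
    "type": "ppo_agent",
    "learning_rate": 1e-3,
    "discount": 0.99,
}
network_spec = [
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
]
strategy = TensorforceTradingStrategy(environment=environment,
                                      agent_spec=agent_spec,
                                      network_spec=network_spec)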
def build(agent_spec, actor, env):
    agent = Agent.from_spec(spec=agent_spec,
                            kwargs=dict(states=env.states, actions=env.actions, network=actor))
    runner = Runner(agent=agent, environment=env, repeat_actions=1)
    return runner, agent
def prepare(self, environment=None, timestep_range=None, states=None, actions=None,
            exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False,
            exclude_bounded_action=False, require_observe=False, require_all=False, **agent):
    """
    Generic unit-test preparation.
    """
    Layer.layers = None

    if environment is None:
        if states is None:
            states = deepcopy(self.__class__.states)

        if actions is None:
            actions = deepcopy(self.__class__.actions)
            if exclude_bool_action or self.__class__.exclude_bool_action:
                actions.pop('bool_action')
            if exclude_int_action or self.__class__.exclude_int_action:
                actions.pop('int_action')
            if exclude_float_action or self.__class__.exclude_float_action:
                actions.pop('float_action')
            if exclude_bounded_action or self.__class__.exclude_bounded_action:
                actions.pop('bounded_action')

        if timestep_range is None:
            timestep_range = self.__class__.timestep_range

        environment = UnittestEnvironment(
            states=states, actions=actions, timestep_range=timestep_range)
    elif timestep_range is not None:
        raise TensorforceError.unexpected()

    environment = Environment.create(environment=environment)

    for key, value in self.__class__.agent.items():
        if key not in agent:
            agent[key] = value

    if self.__class__.require_all or require_all:
        config = None
    elif self.__class__.require_observe or require_observe:
        config = dict(api_functions=['reset', 'act', 'observe'])
    else:
        config = dict(api_functions=['reset', 'act'])

    agent = Agent.create(agent=agent, environment=environment, config=config)

    return agent, environment
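# Hedged usage sketch: inside a unit test derived from the base class above,
# prepare() is typically called to get a matched agent/environment pair and
# drive a short act/observe loop. The test name and surrounding class (with
# its states/actions/agent defaults) are assumptions of this sketch.
def test_act_observe(self):
    agent, environment = self.prepare(require_observe=True)
    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()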
def set_up():
    tensorflow_settings()
    env = Environment.create(
        environment=CartSeed01, seed_count=10, bad_seed_count=3, max_count=20)
    agent = Agent.create(
        agent="a2c",
        batch_size=10000,
        horizon=50,
        discount=0.97,
        l2_regularization=0.1,
        variable_noise=0.5,
        environment=env,
        summarizer=dict(
            directory="training_data/a2c_cartseed/summaries",
            labels="all",
            frequency=10,
        ),
        # saver=dict(
        #     directory='saved_models/agent_04_env_04_1000/checkpoints',
        #     frequency=600  # save checkpoint every 600 seconds (10 minutes)
        # ),
    )
    return env, agent
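# Hedged usage sketch: the (env, agent) pair returned by set_up() is normally
# consumed by a Runner, in the same way as the other training scripts in this
# collection. The episode count here is an arbitrary illustration.
env, agent = set_up()
runner = Runner(agent=agent, environment=env)
runner.run(num_episodes=1000)
agent.close()
env.close()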
def __init__(
    self, agent, environment, max_episode_timesteps=None, evaluation_environment=None,
    save_best_agent=None
):
    self.is_environment_external = isinstance(environment, Environment)
    self.environment = Environment.create(
        environment=environment, max_episode_timesteps=max_episode_timesteps
    )

    # set this unconditionally so the attribute exists even without an
    # evaluation environment
    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(
            environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
        )
        assert self.evaluation_environment.states() == self.environment.states()
        assert self.evaluation_environment.actions() == self.environment.actions()

    self.is_agent_external = isinstance(agent, Agent)
    self.agent = Agent.create(agent=agent, environment=self.environment)
    self.save_best_agent = save_best_agent

    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
def main():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )

    agent = Agent.create(
        agent="random",
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_random_env_02/summaries",
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    runner.run(num_episodes=10000)

    bad_seeds_environment.close()
    agent.close()
def base_test(env):
    batch_size = 24
    agent = Agent.create(
        agent='ppo',
        environment=env[0],
        batch_size=batch_size,
        learning_rate=1e-3,
        network=actor_network,
        discount=1.0,
        entropy_regularization=None,
        critic_network=critic_network,
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        max_episode_timesteps=n_step,
        parallel_interactions=n_env,
        # saver=dict(directory=os.path.join(os.getcwd(), 'saver_data'), frequency=30)
    )
    agent.initialize()

    # Initialize the runner
    runner = ParallelRunner(agent=agent, environments=env)

    # Start the runner
    runner.run(num_episodes=48)
    runner.close()
def runEnv():
    environment = Environment.create(
        environment=CustomEnvironment, max_episode_timesteps=500
    )
    agent = Agent.create(agent='a2c', environment=environment,
                         batch_size=10, learning_rate=1e-3)

    # Train for 2000 episodes
    for _ in range(2000):
        states = environment.reset()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions = agent.act(states=states)
            # print(actions)
            # print(states)
            # environment.execute returns (states, terminal, reward) in that order
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)

    # Evaluate for 1000 episodes
    sum_rewards = 0.0
    for _ in range(1000):
        states = environment.reset()
        internals = agent.initial_internals()
        terminal = False
        while CustomEnvironment.extraCounter != 100:
            actions, internals = agent.act(states=states, internals=internals, independent=True)
            states, terminal, reward = environment.execute(actions=actions)
            sum_rewards += reward

    # print('Mean episode reward:', sum_rewards / 1000)
    # print(CustomEnvironment.firstCount, ",", CustomEnvironment.secondCount, ",", CustomEnvironment.thirdCount)
    print(CustomEnvironment.sum)

    # Close agent and environment
    agent.close()
    environment.close()
def main():
    bad_seeds_environment = Environment.create(
        environment=BadSeeds03, seed_count=10, bad_seed_count=3, max_episode_length=100
    )

    agent = Agent.create(
        agent="a2c",
        batch_size=100,
        horizon=100,  # changed from 20 to 100 for agent_03
        exploration=0.05,  # changed from 0.01 to 0.05 for agent_03
        l2_regularization=0.2,  # changed from 0.1 to 0.2 for agent_03
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.1,  # changed from 0.05 to 0.1 for agent_03
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/agent_03_env_03/summaries",
            # list of labels, or 'all'
            labels=["graph", "entropy", "kl-divergence", "losses", "rewards"],
            frequency=100,  # store values every 100 timesteps
        ),
        saver=dict(
            directory='saved_models/agent_03_env_03/checkpoints',
            frequency=600  # save checkpoint every 600 seconds (10 minutes)
        ),
    )

    runner = Runner(agent=agent, environment=bad_seeds_environment)
    for _ in range(10):
        runner.run(num_episodes=10000)
        runner.run(num_episodes=1000, evaluation=True)

    bad_seeds_environment.close()
    agent.close()
def main():
    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a DQN agent
    agent = Agent.create(
        agent='dqn',
        environment=environment,
        # memory=100,
        # # Optimization
        # batch_size=10,
        update_frequency=2,
        learning_rate=1e-3,
        summarizer=dict(
            directory='data/summaries',
            # list of labels, or 'all'
            labels=['graph', 'entropy', 'kl-divergence', 'losses', 'rewards'],
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
        recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=10000)
    runner.close()
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeeds02,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=500,
    )
    agent = Agent.create(
        agent="dqn",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='tanh'),
            dict(type='dense', size=32, activation='tanh')
        ],
        environment=bad_seeds_environment,
        batch_size=256,
        memory=int(10**7),
        exploration=0.15,
        summarizer=dict(
            directory="training_data/agent_02_env_02/summaries",
            labels="all",
            frequency=100  # store values every 100 timesteps
        ))
    return bad_seeds_environment, agent
def __init__(self, env=None, device=None):
    self.env = env
    if self.env.saver.model_file_name == "":
        # derive a default file name from the model name and data directory
        self.env.saver.model_file_name = (
            self.env.model_name + "_" + self.env.dataDirectory.replace("/", ""))
    if not os.path.exists(self.env.saver.model_directory + "/model"):
        os.mkdir(self.env.saver.model_directory + "/model")
    self.env.saver.model_file_path = (
        self.env.saver.model_directory + "/model/" + self.env.saver.model_file_name)
    self.agent = Agents.from_spec(
        self.env.settings['agent'],
        kwargs=dict(
            states=self.env.states,
            actions=dict(type='int', num_actions=self.env.actions),
            network=self.env.settings['network'],
            device=device
        )
    )
    try:
        self.agent.restore_model(self.env.saver.model_directory + "/model")
    except Exception:
        # no saved model to restore; start from scratch
        pass
def train_implementation(self, train_context: easyagents.core.StepsTrainContext):
    """Tensorforce Dqn implementation of the train loop.

    The implementation follows
    https://github.com/tensorforce/tensorforce/blob/master/examples/quickstart.py
    """
    tc = train_context
    train_env = self._create_env()
    network = self._create_network_specification()
    agent_type = 'dqn'

    self.log_api(
        'Agent.create',
        f'(agent="{agent_type}", ' +
        f'network={network}, ' +
        f'memory={tc.max_steps_in_buffer}, ' +
        f'start_updating={tc.num_steps_buffer_preload}, ' +
        f'learning_rate={tc.learning_rate}, ' +
        f'batch_size={tc.num_steps_sampled_from_buffer}, ' +
        f'update_frequency={tc.num_steps_per_iteration}, ' +
        f'discount={tc.reward_discount_gamma})')
    self._agent = Agent.create(
        agent=agent_type,
        environment=train_env,
        network=network,
        memory=tc.max_steps_in_buffer,
        start_updating=tc.num_steps_buffer_preload,
        learning_rate=tc.learning_rate,
        batch_size=tc.num_steps_sampled_from_buffer,
        update_frequency=tc.num_steps_per_iteration,
        discount=tc.reward_discount_gamma,
    )
    self._train_with_runner(train_env, tc)
def __init__(self, environment: 'TradingEnvironment', agent_spec: any,
             save_best_agent: bool = False, **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        save_best_agent (optional): Whether the runner should automatically save the best agent.
        kwargs (optional): Optional keyword arguments to adjust the strategy.
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)

    self._environment = Environment.create(
        environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps)

    self._agent = Agent.create(agent=agent_spec, environment=self._environment)

    self._runner = Runner(agent=self._agent,
                          environment=self._environment,
                          save_best_agent=save_best_agent)
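# Hedged usage sketch for the strategy above. The enclosing class name
# `TensorforceTradingStrategy` is assumed (only the __init__ is shown), and
# the spec dict is illustrative: anything accepted by Agent.create(agent=...)
# works, and `environment` is whatever Environment.create(level=...) accepts
# here (a Gym-wrapped TradingEnvironment instance or registered id).
strategy = TensorforceTradingStrategy(environment=environment,
                                      agent_spec=dict(agent='ppo', batch_size=10),
                                      save_best_agent=True,
                                      max_episode_timesteps=500)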
def set_up():
    tensorflow_settings()
    bad_seeds_environment = Environment.create(
        environment=BadSeedsSkinny,
        seed_count=10,
        bad_seed_count=3,
        history_block=2,
        max_episode_timesteps=100,
    )
    agent = Agent.create(
        agent="a2c",
        network=[
            dict(type='flatten'),
            dict(type='dense', size=32, activation='relu'),
            dict(type='dense', size=32, activation='relu')
        ],
        batch_size=10000,  # changed for 04 but was this a mistake? no
        horizon=50,  # changed from 100 to 50 for agent_04
        discount=0.97,  # new for agent_04
        # exploration=0.05,  # turned off for agent_04 - turn on for 05?
        l2_regularization=0.1,
        # entropy_regularization=0.2,  # turned off for agent_03
        variable_noise=0.5,  # changed from 0.1 to 0.5 for agent_04
        environment=bad_seeds_environment,
        summarizer=dict(
            directory="training_data/a2c_dense_skinny/summaries",
            # list of labels, or 'all'
            labels="all",
            frequency=100,  # store values every 100 timesteps
        ),
    )
    return bad_seeds_environment, agent
def __init__(self, environment: 'TradingEnvironment', agent_spec: any, **kwargs):
    """
    Arguments:
        environment: A `TradingEnvironment` instance for the agent to trade within.
        agent_spec: A `Tensorforce` agent or agent specification.
        kwargs (optional): Optional keyword arguments to adjust the strategy,
            including `max_episode_timesteps` and `save_best_agent` (the runner
            will automatically save the best agent).
    """
    self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False)
    self._save_best_agent = kwargs.get('save_best_agent', False)

    self._environment = Environment.create(
        environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps)

    self._agent = Agent.create(
        agent=agent_spec,
        environment=self._environment,
        summarizer=dict(
            directory='data/summaries',
            labels=['graph', 'losses', 'rewards'],  # list of labels, or 'all'
            frequency=100  # store values every 100 timesteps
            # (infrequent update summaries every update; other configurations possible)
        ),
    )

    self._runner = Runner(agent=self._agent,
                          environment=self._environment,
                          save_best_agent=self._save_best_agent)
def restore_agent(self, path: str, model_path: str = None):
    """Deserialize the strategy's learning agent from a file.

    Arguments:
        path: The `str` path of the file the agent specification is stored in.
            The `.json` file extension will be automatically appended if not provided.
        model_path (optional): The `str` path of the file or directory the agent checkpoint
            is stored in. If not provided, the `model_path` will default to
            `{path_without_dot_json}/agent`.
    """
    path_with_ext = path if path.endswith('.json') else f'{path}.json'

    with open(path_with_ext) as json_file:
        spec = json.load(json_file)

    # json.load returns a plain dict, so the spec must be indexed,
    # not accessed via attributes
    self._agent_spec = spec['agent']
    self._network_spec = spec['network']

    self._agent = Agent.from_spec(spec=self._agent_spec,
                                  kwargs=dict(network=self._network_spec,
                                              states=self._environment.states,
                                              actions=self._environment.actions))

    path_without_ext = path_with_ext.replace('.json', '')
    model_path = model_path or f'{path_without_ext}/agent'

    self._agent.restore_model(file=model_path)

    self._runner = Runner(agent=self._agent, environment=self._environment)
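# Hedged sketch of the JSON layout restore_agent() above expects: a top-level
# object with "agent" and "network" keys, as implied by spec['agent'] and
# spec['network']. The concrete spec contents are illustrative placeholders,
# not a verified configuration.
#
# {
#     "agent": {"type": "ppo_agent", "discount": 0.99},
#     "network": [
#         {"type": "dense", "size": 64},
#         {"type": "dense", "size": 64}
#     ]
# }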
def test_quickstart(self):
    self.start_tests(name='quickstart')

    # ====================

    # Create an OpenAI-Gym environment
    environment = Environment.create(environment='gym', level='CartPole-v1')

    # Create a PPO agent
    agent = Agent.create(
        agent='ppo', environment=environment,
        # Automatically configured network
        network='auto',
        # Optimization
        batch_size=10, update_frequency=2, learning_rate=1e-3, subsampling_fraction=0.2,
        optimization_steps=5,
        # Reward estimation
        likelihood_ratio_clipping=0.2, discount=0.99, estimate_terminal=False,
        # Critic
        critic_network='auto',
        critic_optimizer=dict(optimizer='adam', multi_step=10, learning_rate=1e-3),
        # Preprocessing
        preprocessing=None,
        # Exploration
        exploration=0.0, variable_noise=0.0,
        # Regularization
        l2_regularization=0.0, entropy_regularization=0.0,
        # TensorFlow etc
        name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
        saver=None, summarizer=None, recorder=None)

    # Initialize the runner
    runner = Runner(agent=agent, environment=environment)

    # Start the runner
    runner.run(num_episodes=50, use_tqdm=False)
    runner.close()

    # ====================

    self.finished_test()
def run_no_runner(environment, nplayers):
    # with open("rl-regenwormen/agent.json", 'r') as fp:
    #     agent = json.load(fp=fp)
    agents = [
        Agent.create(agent='ppo',
                     batch_size=100,
                     learning_rate=1e-3,
                     exploration=0.2,
                     environment=environment,
                     summarizer=dict(directory='summaries', summaries='all'))
        for i in range(nplayers)
    ]

    print("starting training...")
    num_episodes = 30000
    bar = Bar('Training', max=num_episodes)
    rewards = {i: 0 for i in range(nplayers)}
    rewards_total = {i: [] for i in range(nplayers)}
    for episode in range(num_episodes):
        for agent in agents:
            agent.reset()
        states = environment.reset()
        terminal = False
        while not terminal:
            try:
                agent = agents[environment.current_player]
                current_player = environment.current_player
                actions = agent.act(states=states)
                # print(actions)
                states, terminal, reward = environment.execute(actions=actions)
                rewards[environment.current_player] += reward
                rewards_total[environment.current_player] += [reward]
                # keep only the 300 most recent rewards per player
                rewards_total[environment.current_player] = rewards_total[
                    environment.current_player][-300:]
                end_of_roll = environment.current_player != current_player
                agent.observe(terminal=end_of_roll, reward=reward)
                if terminal:
                    # give every other agent a final observation for this episode
                    for agent2 in agents:
                        if agent2 != agent:
                            actions = agent2.act(states=states)
                            states, terminal, reward = environment.execute(actions=actions)
                            agent2.observe(terminal=True, reward=reward)
            except:
                print(f"ENV {environment.state}")
                print(f"ACT {actions}")
                print(states)
                raise

        names = ["lola", "henry de muis", "pykel", "flo"]
        print({
            names[k]: (int(v * 100) / 100, int(np.mean(rewards_total[k]) * 100) / 100)
            for k, v in rewards.items()
        })
        rewards = {i: 0 for i in range(nplayers)}
        bar.next()
    bar.finish()
def __init__(self, agent_type, network, action_type, preprocessor_type, reward, tag):
    rf = reward_functions.__dict__[reward]
    super().__init__("-".join([agent_type, network, action_type, reward, tag]))
    agent_spec = create_spec(action_type, agent_type, network)
    self._tf_agent = Agent.from_spec(agent_spec, {})
    self.action_translator = get_action_translator(action_type)
    self.preprocessor = get_observation_preprocessor(preprocessor_type)
def restore_agent(self, directory: str, filename: str = None):
    """Deserialize the strategy's learning agent from a file.

    Arguments:
        directory: The `str` path of the directory the agent checkpoint is stored in.
        filename (optional): The `str` name of the agent checkpoint file within `directory`.
    """
    self._agent = Agent.load(directory, filename=filename)

    self._runner = Runner(agent=self._agent, environment=self._environment)
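# Hedged usage sketch, pairing the restore above with an earlier save. It
# assumes the agent was previously written in Tensorforce's checkpoint format,
# e.g. via agent.save(directory=...) as in the other scripts in this
# collection; the directory and filename are illustrative.
strategy.restore_agent(directory='saved_models', filename='agent')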