def dqn_train(args):
    seasonals = (args.environment == 'seasonals-v1')
    save_dir = os.path.join(args.folder, args.agent_name)
    train_env = EnvWrap(gym.make('seasonals-v1'), batched=False,
                        subep_len=252, num_subeps=5) \
        if seasonals else OpenAIGym(args.environment)
    test_env = EnvWrap(gym.make('seasonals-v1')) \
        if seasonals else OpenAIGym(args.environment, monitor_video=1,
                                    monitor=os.path.join(save_dir, 'monitoring'))
    # layer sizes, activations and learning rates are expected to be defined
    # at module level (e.g. parsed from a hyperparameter file)
    agent = setup_agent(train_env.states, train_env.actions,
                        int(layer_1_size), int(layer_2_size),
                        layer_1_activation, layer_2_activation,
                        has_third_layer == 'True',
                        float(learning_rate), float(baseline_learning_rate),
                        save_dir=save_dir)
    rewards, test_rewards, test_episodes = train(
        agent, train_env, num_episodes=args.num_episodes)
    train_env.close()
    plot_rewards(rewards, test_rewards=test_rewards, test_episodes=test_episodes)
    loss, history = test(agent, test_env)
    graph_episode(history)
    # close the agent only after the final test run has used it
    agent.close()
def _generate_episode_data(episode_id, gym_id, monitor):
    if episode_id % 100 == 0:
        print('Computing game', episode_id)
    try:
        environment = OpenAIGym(
            gym_id=gym_id,
            monitor=monitor if episode_id == 0 else None,
            monitor_video=1 if episode_id == 0 else 0
        )
        state = environment.reset()
        world = environment.gym.unwrapped.world
        interface = VincentSalimInterface()
        interface.start(world)
        episode = []
        while True:
            interface.feed(world)
            action = interface.get_moves(world, 0)
            episode.append((state, action))
            state, terminal, step_reward = environment.execute(action)
            if terminal:
                break
        return episode
    except Exception as e:
        print('An exception occurred during game generation!', e)
        return []
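# A minimal usage sketch, assuming _generate_episode_data as defined above:
# the helper takes only picklable arguments and builds its own environment,
# so episodes can be generated in parallel with a process pool. The gym id
# and monitor directory below are placeholders, not the originals.
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        episodes = pool.starmap(
            _generate_episode_data,
            [(i, 'MyGame-v0', 'monitor_dir') for i in range(100)])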
def main():
    env = OpenAIGym("P3DX-v0")
    agent = DQNAgent(
        states=dict(type='float', shape=(80, 80, 4)),
        actions=dict(type='int', num_actions=7),
        network=[
            dict(type="conv2d", size=16, window=[8, 8], stride=4, activation="relu"),
            dict(type="conv2d", size=32, window=[4, 4], stride=2, activation="relu"),
            dict(type="flatten"),
            dict(type="dense", size=256)
        ],
        actions_exploration=dict(type="epsilon_decay", initial_epsilon=1.0,
                                 final_epsilon=0.1, timesteps=1000),
        memory=dict(type="replay", capacity=1000, include_next_states=True),
        update_mode=dict(unit="timesteps", batch_size=16, frequency=4),
        discount=0.99,
        entropy_regularization=None,
        double_q_model=True,
        optimizer=dict(type="adam", learning_rate=1e-4))

    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    try:
        while True:
            # Get action - no exploration and no observing
            action = agent.act(state, deterministic=True, independent=True)
            print(action)
            # Execute action in the environment
            state, terminal_state, reward = env.execute(action)
            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()
def main():
    # tensorforce
    env = OpenAIGym('JacoArm-v0')
    agent = TRPOAgent(states_spec=env.states,
                      actions_spec=env.actions,
                      network_spec=network_spec,
                      batch_size=512)
    # agent = PPOAgent(
    #     states_spec=env.states,
    #     actions_spec=env.actions,
    #     network_spec=network_spec,
    #     batch_size=512,
    #     step_optimizer=dict(
    #         type='adam',
    #         learning_rate=1e-4
    #     )
    # )
    runner = Runner(agent=agent, environment=env)

    # Gazebo has loading issues, so let the user start the learning
    raw_input("hit enter when gazebo is loaded...")
    print()
    env.gym.unpause()
    env.gym.hold_init_robot_pos([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0])
    runner.run(episodes=1500, max_episode_timesteps=1000,
               episode_finished=episode_finished)

    # old-fashioned way
    # env = gym.make('JacoArm-v0')
    # print "launching the world..."
    # # gz loading issues, let user start the learning
    # raw_input("hit enter when gazebo is loaded...")
    # env.set_physics_update(0.0001, 10000)
    # raw_input("hit enter when gazebo is loaded...")
    #
    # env.set_goal([0.167840578046, 0.297489331432, 0.857454500127])
    # total_episodes = 100
    # action = [1,1,1,1,1,1,1,1,1,1]
    # x = 0
    # # for x in range(total_episodes):
    # while True:
    #     if x % 10 == 0:
    #         action = numpy.random.rand(1, 10)[0]
    #         # print 'new action is', action
    #     state, reward, done, _ = env.step(action)
    #     print reward
    #     time.sleep(0.2)
    #     x += 1

    write_to_csv(train_data, 'test.csv')
    env.close()
def __init__(self, game, state=None):
    self.game = game
    if state is None:
        self.gym = retro.make(game)
    else:
        self.gym = retro.make(game, state=state)
    self.visualize = False
    self.states_spec = OpenAIGym.specs_from_gym_space(
        space=self.gym.observation_space, ignore_value_bounds=True)
    self.actions_spec = OpenAIGym.specs_from_gym_space(
        space=self.gym.action_space, ignore_value_bounds=False)
def main():
    gym_id = 'CartPole-v0'
    max_episodes = 10000
    max_timesteps = 1000

    env = OpenAIGym(gym_id)

    network_spec = [
        dict(type='dense', size=32, activation='tanh'),
        dict(type='dense', size=32, activation='tanh')
    ]

    agent = DQNAgent(
        states_spec=env.states,
        actions_spec=env.actions,
        network_spec=network_spec,
        batch_size=64
    )

    runner = Runner(agent, env)

    report_episodes = 10

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            logging.info("Finished episode {ep} after {ts} timesteps".format(
                ep=r.episode, ts=r.timestep))
            logging.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logging.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(max_episodes, max_timesteps, episode_finished=episode_finished)
    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
def states(self) -> Tuple[TensorForceStateType, TensorForceStateShape]:
    """The state space specification, required for `tensorforce` agents.

    The tuple contains the following attributes:

    - type: Either 'bool', 'int', or 'float'.
    - shape: The shape of the space. An `int` or `list`/`tuple` of `int`s.
    """
    from tensorforce.contrib.openai_gym import OpenAIGym
    return OpenAIGym.state_from_space(self.observation_space)
def actions(self) -> Tuple[TensorForceStateType, TensorForceStateShape, int,
                           TensorForceMinMaxValue, TensorForceMinMaxValue]:
    """The action space specification, required for `tensorforce` agents.

    The tuple contains the following attributes:

    - type: Either 'bool', 'int', or 'float'.
    - shape: The shape of the space. An `int` or `list`/`tuple` of `int`s.
    - num_actions (required if type == 'int'): The number of discrete actions.
    - min_value (optional if type == 'float'): An `int` or `float`. Defaults to `None`.
    - max_value (optional if type == 'float'): An `int` or `float`. Defaults to `None`.
    """
    from tensorforce.contrib.openai_gym import OpenAIGym
    return OpenAIGym.action_from_space(self.action_space)
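# A minimal sketch of the spec values these properties yield, assuming the
# tensorforce 0.x contrib wrapper (the exact container shape varies by
# version). For CartPole-v0:
#
#   state_from_space(Box(4,))    -> spec equivalent to dict(shape=(4,), type='float')
#   action_from_space(Discrete(2)) -> spec equivalent to dict(type='int', num_actions=2)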
def test_example(self):
    sys.stdout.write('\nQuickstart:\n')
    sys.stdout.flush()

    passed = 0
    for _ in xrange(3):
        # Create an OpenAI Gym environment
        environment = OpenAIGym('CartPole-v0')

        # Network specification for the model
        network_spec = [
            dict(type='dense', size=32),
            dict(type='dense', size=32)
        ]

        # Create the agent
        agent = PPOAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            batch_size=4000,
            step_optimizer=dict(type='adam', learning_rate=1e-2),
            optimization_steps=5,
            discount=0.99,
            normalize_rewards=False,
            entropy_regularization=0.01,
            likelihood_ratio_clipping=0.2)

        # Initialize the runner
        runner = Runner(agent=agent, environment=environment)

        # Function handle called after each finished episode
        def episode_finished(r):
            # Continue only while the mean reward over the last 50 episodes is
            # below 50.0; once reached, learning has taken off and the runner
            # stops early.
            mean_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or mean_reward < 50.0

        # Start the runner
        runner.run(episodes=2000, max_episode_timesteps=200,
                   episode_finished=episode_finished)
        sys.stdout.write('episodes: {}\n'.format(runner.episode))
        sys.stdout.flush()

        # Test passed if the episode_finished handle evaluated to False
        if runner.episode < 2000:
            passed += 1

    sys.stdout.write('==> passed: {}\n'.format(passed))
    sys.stdout.flush()
    self.assertTrue(passed >= 2)
def execute(self, actions):
    flat, hydrated, network = self.get_hypers(actions)

    env = OpenAIGym('CartPole-v0', visualize=True)
    env.viewer = None
    agent = agents_dict[self.agent](
        states_spec=env.states,
        actions_spec=env.actions,
        network_spec=network,
        **hydrated)

    # n_train, n_test = 2, 1
    n_train, n_test = 250, 30
    runner = Runner(agent=agent, environment=env)
    runner.run(episodes=n_train)  # train
    runner.run(episodes=n_test, deterministic=True)  # test

    # You may need to remove runner.py's close() calls so you have access to
    # runner.episode_rewards, see
    # https://github.com/lefnire/tensorforce/commit/976405729abd7510d375d6aa49659f91e2d30a07
    # I personally save away the results so I can play with them manually
    # w/ scikit-learn & SQL
    rewards = runner.episode_rewards
    reward = np.mean(rewards[-n_test:])
    print(flat, f"\nReward={reward}\n\n")

    sql = """
    INSERT INTO runs (hypers, reward_avg, rewards, agent, flag)
    VALUES (:hypers, :reward_avg, :rewards, :agent, :flag)
    """
    try:
        self.conn.execute(
            text(sql),
            hypers=json.dumps(flat),
            reward_avg=reward,
            rewards=rewards,
            agent='ppo_agent',
            flag=self.net_type)
    except Exception as e:
        pdb.set_trace()

    runner.close()
    return reward
def test_example(self):
    passed = 0

    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=256,
            memory=dict(
                type='prioritized_replay',
            ),
            update_frequency=256,
            first_update=512,
            learning_rate=0.0001,
            optimizer_batch_size=64,
            normalize_rewards=False,
            gae_rewards=False,
            baseline=dict(
                type="mlp",
                sizes=[32, 32],
                epochs=1,
                update_batch_size=64,
                learning_rate=0.001
            ),
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # Stop early once the mean reward over the last 50 episodes
            # reaches 50.0, which indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200,
                   episode_finished=episode_finished)

        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def _generate_episode_data(episode_id, gym_id, model_data, versus_model_data, monitor):
    try:
        environment = OpenAIGym(
            gym_id=gym_id,
            monitor=monitor if episode_id == 0 else None,
            monitor_video=1 if episode_id == 0 else 0)
        unwrapped_gym = environment.gym.unwrapped

        predictor = OpponentPredictor(env=unwrapped_gym, **model_data)
        if versus_model_data:
            unwrapped_gym.set_opponent_factory(
                lambda: OpponentPredictor(env=unwrapped_gym, **versus_model_data))

        state = environment.reset()
        reward = 0.0
        while True:
            action = predictor(state)
            state, terminal, step_reward = environment.execute(action)
            reward += step_reward
            if terminal:
                break
        return reward
    except Exception as e:
        print('An exception occurred during game generation!', e)
        return 0.0
def __init__(self,
             rng: Union[int, np.random.RandomState, None] = None,
             defaults: Union[Dict, None] = None,
             max_episodes: Union[int, None] = 3000):
    """
    Base class for the "cartpole" benchmark. In this benchmark a PPO agent
    tries to solve the cartpole task.

    Parameters
    ----------
    rng : int, None, np.random.RandomState
        RandomState for the experiment
    defaults : dict, None
        default configuration used for the PPO agent
    max_episodes : int, None
        Maximum number of episodes for the cartpole runner. Defaults to 3000.
    """
    logger.warning('This Benchmark is not deterministic.')
    super(CartpoleBase, self).__init__()

    self.rng = rng_helper.get_rng(rng=rng)
    tf.random.set_random_seed(0)
    np.random.seed(0)
    self.env = OpenAIGym('CartPole-v0', visualize=False)
    self.avg_n_episodes = 20
    self.max_episodes = max_episodes

    self.defaults = {
        "n_units_1": 64,
        "n_units_2": 64,
        "batch_size": 64,
        "learning_rate": 1e-3,
        "discount": 0.99,
        "likelihood_ratio_clipping": 0.2,
        "activation_1": "tanh",
        "activation_2": "tanh",
        "optimizer_type": "adam",
        "optimization_steps": 10,
        "baseline_mode": "states",
        "baseline_n_units_1": 64,
        "baseline_n_units_2": 64,
        "baseline_learning_rate": 1e-3,
        "baseline_optimization_steps": 10,
        "baseline_optimizer_type": "adam"
    }
    if defaults is not None:
        self.defaults.update(defaults)
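# A minimal usage sketch, assuming a concrete subclass of CartpoleBase is
# instantiated in practice: any key in the defaults dict built above can be
# overridden through the `defaults` parameter, e.g. for a slower-learning
# PPO configuration:
benchmark = CartpoleBase(
    rng=1,
    defaults={"learning_rate": 1e-4, "optimization_steps": 5})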
def test_example(self):
    passed = 0

    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Trust Region Policy Optimization agent
        agent = TRPOAgent(config=Configuration(
            log_level='info',
            batch_size=100,
            baseline=dict(
                type='mlp',
                size=32,
                hidden_layers=1,
                epochs=20,
                update_batch_size=32
            ),
            generalized_advantage_estimation=True,
            normalize_advantage=False,
            gae_lambda=0.97,
            max_kl_divergence=0.005,
            cg_iterations=20,
            cg_damping=0.01,
            ls_max_backtracks=20,
            ls_override=False,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])
        ))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # Stop early once the mean reward over the last 50 episodes
            # reaches 50.0, which indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200,
                   episode_finished=episode_finished)

        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
def experiment(args, env_name, base_agent="agent.json", agent_folder=None,
               visualize=True, num_episodes=1000):
    seasonals = (env_name == "seasonals-v1")
    train_env = OpenAIGym(env_name) \
        if not seasonals else EnvWrap(
            gym.make('seasonals-v1'), batched=True, subep_len=252, num_subeps=5)
    test_env = OpenAIGym(env_name, monitor_video=1,
                         monitor=os.path.join(agent_folder, "monitor")) \
        if not seasonals else EnvWrap(gym.make('seasonals-v1'))
    agent = setup_agent(train_env.states, train_env.actions, args,
                        save_dir=agent_folder, base_agent_file=base_agent)
    rewards, test_episodes, test_rewards = train(
        agent, train_env, num_episodes=num_episodes, test_env=train_env)
    train_env.close()
    if visualize:
        plot_rewards(rewards, test_episodes=test_episodes,
                     test_rewards=test_rewards, save_dir=agent_folder)
    reward, history = test(agent, test_env, start_index=(
        test_env.first_trading_day + 252 * 5 if seasonals else None))
    graph_episode(history, save_path=os.path.join(agent_folder, "test.png"))
    test_env.close()
    agent.close()
    # Note: test rewards appear to be recorded every fifth episode, so the
    # shorter test slices below cover the same episode span as the training
    # slices (e.g. the last 10 test entries span roughly 50 episodes).
    experiment_data = {
        "final_test_reward": reward,
        "test_average_last_50": np.mean(test_rewards[-10:]),
        "train_average_last_50": np.mean(rewards[-50:]),
        "test_average_last_10": np.mean(test_rewards[-2:]),
        "train_average_last_10": np.mean(rewards[-10:]),
    }
    experiment_data.update(args)
    return experiment_data
def test_example(self):
    passed = 0

    for _ in xrange(3):
        # Create an OpenAI Gym environment
        env = OpenAIGym('CartPole-v0')

        # Create a Proximal Policy Optimization agent
        agent = PPOAgent(config=Configuration(
            log_level='info',
            batch_size=4096,
            gae_lambda=0.97,
            learning_rate=0.001,
            entropy_penalty=0.01,
            epochs=5,
            optimizer_batch_size=512,
            loss_clipping=0.2,
            states=env.states,
            actions=env.actions,
            network=layered_network_builder([
                dict(type='dense', size=32, activation='tanh'),
                dict(type='dense', size=32, activation='tanh')
            ])))
        runner = Runner(agent=agent, environment=env)

        def episode_finished(r):
            # Stop early once the mean reward over the last 50 episodes
            # reaches 50.0, which indicates that learning took off
            avg_reward = np.mean(r.episode_rewards[-50:])
            return r.episode < 100 or avg_reward < 50.0

        runner.run(episodes=2000, max_timesteps=200,
                   episode_finished=episode_finished)

        if runner.episode < 2000:
            passed += 1

    print('Quick start example passed = {}'.format(passed))
    self.assertTrue(passed >= 2)
import numpy as np
import time

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym
import tensorflow as tf

cluster = {'ps': ['127.0.0.1:12222'], 'worker': ['127.0.0.1:12223']}
cluster_spec = tf.train.ClusterSpec(cluster)

# Create an OpenAI Gym environment
# ReversedAddition-v0
# CartPole-v0
env = OpenAIGym('CartPole-v0', visualize=True)

# Network as list of layers
network_spec = [
    dict(type='dense', size=32, activation='relu'),
    dict(type='dense', size=32, activation='relu')
]

distributed_spec = dict(
    cluster_spec=cluster_spec,
    task_index=0,
    device=('/job:worker'))

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import numpy as np

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAI Gym environment.
environment = OpenAIGym('CartPole-v0', visualize=False)

# Network as list of layers
# - Embedding layer:
#   - For Gym environments utilizing a discrete observation space, an
#     "embedding" layer should be inserted at the head of the network spec.
#     Such environments are usually identified by either:
#     - class ...Env(discrete.DiscreteEnv):
#     - self.observation_space = spaces.Discrete(...)
#   Note that depending on the following layers used, the embedding layer
#   *may* need a flattening layer after it.
network_spec = [
    # dict(type='embedding', indices=100, size=32),
    # dict(type='flatten'),
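# A minimal sketch of the embedding head described above, assuming a
# discrete-observation environment such as FrozenLake-v0, whose
# observation_space is spaces.Discrete(16); the layer sizes are placeholders:
#
#   network_spec = [
#       dict(type='embedding', indices=16, size=32),  # one index per discrete state
#       dict(type='flatten'),                         # flatten for the dense layers
#       dict(type='dense', size=32),
#       dict(type='dense', size=32)
#   ]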
# Set the logging system
rospack = rospkg.RosPack()
pkg_path = rospack.get_path('drone_training')
outdir = pkg_path + '/training_results'
rospy.loginfo("Monitor Wrapper started")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__file__)
logger.setLevel(logging.INFO)

environment = OpenAIGym(gym_id='QuadcopterLiveShow-v0',
                        monitor='output',
                        monitor_safe=False,
                        monitor_video=False,
                        visualize=True)
print(os.getcwd())

with open(
        '/root/catkin_ws/src/drone_training/drone_training/configs/dqn_ue4.json',
        'r') as fp:
    agent = json.load(fp=fp)

with open(
        '/root/catkin_ws/src/drone_training/drone_training/configs/mynet.json',
        'r') as fp:
    network = json.load(fp=fp)

agent = Agent.from_spec(
    spec=agent,
    kwargs=dict(
# limitations under the License.
# ==============================================================================

import numpy as np

from tensorforce.agents import RandomAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

import sc2gym
from absl import flags

FLAGS = flags.FLAGS
FLAGS([__file__])

# Create an OpenAI Gym environment
env = OpenAIGym('SC2CollectMineralShards-v2', visualize=False)

agent = RandomAgent(
    states_spec=env.states,
    actions_spec=env.actions,
)

# Create the runner
runner = Runner(agent=agent, environment=env)

# Callback function printing episode statistics
rewards = []


def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
import numpy as np
import json

from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAI Gym environment
env = OpenAIGym('Pendulum-v0', visualize=False)

network_path = './pendulum_ppo_network.json'
agent_path = './pendulum_ppo.json'

with open(network_path, 'r') as fp:
    network_spec = json.load(fp=fp)

with open(agent_path, 'r') as fp:
    agent_config = json.load(fp=fp)

agent = Agent.from_spec(spec=agent_config,
                        kwargs=dict(states=env.states,
                                    actions=env.actions,
                                    network=network_spec))

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):
    print(
        "Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
    return True
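# A minimal sketch of what the two JSON files loaded above could contain;
# their actual contents are not part of this snippet, so these bodies are
# assumptions. A network file is simply the serialized list-of-layers spec,
# and the agent file mirrors the Agent.from_spec keyword arguments:
#
#   pendulum_ppo_network.json:
#     [{"type": "dense", "size": 64, "activation": "tanh"},
#      {"type": "dense", "size": 64, "activation": "tanh"}]
#
#   pendulum_ppo.json:
#     {"type": "ppo_agent",
#      "batch_size": 4096,
#      "step_optimizer": {"type": "adam", "learning_rate": 1e-3}}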
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-w', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-m', '--monitor', help="Save results to this file")
    parser.add_argument('-M', '--mode', choices=['tmux', 'child'], default='tmux',
                        help="Starter mode")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-C', '--is-child', action='store_true')
    parser.add_argument('-i', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', default=False, help="Kill runners")
    parser.add_argument('-D', '--debug', action='store_true', default=False,
                        help="Show debug outputs")
    args = parser.parse_args()

    session_name = 'openai_async'
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.is_child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(
                    session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir)

        def build_cmd(index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=', sys.executable, target_script,
                args.gym_id,
                '--is-child',
                '--agent', args.agent,
                '--agent-config', os.path.join(os.getcwd(), args.agent_config),
                '--network-config', os.path.join(os.getcwd(), args.network_config),
                '--num-workers', args.num_workers,
                '--task-index', index
            ]
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(-1)))

        for i in xrange(args.num_workers):
            name = 'w_{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(
                    session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(i)))
        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))
        os.system("\n".join(cmds))
        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    if args.agent_config:
        agent_config = Configuration.from_json(args.agent_config)
    else:
        raise TensorForceError("No agent configuration provided.")
    if not args.network_config:
        raise TensorForceError("No network configuration provided.")
    agent_config.default(dict(
        states=environment.states,
        actions=environment.actions,
        network=from_json(args.network_config)))
    agent_config.default(dict(
        distributed=True,
        cluster_spec=cluster_spec,
        global_model=(args.task_index == -1),
        device=('/job:ps' if args.task_index == -1
                else '/job:worker/task:{}/cpu:0'.format(args.task_index))))

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[agent_config.log_level])

    agent = agents[args.agent](config=agent_config)

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(
        gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1,
                    cluster_spec=cluster_spec, task_index=args.task_index)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(
                ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
import numpy as np

from tensorforce.agents import PPOAgent, RandomAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAI Gym environment
# ReversedAddition-v0
# CartPole-v0
env = OpenAIGym('ReversedAddition-v0', visualize=False)
print(env.gym.observation_space)
print(env.gym.action_space)

# Network as list of layers
network_spec = [
    dict(type='embedding', size=32, indices=100),
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
    batch_size=4096,
    # Agent
    preprocessing=None,
    exploration=None,
    reward_preprocessing=None,
    # BatchAgent
# set the network layout
network_spec = [
    dict(type='dense', size=64),
    dict(type='dense', size=32),
    dict(type='dense', size=32)
]

for memory_type in memory_types:
    # create filename
    fn = 'Acrobot_10k_' + str(model_type) + '_' + str(memory_type) + '.pkl'
    print(fn)
    d1 = datetime.datetime.now()

    # set the Acrobot environment
    environment = OpenAIGym('Acrobot-v1', visualize=False)

    # define the memory and model types
    memory = define_memory(memory_type)
    double_model = define_model(model_type)

    # create the agent
    agent = create_agent(memory, double_model, environment)

    # create the runner
    runner = Runner(agent=agent, environment=environment)

    # teach the agent
    runner.run(episodes=10000, episode_finished=episode_finished)
    runner.close()

    # Print statistics
    print(
        "Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}."
        .format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))

    # print time taken
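# A minimal sketch of the define_memory helper used above; the author's actual
# implementation is not shown, so this body is an assumption based on
# tensorforce 0.x dict-style memory specs:
#
#   def define_memory(memory_type):
#       # memory_type is assumed to be e.g. 'replay' or 'prioritized_replay'
#       return dict(type=memory_type, capacity=10000, include_next_states=True)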
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true',
                        help="Choose actions deterministically")
    parser.add_argument('-M', '--mode', choices=('tmux', 'child'), default='tmux',
                        help="Starter mode")
    parser.add_argument('-W', '--num-workers', type=int, default=1,
                        help="Number of worker agents")
    parser.add_argument('-C', '--child', action='store_true', help="Child process")
    parser.add_argument('-P', '--parameter-server', action='store_true',
                        help="Parameter server")
    parser.add_argument('-I', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', help="Kill runners")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-D', '--debug', action='store_true', help="Show debug outputs")
    args = parser.parse_args()

    session_name = 'OpenAI-' + args.gym_id
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(
                    session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir)

        def build_cmd(ps, index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=', sys.executable, target_script,
                args.gym_id,
                '--agent', os.path.join(os.getcwd(), args.agent),
                '--network', os.path.join(os.getcwd(), args.network),
                '--num-workers', args.num_workers,
                '--child',
                '--task-index', index
            ]
            if args.episodes is not None:
                cmd_args.append('--episodes')
                cmd_args.append(args.episodes)
            if args.timesteps is not None:
                cmd_args.append('--timesteps')
                cmd_args.append(args.timesteps)
            if args.max_episode_timesteps is not None:
                cmd_args.append('--max-episode-timesteps')
                cmd_args.append(args.max_episode_timesteps)
            if args.deterministic:
                cmd_args.append('--deterministic')
            if ps:
                cmd_args.append('--parameter-server')
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#!/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0)))

        for i in xrange(args.num_workers):
            name = 'worker{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(
                    session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(ps=False, index=i)))
        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))
        os.system("\n".join(cmds))
        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # log_levels[agent.log_level]

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    if args.parameter_server:
        agent['device'] = '/job:ps/task:{}'.format(args.task_index)  # '/cpu:0'
    else:
        agent['device'] = '/job:worker/task:{}'.format(args.task_index)  # '/cpu:0'
    agent['execution'] = dict(
        type='distributed',
        distributed_spec=dict(
            cluster_spec=cluster_spec,
            task_index=args.task_index,
            job='ps' if args.parameter_server else 'worker',
            protocol='grpc'))

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(states=environment.states,
                    actions=environment.actions,
                    network=network))

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(
        gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {} after overall {} timesteps. Steps Per Second {}".format(
                r.agent.episode, r.agent.timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    runner.run(
        timesteps=args.timesteps,
        episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished)
    runner.close()
import numpy as np
import time
import matplotlib.pyplot as plt

from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

env = OpenAIGym('MountainCar-v0', visualize=False)

network_spec = [
    dict(type='dense', size=16, activation='relu'),
    dict(type='dense', size=16, activation='relu'),
    dict(type='dense', size=16, activation='relu')
]

agent = PPOAgent(
    states_spec=env.states,
    actions_spec=env.actions,
    network_spec=network_spec,
    batch_size=1024,
    # Agent
    # preprocessing=None,
    # exploration=None,
    # reward_preprocessing=None,
    # BatchAgent
    keep_last_timestep=True,
    # PPOAgent
    step_optimizer=dict(
        type='adam',
        learning_rate=1e-3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-i', '--import-modules',
                        help="Import module(s) required for environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False,
                        help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D', '--debug', action='store_true', default=False,
                        help="Show debug outputs")
    parser.add_argument('-te', '--test', action='store_true', default=False,
                        help="Test agent without learning.")
    parser.add_argument('-sl', '--sleep', type=float, default=None,
                        help="Slow down simulation by sleeping for x seconds (fractions allowed).")
    parser.add_argument('--job', type=str, default=None,
                        help="For distributed mode: The job type of this agent.")
    parser.add_argument('--task', type=int, default=0,
                        help="For distributed mode: The task index of this agent.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.visualize)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network,
        ))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)
        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished,
        testing=args.test,
        sleep=args.sleep)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', help='Agent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False,
                        help="Choose actions deterministically")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False,
                        help="Show debug outputs")
    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video)

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()
        logger.info("No agent configuration provided.")

    if args.network_spec:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=args.agent,
        kwargs=dict(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            config=config))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: report per timestep?
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {} after {} timesteps. Steps Per Second {}".format(
                r.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(
        timesteps=args.timesteps,
        episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished)

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
print("NOT ENOUGH LOGGING INFO") print("Please write more about the changes and reasoning.") exit() with open(f"{TB_path}/README/README.txt", "w") as readme: start_time_ascii = time.asctime(time.localtime(time.time())) algorithm = os.path.basename(__file__)[:-2] print(f"Experiment start time: {start_time_ascii}", file=readme) print(f"\nAlgorithm:\n{algorithm}", file=readme) print(f"\nThe Changes:\n{changes}", file=readme) print(f"\nReasoning:\n{reasoning}", file=readme) print(f"\nHypothesis:\n{hypothesis}", file=readme) print(f"\nResults:\n", file=readme) # Create an OpenAIgym environment. environment = OpenAIGym('BizHawk-v0', visualize=False) environment.gym.logging_folder_path = TB_path # Network as list of layers # - Embedding layer: # - For Gym environments utilizing a discrete observation space, an # "embedding" layer should be inserted at the head of the network spec. # Such environments are usually identified by either: # - class ...Env(discrete.DiscreteEnv): # - self.observation_space = spaces.Discrete(...) # Note that depending on the following layers used, the embedding layer *may* need a # flattening layer # BREADCRUMBS_START network_spec = [
import numpy as np
import json

from tensorforce.agents import Agent
from tensorforce.execution import Runner
from tensorforce.contrib.openai_gym import OpenAIGym

# Create an OpenAI Gym environment
env = OpenAIGym('MountainCar-v0', visualize=False)

network_path = './mountain_car_ppo_network.json'
agent_path = './mountain_car_ppo.json'

with open(network_path, 'r') as fp:
    network_spec = json.load(fp=fp)

with open(agent_path, 'r') as fp:
    agent_config = json.load(fp=fp)

agent = Agent.from_spec(
    spec=agent_config,
    kwargs=dict(
        states=env.states,
        actions=env.actions,
        network=network_spec
    )
)

# Create the runner
runner = Runner(agent=agent, environment=env)


# Callback function printing episode statistics
def episode_finished(r):