def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000 * 60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    environment = OpenAIUniverse(args.gym_id)
    environment.configure(remotes=1)

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(states=environment.states, actions=environment.actions, network=network_spec)
    )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    # Report every 1/1000th of the requested episodes, but never less often than every episode.
    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {} after {} timesteps. Steps Per Second {}".format(
                r.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
    # if args.monitor:
    #     environment.gym.monitor.close()
    environment.close()
def episode_finished(r):  # header assumed; only the tail of this callback appears in the original
    global iter
    global modelSaves
    plt.pause(0.01)
    # Save the model every 30 episodes.
    if iter == 30:
        iter = 0
        agent.save_model('longlongNoNorma/dense_mix')
        modelSaves = modelSaves + 1
    else:
        iter = iter + 1
    return True


# Start learning
runner.run(episodes=7000, max_episode_timesteps=(candles.candle_nums + 100), episode_finished=episode_finished)
# runner.run(episodes=1, max_episode_timesteps=(candles.candle_nums + 100), episode_finished=episode_finished, deterministic=True)

# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
    ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))
print(env.pair_currency)
print(env.base_currency)
runner.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.")
    parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.visualize
    )

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    # TEST
    agent["execution"] = dict(
        type="distributed",
        distributed_spec=dict(
            job=args.job,
            task_index=args.task,
            # parameter_server=(args.job == "ps"),
            cluster_spec=dict(ps=["192.168.2.107:22222"], worker=["192.168.2.107:22223"])
        )
    ) if args.job else None
    # END: TEST

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network,
        )
    )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)
        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
def main():
    args = make_args_parser()
    # print_config(args)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler(sys.stdout))

    # # Temporary for quick access
    # args.groups = 1
    # args.run_all = False
    # # args.episodes = 501
    # args.save_episodes = 3500
    # args.testing = False
    # args.target_group = 7
    # args.restore_agent = True
    # # args.save_agent = "./saved_model/" + path
    # input_path = "./saved_model/" + "V3/group7-1000/"
    # path = "V3/group7-1500/"
    # output_path = "./outputs/" + path

    # ~~~~~~~~~~~~~~~~~ Setting up the Model ~~~~~~~~~~~~~~~~~ #

    # Initialize environment (tensorforce's template)
    memory = {}
    environment = ReJoin(
        args.phase,
        args.query,
        args.episodes,
        args.groups,
        memory,
        args.mode,
        args.target_group,
        args.run_all
    )

    if args.agent_config is not None:
        with open(args.agent_config, "r") as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, "r") as fp:
            network_spec = json.load(fp=fp)
    else:
        raise TensorForceError("No network configuration provided.")

    # Set up the PPO Agent
    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
            variable_noise=0.5
        ),
    )

    if args.restore_agent != "":
        agent.restore_model(directory=args.restore_agent)

    runner = Runner(agent=agent, environment=environment)

    # ~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~ #

    report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            if args.save_agent != "" and args.testing is False and r.episode == args.save_episodes:
                save_dir = os.path.dirname(args.save_agent)
                if not os.path.isdir(save_dir):
                    try:
                        os.mkdir(save_dir, 0o755)
                    except OSError:
                        raise OSError("Cannot save agent to dir {} ()".format(save_dir))
                r.agent.save_model(directory=args.save_agent, append_timestep=True)
            logger.info("Episode {ep} reward: {r}".format(ep=r.episode, r=r.episode_rewards[-1]))
            logger.info("Average of last 100 rewards: {}\n".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    # Start training or testing
    runner.run(
        episodes=args.episodes,
        max_episode_timesteps=args.max_timesteps,
        episode_finished=episode_finished,
        deterministic=args.testing,
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    def find_convergence(eps):
        last = eps[-1]
        for i in range(1, len(eps)):
            if eps[i * -1] != last:
                print("Converged at episode:", len(eps) - i + 2)
                return True

    find_convergence(runner.episode_rewards)

    # plt.figure(1)
    # plt.hist(runner.episode_rewards)
    #
    # plt.figure(2)
    # plt.plot(runner.episode_rewards, "b.", MarkerSize=2)

    if not os.path.exists(args.outputs):
        os.makedirs(args.outputs)

    # Plot recorded costs over all episodes
    # print(memory)
    i = 2
    for file, val in memory.items():
        i += 1
        plt.figure(i)
        postgres_estimate = val["postgres_cost"]
        costs = np.array(val["costs"])
        max_val = max(costs)
        min_val = min(costs)
        plt.xlabel("episode")
        plt.ylabel("cost")
        plt.title(file)
        plt.scatter(np.arange(len(costs)), costs, c="g", alpha=0.5, marker=r"$\ast$", label="Cost")
        plt.legend(loc="upper right")
        plt.scatter(0, [min_val], c="r", alpha=1, marker=r"$\heartsuit$", s=200,
                    label="min cost observed=" + str(min_val))
        plt.scatter(0, [max_val], c="b", alpha=1, marker=r"$\times$", s=200,
                    label="max cost observed=" + str(max_val))
        plt.legend(loc="upper right")
        plt.scatter(0, [postgres_estimate], c="c", alpha=1, marker=r"$\star$", s=200,
                    label="postgreSQL estimate=" + str(postgres_estimate))
        plt.legend(loc="upper right")
        plt.savefig(args.outputs + file + ".png")

    plt.show(block=True)
def main():
    parser = argparse.ArgumentParser(description='Playground Flags.')
    parser.add_argument('--game', default='pommerman', help='Game to choose.')
    parser.add_argument('--config', default='ffa_v0', help='Configuration to execute.')
    parser.add_argument('--agents',
                        default='tensorforce::ppo,test::agents.SimpleAgent,test::agents.SimpleAgent,test::agents.SimpleAgent',
                        help='Comma delineated list of agent types and docker locations to run the agents.')
    parser.add_argument('--record_dir', help="Directory to record the PNGs of the game. Doesn't record if None.")
    args = parser.parse_args()

    config = utility.AttrDict(getattr(configs, args.config)())
    _agents = []
    for agent_id, agent_info in enumerate(args.agents.split(",")):
        agent = config.agent(agent_id, config.game_type)
        agent_type, agent_control = agent_info.split("::")
        assert agent_type in ["player", "random", "docker", "test", "tensorforce"]
        if agent_type == "player":
            assert agent_control in ["arrows"]
            on_key_press, on_key_release = utility.get_key_control(agent_control)
            agent = agents.PlayerAgent(agent, utility.KEY_INPUT, on_key_press=on_key_press, on_key_release=on_key_release)
        elif agent_type == "random":
            agent = agents.RandomAgent(agent)
        elif agent_type == "docker":
            agent = agents.DockerAgent(agent, docker_image=agent_control, docker_client=client, port=agent_id + 1000)
        elif agent_type == "test":
            agent = eval(agent_control)(agent)
        elif agent_type == "tensorforce":
            agent = agents.TensorForceAgent(agent, algorithm=agent_control)
            training_agent = agent
        _agents.append(agent)

    gym.envs.registration.register(
        id=config.env_id,
        entry_point=config.env_entry_point,
        kwargs=config.env_kwargs
    )
    env = config.env(**config.env_kwargs)
    env.set_agents(_agents)
    env.set_training_agent(training_agent.agent_id)
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, _agents))
    wrapped_env = WrappedEnv(env, visualize=True)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)

    try:
        runner.close()
    except AttributeError as e:
        pass
def main():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s"))
    logger.addHandler(console_handler)

    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    args = parser.parse_args()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    # Pick a state preprocessing that matches the first network layer
    # (a network spec is expected here; without one this lookup fails).
    if network_spec[0]['type'] == 'conv2d':
        agent_config['states_preprocessing'] = [{'type': 'expand_dims', 'axis': -1}]
    else:
        agent_config['states_preprocessing'] = [{'type': 'flatten'}]

    logger.info("Start training")

    environment = Game2048()
    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
        )
    )

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    def episode_finished(r):
        if r.episode % 100 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(
                ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Episode timesteps: {}".format(r.episode_timestep))
            logger.info("Episode largest tile: {}".format(r.environment.largest_tile))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(
        timesteps=6000000,
        episodes=1000,
        max_episode_timesteps=10000,
        deterministic=False,
        episode_finished=episode_finished
    )

    # Run one more episode with the trained agent and print the board after each move.
    terminal = False
    state = environment.reset()
    while not terminal:
        action = agent.act(state)
        state, terminal, reward = environment.execute(action)
        environment.print_state()

    runner.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-i', '--import-modules', help="Import module(s) required for environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    parser.add_argument('-te', '--test', action='store_true', default=False, help="Test agent without learning.")
    parser.add_argument('-sl', '--sleep', type=float, default=None,
                        help="Slow down simulation by sleeping for x seconds (fractions allowed).")
    parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.")
    parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.visualize
    )

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
        agent = Agent.from_spec(
            spec=agent,
            kwargs=dict(states=environment.states, actions=environment.actions, network=network)
        )
    else:
        logger.info("No network configuration provided.")
        agent = Agent.from_spec(
            spec=agent,
            kwargs=dict(states=environment.states, actions=environment.actions)
        )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {} ()".format(save_dir))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)
        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished,
        testing=args.test,
        sleep=args.sleep
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gym_id', default='CartPole-v0', help="Name of the OpenAI Gym Environment")
    parser.add_argument('-a', '--agent', type=str, default='PPO', help="Agent to train.")
    parser.add_argument('-e', '--episodes', type=int, default=20, help="Number of episodes to train for.")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps to train for.")
    parser.add_argument('-nv', '--novisualize', action='store_false', default=True,
                        help="Don't visualize training (will speed up training)")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False,
                        help="Choose deterministically and don't use random actions.")
    parser.add_argument('-l', '--load', help="Load pretrained agent from this particular directory.")
    parser.add_argument('-nm', '--num-episodes-to-test', type=int, default=10,
                        help="Number of episodes to test the loaded policy for.")
    parser.add_argument('-x', '--exp', type=str, default='exp_test_delete_this',
                        help="Name of experiment for logging/saving weights.")
    parser.add_argument('--monitor', default='./logs/', help="Save results and logs to this directory.")
    parser.add_argument('--save', default='./weights/', help="Save trained model to this directory.")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results.")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled).")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs.")
    args = parser.parse_args()

    # Load the required agent from custom module
    logger.info('Loading {} Agent/Network'.format(args.agent))
    if args.agent.lower() == 'ddpg':
        from modules.custom_agents import DDPG_Agent_Network
        agent, network = DDPG_Agent_Network()
    elif args.agent.lower() == 'naf':
        from modules.custom_agents import NAF_Agent_Network
        agent, network = NAF_Agent_Network()
    elif args.agent.lower() == 'trpo':
        from modules.custom_agents import TRPO_Agent_Network
        agent, network = TRPO_Agent_Network()
    elif args.agent.lower() == 'ppo':
        from modules.custom_agents import PPO_Agent_Network
        agent, network = PPO_Agent_Network()
    elif args.agent.lower() == 'vpg':
        from modules.custom_agents import VPG_Agent_Network
        agent, network = VPG_Agent_Network()

    logfilepath = os.path.join(args.monitor, args.agent, args.exp)
    if not args.load:
        logger.info('Creating logging folder {}'.format(logfilepath))
        os.system('mkdir -p {}'.format(logfilepath))

    env = OpenAIGym(
        gym_id=args.gym_id,
        monitor=logfilepath,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video,
        visualize=args.novisualize
    )

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=env.states,
            actions=env.actions,
            network=network,
        )
    )

    if args.load:
        logger.info("Testing pre-trained model!")
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)
        logger.info('Loaded pre-trained model weights!')
        logger.info('Starting testing process!')
        env = gym.make(args.gym_id)
        for _i in range(args.num_episodes_to_test):
            logger.info('Episode: {}'.format(_i))
            s = env.reset()
            done = False
            while not done:
                env.render()
                action = agent.act(s)
                s, r, done, _ = env.step(action)
                # TODO: Make a logger here similar to episode_end()
        return
    def episode_finished(r, id):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=env, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 1

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished
    )

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))

    filepath = os.path.join(args.save, args.agent, args.exp)
    logger.info('Creating directory {}'.format(filepath))
    os.system('mkdir -p {}'.format(filepath))  # recursive mkdir
    logger.info("Saving trained model to {}!".format(filepath))
    filepath = agent.save_model(os.path.join(filepath, 'model'), append_timestep=False)
    logger.info("Saved trained model as: {}".format(filepath))
    runner.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-P', '--port', default=6025,
                        help="Port on which the UE4 Game listens on for incoming RL-client connections")
    parser.add_argument('-H', '--host', default=None, help="Hostname of the UE4 Game (default: localhost)")
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    parser.add_argument('-R', '--random-test-run', action="store_true", help="Do a quick random test run on the env")
    args = parser.parse_args()

    # logging.basicConfig(filename="logfile.txt", level=logging.INFO)
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # We have to connect this remote env to get the specs.
    # We also discretize axis-mappings b/c we will use a deep q-network.
    # Use num_ticks==6 to match Nature paper by Mnih et al.
    # ("human cannot press fire button with more than 10Hz", dt=1/60)
    # TODO: Need to build in capturing and concat'ing last 4 images (plus 8-bit conversion!) into 1 input state signal.
    # TODO: Use pre-processor for that.
    environment = UE4Environment(host=args.host, port=args.port, connect=True, discretize_actions=True, num_ticks=6)
    environment.seed(200)

    # Do a quick random test-run with image capture of the first n images -> then exit after 1000 steps.
    if args.random_test_run:
        # Reset the env.
        s = environment.reset()
        img_format = "RGB" if len(environment.states["shape"]) == 3 else "L"
        img = Image.fromarray(s, img_format)
        # Save first received image as a sanity-check.
        img.save("reset.png")
        for i in range(1000):
            s, is_terminal, r = environment.execute(action=random.choice(range(environment.actions["num_actions"])))
            if i < 10:
                img = Image.fromarray(s, img_format)
                img.save("{:03d}.png".format(i))
            logging.debug("i={} r={} term={}".format(i, r, is_terminal))
            if is_terminal:
                environment.reset()
        quit()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(states=environment.states, actions=environment.actions, network=network_spec)
    )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.global_timestep / (time.time() - r.start_time)
            logger.info("Finished episode {} after {} timesteps. SPS={}".format(
                r.global_episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    runner.run(
        timesteps=args.timesteps,
        episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
class Ezpeezy():
    """
    This class is used to encompass all user behavior and interactions.
    ...

    Attributes
    ----------
    _env : tensorforce.Environment
        custom environment used to define hyperparameter space and reward functions
    _agent : tensorforce.Agent
        dqn agent used to optimize the reward function defined in the environment
    _runner : tensorforce.Runner
        used to handle the training job of the agent

    Methods
    -------
    set_k_folds(n_folds, pick_random=None)
        Specifies to the environment what sort of cross-validation data configuration to use.
    train_on_data(X_train, y_train, X_valid=None, y_valid=None)
        Specifies to the environment what data to use for training.
    get_history()
        Returns the history of the agent including the configurations it has already tested.
    run(num_episodes)
        Begins using the agent to discover the actions required to optimize the environment's reward.
    """

    def __init__(self, config, model_fn, model_type='sklearn', model_train_batch_size=256,
                 model_train_epochs=75, exploration=0.9, exploration_decay_rate=0.8,
                 monitor_metric='val_loss', opt='max', starting_tol=-0.01, tol_decay=0.5,
                 deepqn_lr=1e-15):
        """
        Parameters
        ----------
        config : dict
            a dictionary representing the configuration of the hyperparameter space;
            keys represent the name of the hyperparameter while values represent ranges
            of the parameter space and its type
        model_fn : function
            function that returns the model you want to optimize
        model_type : string
            "sklearn" to signify that the passed-in model_fn is from the sklearn library, or
            "keras" to signify that the passed-in model_fn is made from the keras library
        model_train_batch_size : int
            the batch size to use when training your model
        model_train_epochs : int
            number of epochs to train your model for on each iteration
        exploration : float
            the agent's exploration value
        exploration_decay_rate : float
            the agent's exploration value's decay rate (uses exponential decay)
        monitor_metric : None or string or function
            the metric you would like to optimize in your model: a string if
            model_type == 'keras', a function if model_type == 'sklearn', or None to use
            the .score(X, y) method of the sklearn classifier; if a function, it takes
            y_true, y_pred and returns a numeric type
        opt : string
            the optimization direction of the given monitor_metric
        starting_tol : int/float
            the value that you would like to see your metric increase by at each training
            step, or else end the agent's episode
        tol_decay : int/float
            at each training step in the episode, decrease the tolerance by this value
        deepqn_lr : float
            learning rate to use for the DQN
        """
        self._env = CustomEnvironment(
            config,
            model_train_epoch=model_train_epochs,
            model_train_batch_size=model_train_batch_size,
            model_fn=model_fn,
            model_type=model_type,
            monitor_metric=monitor_metric,
            opt=opt,
            starting_tol=starting_tol,
            tol_decay=tol_decay)
        self._agent = DeepQNetwork(
            states=self._env.states(),
            actions=self._env.actions(),
            max_episode_timesteps=self._env.max_episode_timesteps(),
            memory=60,
            batch_size=3,
            exploration=dict(type='decaying', unit='timesteps', decay='exponential',
                             initial_value=exploration, decay_steps=100000,
                             decay_rate=exploration_decay_rate),
            discount=dict(type='decaying', unit='timesteps', decay='exponential',
                          initial_value=0.7, decay_steps=100000, decay_rate=0.5),
            learning_rate=deepqn_lr)
        self.runner = Runner(agent=self._agent, environment=self._env)

    def set_k_folds(self, n_folds, pick_random=None):
        """
        Specifies to the environment what sort of cross-validation data configuration to use.

        Parameters
        ----------
        n_folds : int
            the number of folds to divide your dataset into using k-fold cross-validation
        pick_random : int/None
            if set to an int, randomly select pick_random of the n_folds to use for
            training your model
        """
        assert isinstance(n_folds, int), 'n_folds must be an int'
        # pick_random must be None or an int smaller than n_folds (short-circuit checks
        # so that the default pick_random=None never hits the comparison).
        assert (pick_random is None) or (isinstance(pick_random, int) and pick_random < n_folds), \
            "pick random must be an int less than n_folds or None"
        self._env.set_k_folds(n_folds, pick_random)

    def train_on_data(self, X_train, y_train, X_valid=None, y_valid=None):
        """
        Specifies to the environment what data to use for training.

        Parameters
        ----------
        X_train : iterable
            data used to train your model
        y_train : iterable
            labels used to train your model
        X_valid : iterable/None
            data used to validate your model unless using k-fold CV
        y_valid : iterable/None
            labels used to validate your model unless using k-fold CV
        """
        self._env.train_on_data(X_train, y_train, X_valid, y_valid)

    def get_history(self):
        """
        Returns the history of the agent including the configurations it has already tested.

        Returns
        -------
        pd.DataFrame
            DataFrame representing each absolute time step with its episode, configuration
            and monitored metric
        """
        return self._env.get_history()

    def run(self, num_episodes):
        """
        Begins using the agent to discover the actions required to optimize the environment's reward.

        Parameters
        ----------
        num_episodes : int
            number of episodes to try your agent for on your environment

        Prints
        ------
        the best parameters for your goal.
        """
        self._env.reset_history()
        self._env.set_num_episodes(num_episodes)
        self.runner.run(num_episodes=num_episodes)
        print('Best parameters are:')
        print(self._env.get_best_params())
        self.runner.close()
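# A minimal usage sketch for the class above, assuming an sklearn workflow. Only the
# Ezpeezy constructor arguments and the set_k_folds / train_on_data / run / get_history
# calls come from the docstrings; the search-space encoding in `config`, the model_fn
# body, and the iris data are hypothetical illustrations.
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X_train, y_train = load_iris(return_X_y=True)


def model_fn():
    # Return a fresh sklearn estimator to be tuned.
    return LogisticRegression(max_iter=200)


config = {'C': ([0.01, 10.0], 'float')}  # hypothetical hyperparameter-space encoding
tuner = Ezpeezy(config, model_fn, model_type='sklearn', monitor_metric=None, opt='max')
tuner.set_k_folds(n_folds=5, pick_random=3)   # 5-fold CV, train on 3 random folds
tuner.train_on_data(X_train, y_train)
tuner.run(num_episodes=20)                    # prints the best parameters found
history = tuner.get_history()                 # DataFrame of configurations tried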
def main():
    # Print all possible environments in the Pommerman registry

    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent = json.load(fp=fp)
    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)
    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add 3 random agents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add TensorforceAgent
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Instantiate and run the environment for 5 episodes.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)
    runner = Runner(agent=agent, environment=wrapped_env)

    rewards = []
    episodes = []

    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
        if r.episode % 1000 == 0:
            agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))
        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")
        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, Train, and Save Model
    if args.test or args.resume:  # If test, change settings and restore model
        agent.restore_model('./', 'PPO_K_someS_500batch_biggerreward_99dis')
    runner.run(episodes=EPISODES, max_episode_timesteps=2000, episode_finished=episode_finished, deterministic=False)
    if not args.test:
        agent.save_model(('./{}').format(EXPERIMENT_NAME), False)

    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    # Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_len = len(prev_data[0])
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass
def main():
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game", default="pommerman", help="Game to choose.")
    parser.add_argument("--config", default="PommeFFA-v0",
                        help="Configuration to execute. See env_ids in configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                                "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars to pass to Docker. "
                             "This is only for the Docker Agent. An example is "
                             "'0:foo=bar:baz=lar,3:foo=lam', which would send two arguments to "
                             "Docker Agent 0 and one to Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir", default=None,
                        help="Directory to record the PNGs of the game. Doesn't record if None.")
    parser.add_argument("--record_json_dir", default=None,
                        help="Directory to record the JSON representations of the game. Doesn't record if None.")
    parser.add_argument("--render", default=True, help="Whether to render or not. Defaults to True.")
    parser.add_argument("--game_state_file", default=None,
                        help="File from which to load game state. Defaults to None.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    # this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id + 1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if type(agent) == TensorForceAgent:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)

    try:
        runner.close()
    except AttributeError as e:
        pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    args = parser.parse_args()

    # From quickstart on docs
    # Network as list of layers
    # This is from mlp2_embedding_network.json
    network_spec = [
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        },
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        }
    ]

    DATAPATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    observedFile = os.path.join(DATAPATH, r"prnio.int")
    infoFile = os.path.join(DATAPATH, r"prnio.cfl")

    environment = PycrysfmlEnvironment(observedFile, infoFile)

    # Get agent configuration
    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
        )
    )

    # Use this line to restore a pre-trained agent
    # agent.restore_model(file="/mnt/storage/deepQmodel_chisq")

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    rewardsLog = []
    steps = []

    def episode_finished(r):
        if r.episode % 10 == 0:
            rewardsLog.append(r.episode_rewards[-1])
            steps.append(r.episode)
        if r.episode % 50 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            file = open("/mnt/storage/trainingLog", "a")
            file.write("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}\n".format(
                ep=r.episode, ts=r.timestep, sps=sps))
            file.write("Episode reward: {}\n".format(r.episode_rewards[-1]))
            file.write("Episode timesteps: {}\n".format(r.episode_timestep))
            file.write("Average of last 500 rewards: {}\n".format(sum(r.episode_rewards[-500:]) / 500))
            file.write("Average of last 100 rewards: {}\n".format(sum(r.episode_rewards[-100:]) / 100))
            file.close()
            agent.save_model(directory="/mnt/storage/deepQmodel_simpleA_stdreward", append_timestep=False)
        return True

    runner.run(
        timesteps=60000000,
        episodes=5000,
        max_episode_timesteps=1000,
        deterministic=False,
        episode_finished=episode_finished
    )

    # Graph rewards
    plt.scatter(steps, rewardsLog)
    plt.savefig('/mnt/storage/rewardLog_simpleA_stdreward.png')

    runner.close()
def compute(self, config_id, config, budget, working_directory):
    if self.environment.max_episode_timesteps() is None:
        min_capacity = 1000 + config['batch_size']
    else:
        min_capacity = self.environment.max_episode_timesteps() + config['batch_size']
    max_capacity = 100000
    capacity = min(max_capacity, max(min_capacity, config['memory'] * config['batch_size']))
    frequency = max(16, int(config['frequency'] * config['batch_size']))

    if config['ratio_based'] == 'yes':
        ratio_based = True
        clipping_value = config['clipping_value']
    else:
        ratio_based = False
        clipping_value = 0.0

    if config['baseline'] == 'no':
        baseline_policy = None
        baseline_objective = None
        baseline_optimizer = None
        estimate_horizon = False
        estimate_terminal = False
        estimate_advantage = False
    else:
        estimate_horizon = 'early'
        estimate_terminal = True
        estimate_advantage = (config['estimate_advantage'] == 'yes')
        if config['baseline'] == 'same-policy':
            baseline_policy = None
            baseline_objective = None
            baseline_optimizer = None
        elif config['baseline'] == 'auto':
            # other modes, shared network/policy etc !!!
            baseline_policy = dict(network=dict(type='auto', internal_rnn=False))
            baseline_objective = dict(type='state_value', huber_loss=0.0, early_reduce=False)
            baseline_optimizer = dict(type='adam', learning_rate=config['baseline_learning_rate'])
        else:
            assert False

    if config['entropy_regularization'] < 3e-5:  # yes/no better
        entropy_regularization = 0.0
    else:
        entropy_regularization = config['entropy_regularization']

    agent = dict(
        agent='tensorforce',
        policy=dict(network=dict(type='auto', internal_rnn=False)),
        memory=dict(type='replay', capacity=capacity),
        update=dict(unit='timesteps', batch_size=config['batch_size'], frequency=frequency),
        optimizer=dict(type='adam', learning_rate=config['learning_rate']),
        objective=dict(type='policy_gradient', ratio_based=ratio_based, clipping_value=clipping_value, early_reduce=False),
        reward_estimation=dict(
            horizon=config['horizon'],
            discount=config['discount'],
            estimate_horizon=estimate_horizon,
            estimate_actions=False,
            estimate_terminal=estimate_terminal,
            estimate_advantage=estimate_advantage
        ),
        baseline_policy=baseline_policy,
        baseline_objective=baseline_objective,
        baseline_optimizer=baseline_optimizer,
        preprocessing=None,
        l2_regularization=0.0,
        entropy_regularization=entropy_regularization
    )

    # num_episodes = list()
    final_reward = list()
    max_reward = list()
    rewards = list()

    for n in range(round(budget)):
        runner = Runner(agent=agent, environment=self.environment)

        # performance_threshold = runner.environment.max_episode_timesteps() - agent['reward_estimation']['horizon']

        # def callback(r, p):
        #     return True

        runner.run(num_episodes=500, use_tqdm=False)
        runner.close()

        # num_episodes.append(len(runner.episode_rewards))
        final_reward.append(float(np.mean(runner.episode_rewards[-20:], axis=0)))
        average_rewards = [
            float(np.mean(runner.episode_rewards[n: n + 20], axis=0))
            for n in range(len(runner.episode_rewards) - 20)
        ]
        max_reward.append(float(np.amax(average_rewards, axis=0)))
        rewards.append(list(runner.episode_rewards))

    # mean_num_episodes = float(np.mean(num_episodes, axis=0))
    mean_final_reward = float(np.mean(final_reward, axis=0))
    mean_max_reward = float(np.mean(max_reward, axis=0))
    # loss = mean_num_episodes - mean_final_reward - mean_max_reward
    loss = -mean_final_reward - mean_max_reward

    return dict(loss=loss, info=dict(rewards=rewards))
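# A hedged smoke-test sketch for the compute() method above. The keys below mirror the
# config[...] lookups in that code (batch_size, memory, frequency, ratio_based,
# clipping_value, baseline, estimate_advantage, baseline_learning_rate,
# entropy_regularization, learning_rate, horizon, discount); the concrete values are
# illustrative only, and `worker` stands in for whatever object carries this method.
example_config = dict(
    batch_size=32, memory=50, frequency=0.5, ratio_based='yes', clipping_value=0.2,
    baseline='auto', estimate_advantage='yes', baseline_learning_rate=1e-3,
    entropy_regularization=1e-3, learning_rate=1e-3, horizon=10, discount=0.99,
)
# result = worker.compute(config_id=(0, 0, 0), config=example_config, budget=1,
#                         working_directory='.')
# result['loss'] is -(mean final reward) - (mean max reward), and
# result['info']['rewards'] holds the per-run episode reward lists.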
def main():
    env = OpenAIGym("P3DX-v0")

    agent = DQNAgent(
        states=dict(type='float', shape=(80, 80, 4)),
        actions=dict(type='int', num_actions=7),
        network=[
            dict(type="conv2d", size=16, window=[8, 8], stride=4, activation="relu"),
            dict(type="conv2d", size=32, window=[4, 4], stride=2, activation="relu"),
            dict(type="flatten"),
            dict(type="dense", size=256)
        ],
        actions_exploration=dict(
            type="epsilon_decay",
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=1000
        ),
        memory=dict(
            type="replay",
            capacity=1000,
            include_next_states=True
        ),
        update_mode=dict(
            unit="timesteps",
            batch_size=16,
            frequency=4
        ),
        discount=0.99,
        entropy_regularization=None,
        double_q_model=True,
        optimizer=dict(
            type="adam",
            learning_rate=1e-4
        )
    )

    try:
        agent.restore_model(directory="data/", file="data-117246")
        print("Found data!")
    except:
        print("Can't load data")

    SAVE_INTERVAL = 10

    def episode_finished(r):
        # print(r.episode)
        if r.episode % SAVE_INTERVAL == 0:
            print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            print("Episode reward: {}".format(r.episode_rewards[-1]))
            print("Average of last {} rewards: {}\n".format(SAVE_INTERVAL, np.mean(r.episode_rewards[-SAVE_INTERVAL:])))
            r.agent.save_model(directory="data/data", append_timestep=True)
            with open("reward_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                for reward in r.episode_rewards[-SAVE_INTERVAL:]:
                    writer.writerow([r.episode, reward])
            with open("episode_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([r.episode, r.timestep])
            '''
            with open("individual_reward_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([r.episode, r.episode_rewards[-1]])
            '''
        return True

    runner = Runner(
        agent=agent,      # Agent object
        environment=env   # Environment object
    )

    max_episodes = 10000
    max_timesteps = 50000000

    runner.run(max_timesteps, max_episodes, episode_finished=episode_finished)
    runner.close()
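# A short companion sketch for the callback above: reward_history.csv is appended as
# headerless (episode, reward) rows, so it can be loaded and inspected afterwards. The
# file name and save interval come from the snippet above; the pandas/matplotlib usage
# here is just one possible way to plot it.
import matplotlib.pyplot as plt
import pandas as pd

history = pd.read_csv("reward_history.csv", header=None, names=["episode", "reward"])
# Smooth with a rolling mean over the save interval used above (10 episodes).
history["smoothed"] = history["reward"].rolling(10, min_periods=1).mean()
history.plot(x="episode", y=["reward", "smoothed"])
plt.xlabel("episode")
plt.ylabel("reward")
plt.savefig("reward_history.png")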
def main():
    '''CLI interface to bootstrap training'''
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game", default="pommerman", help="Game to choose.")
    parser.add_argument("--config", default="PommeFFACompetition-v0",
                        help="Configuration to execute. See env_ids in configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                                "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars to pass to Docker. "
                             "This is only for the Docker Agent. An example is "
                             "'0:foo=bar:baz=lar,3:foo=lam', which would send two arguments to "
                             "Docker Agent 0 and one to Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir", default=None,
                        help="Directory to record the PNGs of the game. Doesn't record if None.")
    parser.add_argument("--record_json_dir", default=None,
                        help="Directory to record the JSON representations of the game. Doesn't record if None.")
    parser.add_argument("--render", default=False, action='store_true',
                        help="Whether to render or not. Defaults to False.")
    parser.add_argument("--game_state_file", default=None,
                        help="File from which to load game state. Defaults to None.")
    parser.add_argument("--model_save_dir", default="./ppo_model/model",
                        help="Directory to save the learnt models.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    # this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id + 1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        # if type(agent) == TensorForceAgent:
        if agent.trainable:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)

    num_epi = 200000
    vis_epi = 100
    max_reward = -10.0
    for i in range(num_epi // vis_epi):
        runner.run(episodes=vis_epi, max_episode_timesteps=2000)
        m_reward = np.mean(runner.episode_rewards[-vis_epi:])
        m_step = np.mean(runner.episode_timesteps[-vis_epi:])
        m_time = np.mean(runner.episode_times[-vis_epi:])
        print("[Iter %s]: %.3f %.3f %.3f" % (i, m_reward, m_step, m_time))
        sys.stdout.flush()
        if m_reward > max_reward:
            max_reward = m_reward
            agent.save_model(args.model_save_dir, False)
            print("[Save] max_reward=%s" % (max_reward))

    try:
        runner.close()
    except AttributeError as e:
        pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', help="Choose actions deterministically")
    parser.add_argument('-M', '--mode', choices=('tmux', 'child'), default='tmux', help="Starter mode")
    parser.add_argument('-W', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-C', '--child', action='store_true', help="Child process")
    parser.add_argument('-P', '--parameter-server', action='store_true', help="Parameter server")
    parser.add_argument('-I', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', help="Kill runners")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-D', '--debug', action='store_true', help="Show debug outputs")
    args = parser.parse_args()

    session_name = 'OpenAI-' + args.gym_id
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(ps, index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--agent', os.path.join(os.getcwd(), args.agent),
                '--network', os.path.join(os.getcwd(), args.network),
                '--num-workers', args.num_workers,
                '--child',
                '--task-index', index
            ]
            if args.episodes is not None:
                cmd_args.append('--episodes')
                cmd_args.append(args.episodes)
            if args.timesteps is not None:
                cmd_args.append('--timesteps')
                cmd_args.append(args.timesteps)
            if args.max_episode_timesteps is not None:
                cmd_args.append('--max-episode-timesteps')
                cmd_args.append(args.max_episode_timesteps)
            if args.deterministic:
                cmd_args.append('--deterministic')
            if ps:
                cmd_args.append('--parameter-server')
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = [
                'mkdir -p {}'.format(args.logdir),
                'rm -f {}/kill.sh'.format(args.logdir),
                'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                'chmod +x {}/kill.sh'.format(args.logdir)
            ]
        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0)))

        for i in xrange(args.num_workers):
            name = 'worker{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(ps=False, index=i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))
        os.system("\n".join(cmds))
        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # log_levels[agent.log_level]

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    if args.parameter_server:
        agent['device'] = '/job:ps/task:{}'.format(args.task_index)  # '/cpu:0'
    else:
        agent['device'] = '/job:worker/task:{}'.format(args.task_index)  # '/cpu:0'

    agent['distributed'] = dict(
        cluster_spec=cluster_spec,
        task_index=args.task_index,
        parameter_server=args.parameter_server
    )

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network
        )
    )

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
Steps Per Second {}".format( r.agent.episode, r.agent.timestep, steps_per_second) ) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) return True runner.run( timesteps=args.timesteps, episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, deterministic=args.deterministic, episode_finished=episode_finished ) runner.close()
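# A minimal sketch of the JSON spec files the distributed runner above expects via
# --agent and --network. The key names mirror the PPO-style configs used elsewhere in
# these scripts, but the exact fields are illustrative assumptions, not a fixed schema.
import json
import os

agent_spec = {
    "type": "ppo_agent",
    "step_optimizer": {"type": "adam", "learning_rate": 1e-4},
    "batching_capacity": 1000
}

network_spec = [
    {"type": "dense", "size": 64, "activation": "relu"},
    {"type": "dense", "size": 64, "activation": "relu"}
]

os.makedirs("configs", exist_ok=True)
with open(os.path.join("configs", "ppo.json"), "w") as fp:
    json.dump(agent_spec, fp, indent=4)
with open(os.path.join("configs", "mlp2_network.json"), "w") as fp:
    json.dump(network_spec, fp, indent=4)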
def compute(self, config_id, config, budget, working_directory): if self.environment.max_episode_timesteps() == None: min_capacity = 1000 + config['batch_size'] else: min_capacity = self.environment.max_episode_timesteps() + config['batch_size'] max_capacity = 100000 capacity = min(max_capacity, max(min_capacity, config['memory'] * config['batch_size'])) frequency = max(4, int(config['frequency'] * config['batch_size'])) if config['baseline'] == 'no': baseline_policy = None baseline_objective = None baseline_optimizer = None estimate_horizon = False estimate_terminal = False estimate_advantage = False else: estimate_horizon = 'late' estimate_advantage = (config['estimate_advantage'] == 'yes') if config['baseline'] == 'same-policy': baseline_policy = None baseline_objective = None baseline_optimizer = None elif config['baseline'] == 'auto': # other modes, shared network/policy etc !!! baseline_policy = dict(network=dict(type='auto', internal_rnn=False)) baseline_objective = dict( type='value', value='state', huber_loss=0.0, early_reduce=False ) baseline_optimizer = dict( type='adam', learning_rate=config['baseline_learning_rate'] ) else: assert False if config['l2_regularization'] < 3e-5: # yes/no better l2_regularization = 0.0 else: l2_regularization = config['l2_regularization'] if config['entropy_regularization'] < 3e-5: # yes/no better entropy_regularization = 0.0 else: entropy_regularization = config['entropy_regularization'] # Set agent configuration according to configspace print("### Set agent configuration according to configspace") agent = dict( agent='tensorforce', policy=dict(network=dict(type='auto', internal_rnn=False)), memory=dict(type='replay', capacity=capacity), # replay, recent update=dict(unit='timesteps', batch_size=config['batch_size'], frequency=frequency), # timesteps, episode optimizer=dict(type='adam', learning_rate=config['learning_rate']), objective=dict( type='policy_gradient', ratio_based=True, clipping_value=0.1, early_reduce=False ), reward_estimation=dict( horizon=config['horizon'], discount=config['discount'], estimate_horizon=estimate_horizon, estimate_actions=False, estimate_terminal=False, estimate_advantage=estimate_advantage ), baseline_policy=baseline_policy, baseline_objective=baseline_objective, baseline_optimizer=baseline_optimizer, preprocessing=None, l2_regularization=l2_regularization, entropy_regularization=entropy_regularization ) # Set state representation according to configspace print("### Set state representation according to configspace") # Example state configurations to evaluate config_state = None if config['state'] == 0: config_state = [] elif config['state'] == 1: config_state = ['bin_buffer_fill'] elif config['state'] == 2: config_state = ['bin_buffer_fill', 'distance_to_action'] elif config['state'] == 3: config_state = ['bin_buffer_fill', 'distance_to_action', 'bin_machine_failure'] elif config['state'] == 4: config_state = ['bin_buffer_fill', 'distance_to_action', 'bin_machine_failure', 'order_waiting_time'] self.environment.environment.parameters.update({'TRANSP_AGENT_STATE': config_state}) self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD': config['reward']}) #self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD_INVALID_ACTION': config['reward_invalid']}) #self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD_OBJECTIVE_WEIGHTS': config['reward_weighted']}) self.environment.environment.parameters.update({'TRANSP_AGENT_MAX_INVALID_ACTIONS': config['max_invalid_actions']}) 
        self.environment.environment.parameters.update(
            {'TRANSP_AGENT_WAITING_TIME_ACTION': config['waiting_if_invalid_actions']})

        # num_episodes = list()
        final_reward = list()
        max_reward = list()
        rewards = list()

        # Evaluate the configuration over `budget` independent training runs.
        for _ in range(round(budget)):
            runner = Runner(agent=agent, environment=self.environment)
            # runner = Runner(agent='config/ppo2.json', environment=self.environment)

            # performance_threshold = runner.environment.max_episode_timesteps() - agent['reward_estimation']['horizon']

            # def callback(r, p):
            #     return True

            runner.run(num_episodes=NUM_EPISODES, use_tqdm=False)
            runner.close()

            # num_episodes.append(len(runner.episode_rewards))
            final_reward.append(float(np.mean(runner.episode_rewards[-20:], axis=0)))
            # 20-episode moving averages over the learning curve of this run.
            average_rewards = [
                float(np.mean(runner.episode_rewards[i: i + 20], axis=0))
                for i in range(len(runner.episode_rewards) - 20)
            ]
            max_reward.append(float(np.amax(average_rewards, axis=0)))
            rewards.append(list(runner.episode_rewards))

        # mean_num_episodes = float(np.mean(num_episodes, axis=0))
        mean_final_reward = float(np.mean(final_reward, axis=0))
        mean_max_reward = float(np.mean(max_reward, axis=0))

        # loss = mean_num_episodes - mean_final_reward - mean_max_reward
        loss = -mean_final_reward - mean_max_reward

        return dict(loss=loss, info=dict(rewards=rewards))
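# The compute() method above scores each configuration by the mean reward of the last
# 20 episodes plus the best 20-episode moving average, negated so that lower is better
# for the optimizer. A standalone sketch of that aggregation on a synthetic reward
# curve, using numpy only; the window of 20 episodes is taken from the code above.
import numpy as np

def score_run(episode_rewards, window=20):
    """Reproduce the per-run loss aggregation used in compute()."""
    rewards = np.asarray(episode_rewards, dtype=float)
    final_reward = float(np.mean(rewards[-window:]))
    moving_averages = [
        float(np.mean(rewards[i:i + window])) for i in range(len(rewards) - window)
    ]
    max_reward = float(np.amax(moving_averages))
    return -final_reward - max_reward

if __name__ == '__main__':
    synthetic = np.concatenate([np.linspace(-50.0, 10.0, 80), np.full(20, 12.0)])
    print(score_run(synthetic))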
def test_blogpost_introduction_runner(self): from .minimal_test import MinimalTest from tensorforce.agents import DQNAgent from tensorforce.execution import Runner environment = MinimalTest(specification={'int': ()}) network_spec = [dict(type='dense', size=32)] agent = DQNAgent(states=environment.states, actions=environment.actions, network=network_spec, memory=dict(type='replay', include_next_states=True, capacity=100), target_sync_frequency=50) runner = Runner(agent=agent, environment=environment) def episode_finished(runner): if runner.episode % 100 == 0: print(sum(runner.episode_rewards[-100:]) / 100) return runner.episode < 100 \ or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:]) # runner.run(episodes=1000, episode_finished=episode_finished) runner.run(episodes=10, episode_finished=episode_finished ) # Only 10 episodes for this test runner.close() ### Code block: next agent = DQNAgent(states=environment.states, actions=environment.actions, network=network_spec, memory=dict(type='replay', include_next_states=True, capacity=100), target_sync_frequency=50) # max_episodes = 1000 max_episodes = 10 # Only 10 episodes for this test max_timesteps = 2000 episode = 0 episode_rewards = list() while True: state = environment.reset() agent.reset() timestep = 0 episode_reward = 0 while True: action = agent.act(states=state) state, terminal, reward = environment.execute(action=action) agent.observe(terminal=terminal, reward=reward) timestep += 1 episode_reward += reward if terminal or timestep == max_timesteps: break episode += 1 episode_rewards.append(episode_reward) if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes: break agent.close() environment.close()
class TensorforceTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with Tensorforce."""

    def __init__(self, environment: TradingEnvironment, agent_spec: Dict = None, network_spec: Dict = None, **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A specification dictionary for the `Tensorforce` agent.
            network_spec: A specification dictionary for the `Tensorforce` agent's model network.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._environment = environment
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps', None)

        if agent_spec and network_spec:
            self._agent_spec = agent_spec
            self._network_spec = network_spec
            self._agent = Agent.from_spec(spec=agent_spec,
                                          kwargs=dict(network=network_spec,
                                                      states=environment.states,
                                                      actions=environment.actions))
            self._runner = Runner(agent=self._agent, environment=environment)

    @property
    def agent(self) -> Agent:
        """A Tensorforce `Agent` instance that will learn the strategy."""
        return self._agent

    @property
    def max_episode_timesteps(self) -> int:
        """The maximum timesteps per episode."""
        return self._max_episode_timesteps

    @max_episode_timesteps.setter
    def max_episode_timesteps(self, max_episode_timesteps: int):
        self._max_episode_timesteps = max_episode_timesteps

    def restore_agent(self, path: str, model_path: str = None):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
                The `.json` file extension will be automatically appended if not provided.
            model_path (optional): The `str` path of the file or directory the agent checkpoint
                is stored in. If not provided, the `model_path` will default to
                `{path_without_dot_json}/agent`.
        """
        path_with_ext = path if path.endswith('.json') else f'{path}.json'

        with open(path_with_ext) as json_file:
            spec = json.load(json_file)

        # json.load returns a plain dict, so the specifications are accessed by key.
        self._agent_spec = spec['agent']
        self._network_spec = spec['network']

        self._agent = Agent.from_spec(spec=self._agent_spec,
                                      kwargs=dict(network=self._network_spec,
                                                  states=self._environment.states,
                                                  actions=self._environment.actions))

        path_without_ext = path_with_ext.replace('.json', '')
        model_path = model_path or f'{path_without_ext}/agent'

        self._agent.restore_model(file=model_path)

        self._runner = Runner(agent=self._agent, environment=self._environment)

    def save_agent(self, path: str, model_path: str = None, append_timestep: bool = False):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
                The `.json` file extension will be automatically appended if not provided.
            model_path (optional): The `str` path of the directory to store the agent checkpoints
                in. If not provided, the `model_path` will default to
                `{path_without_dot_json}/agent`.
            append_timestep: Whether the timestep should be appended to the filename to prevent
                overwriting previous models. Defaults to `False`.
        """
        path_with_ext = path if path.endswith('.json') else f'{path}.json'

        spec = {'agent': self._agent_spec, 'network': self._network_spec}

        with open(path_with_ext, 'w') as json_file:
            json.dump(spec, json_file)

        path_without_ext = path_with_ext.replace('.json', '')
        model_path = model_path or f'{path_without_ext}/agent'

        if not os.path.exists(model_path):
            os.makedirs(model_path)

        # Honor the `append_timestep` argument instead of always appending.
        self._agent.save_model(directory=model_path, append_timestep=append_timestep)

    def _finished_episode_cb(self, runner: Runner) -> bool:
        n_episodes = runner.episode
        n_timesteps = runner.episode_timestep
        avg_reward = np.mean(runner.episode_rewards)

        print(f"Finished episode {n_episodes} after {n_timesteps} timesteps.")
        print(f"Average episode reward: {avg_reward}")

        return True

    def tune(self, steps: int = None, episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(self,
            steps: int = None,
            episodes: int = None,
            should_train: bool = False,
            episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        testing = not should_train

        self._runner.run(testing=testing,
                         num_timesteps=steps,
                         num_episodes=episodes,
                         max_episode_timesteps=self._max_episode_timesteps,
                         episode_finished=self._finished_episode_cb)
        self._runner.close()

        n_episodes = self._runner.episode
        n_timesteps = self._runner.timestep
        avg_reward = np.mean(self._runner.episode_rewards)

        print("Finished running strategy.")
        print(f"Total episodes: {n_episodes} ({n_timesteps} timesteps).")
        print(f"Average reward: {avg_reward}.")
def main(): parser = argparse.ArgumentParser(description='Tensorforce runner') parser.add_argument( 'agent', help='Agent (configuration JSON file, name, or library module)' ) parser.add_argument( 'environment', help='Environment (name, configuration JSON file, or library module)' ) # Agent arguments parser.add_argument( '-n', '--network', type=str, default=None, help='Network (configuration JSON file, name, or library module)' ) # Environment arguments parser.add_argument( '-l', '--level', type=str, default=None, help='Level or game id, like `CartPole-v1`, if supported' ) parser.add_argument( '--visualize', action='store_true', help='Visualize agent--environment interaction, if supported' ) parser.add_argument( '-i', '--import-modules', type=str, default=None, help='Import comma-separated modules required for environment' ) # Runner arguments parser.add_argument('-t', '--timesteps', type=int, default=None, help='Number of timesteps') parser.add_argument('-e', '--episodes', type=int, default=None, help='Number of episodes') parser.add_argument( '-m', '--max-episode-timesteps', type=int, default=None, help='Maximum number of timesteps per episode' ), parser.add_argument( '--mean-horizon', type=int, default=10, help='Number of timesteps/episodes for mean reward computation' ) parser.add_argument('-v', '--evaluation', action='store_true', help='Evaluation mode') parser.add_argument( '-s', '--save-best-agent', action='store_true', help='Save best-performing agent' ) # Logging arguments parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of repetitions') parser.add_argument( '-p', '--path', type=str, default=None, help='Logging path, directory plus filename without extension' ) parser.add_argument('--seaborn', action='store_true', help='Use seaborn') args = parser.parse_args() if args.import_modules is not None: for module in args.import_modules.split(','): importlib.import_module(name=module) if args.path is None: callback = None else: assert os.path.splitext(args.path)[1] == '' assert args.episodes is not None and args.visualize is not None rewards = [list() for _ in range(args.episodes)] timesteps = [list() for _ in range(args.episodes)] seconds = [list() for _ in range(args.episodes)] agent_seconds = [list() for _ in range(args.episodes)] def callback(r): rewards[r.episode - 1].append(r.episode_reward) timesteps[r.episode - 1].append(r.episode_timestep) seconds[r.episode - 1].append(r.episode_second) agent_seconds[r.episode - 1].append(r.episode_agent_second) return True if args.visualize: if args.level is None: environment = Environment.create(environment=args.environment, visualize=True) else: environment = Environment.create( environment=args.environment, level=args.level, visualize=True ) else: if args.level is None: environment = Environment.create(environment=args.environment) else: environment = Environment.create(environment=args.environment, level=args.level) for _ in range(args.repeat): agent_kwargs = dict() if args.network is not None: agent_kwargs['network'] = args.network if args.max_episode_timesteps is not None: assert environment.max_episode_timesteps() is None or \ environment.max_episode_timesteps() == args.max_episode_timesteps agent_kwargs['max_episode_timesteps'] = args.max_episode_timesteps agent = Agent.create(agent=args.agent, environment=environment, **agent_kwargs) runner = Runner(agent=agent, environment=environment) runner.run( num_timesteps=args.timesteps, num_episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, 
callback=callback, mean_horizon=args.mean_horizon, evaluation=args.evaluation # save_best_model=args.save_best_model ) runner.close() if args.path is not None: if not os.path.isdir(os.path.split(args.path)[0]): os.makedirs(os.path.split(args.path)[0], exist_ok=True) with open(args.path + '.json', 'w') as filehandle: filehandle.write( json.dumps(dict( rewards=rewards, timesteps=timesteps, seconds=seconds, agent_seconds=agent_seconds )) ) if args.seaborn: import seaborn as sns sns.set() xs = np.arange(len(rewards)) min_rewards = np.amin(rewards, axis=1) max_rewards = np.amax(rewards, axis=1) median_rewards = np.median(rewards, axis=1) plt.plot(xs, median_rewards, color='green', linewidth=2.0) plt.fill_between(xs, min_rewards, max_rewards, color='green', alpha=0.4) plt.xlabel('episodes') plt.ylabel('reward') plt.savefig(fname=(args.path + '.png'))
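# The runner above dumps per-episode rewards, timesteps and timings to '<path>.json'
# and can plot a min/median/max band over repetitions. A sketch that reloads such a
# file and recomputes the same band offline; 'results.json' is a hypothetical file
# produced via the --path argument, with one list of per-repetition values per episode.
import json
import numpy as np
import matplotlib.pyplot as plt

with open('results.json', 'r') as filehandle:
    data = json.load(filehandle)

rewards = np.asarray(data['rewards'])  # shape: (episodes, repetitions)
xs = np.arange(len(rewards))

plt.plot(xs, np.median(rewards, axis=1), color='green', linewidth=2.0)
plt.fill_between(xs, np.amin(rewards, axis=1), np.amax(rewards, axis=1), color='green', alpha=0.4)
plt.xlabel('episodes')
plt.ylabel('reward')
plt.savefig('results_replot.png')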
class TensorforceTradingStrategy(TradingStrategy): """A trading strategy capable of self tuning, training, and evaluating with Tensorforce.""" def __init__(self, environment: 'TradingEnvironment', agent_spec: any, save_best_agent: bool = False, **kwargs): """ Arguments: environment: A `TradingEnvironment` instance for the agent to trade within. agent: A `Tensorforce` agent or agent specification. save_best_agent (optional): The runner will automatically save the best agent kwargs (optional): Optional keyword arguments to adjust the strategy. """ self._max_episode_timesteps = kwargs.get('max_episode_timesteps', False) self._environment = Environment.create( environment='gym', level=environment, max_episode_timesteps=self._max_episode_timesteps) self._agent = Agent.create(agent=agent_spec, environment=self._environment) self._runner = Runner(agent=self._agent, environment=self._environment, save_best_agent=save_best_agent) @property def agent(self) -> Agent: """A Tensorforce `Agent` instance that will learn the strategy.""" return self._agent @property def max_episode_timesteps(self) -> int: """The maximum timesteps per episode.""" return self._max_episode_timesteps @max_episode_timesteps.setter def max_episode_timesteps(self, max_episode_timesteps: int): self._max_episode_timesteps = max_episode_timesteps def restore_agent(self, directory: str, filename: str = None): """Deserialize the strategy's learning agent from a file. Arguments: directory: The `str` path of the directory the agent checkpoint is stored in. filename (optional): The `str` path of the file the agent specification is stored in. The `.json` file extension will be automatically appended if not provided. """ self._agent = Agent.load(directory, filename=filename) self._runner = Runner(agent=self._agent, environment=self._environment) def save_agent(self, directory: str, filename: str = None, append_timestep: bool = False): """Serialize the learning agent to a file for restoring later. Arguments: directory: The `str` path of the directory the agent checkpoint is stored in. filename (optional): The `str` path of the file the agent specification is stored in. The `.json` file extension will be automatically appended if not provided. append_timestep: Whether the timestep should be appended to filename to prevent overwriting previous models. Defaults to `False`. """ self._agent.save(directory=directory, filename=filename, append_timestep=append_timestep) def _finished_episode_cb(self, runner: Runner) -> bool: n_episodes = runner.episodes n_timesteps = runner.episode_timesteps avg_reward = np.mean(runner.episode_rewards) print("Finished episode {} after {} timesteps.".format( n_episodes, n_timesteps)) print("Average episode reward: {})".format(avg_reward)) return True def tune(self, steps: int = None, episodes: int = None, callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame: raise NotImplementedError def run( self, steps: int = None, episodes: int = None, evaluation: bool = False, episode_callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame: self._runner.run(evaluation=evaluation, num_timesteps=steps, num_episodes=episodes, callback=episode_callback) n_episodes = self._runner.episodes n_timesteps = self._runner.timesteps avg_reward = np.mean(self._runner.episode_rewards) print("Finished running strategy.") print("Total episodes: {} ({} timesteps).".format( n_episodes, n_timesteps)) print("Average reward: {}.".format(avg_reward)) self._runner.close() return self._environment.environment._exchange._performance
def main(max_timesteps): max_episodes = None #max_timesteps = 86400000000*days env = real_adapter(pong) network_spec = [ #dict(type='flatten'), dict(type='dense', size=11, activation='tanh'), #dict(type='dense', size=20, activation='tanh'), #dict(type='dense', size=32, activation='tanh'), ] exploration = dict(type='epsilon_decay', timesteps=max_timesteps) summarizer = dict( directory="./models/"+str(datetime.now()).replace(' ', ''), steps=10000, seconds=None, labels=[ #'rewards', #'actions', 'inputs', 'gradients', 'configuration', ], meta_dict=dict( description='July 2: Trying 11 node hidden layer.', layers=str(network_spec), timesteps=max_timesteps, exploration=exploration, ), ) agent = NAFAgent( states=env.states, actions=env.actions, network=network_spec, #actions_exploration=exploration, #summarizer=summarizer, #batch_size=64 ) runner = Runner(agent, env) report_episodes = 1 #global prev global prev prev = 0 def episode_finished(r): global prev if r.episode % report_episodes == 0: #print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep-prev)) #print("Episode reward: {}".format(r.episode_rewards[-1])) print(r.episode_rewards[-1]) prev = r.timestep #print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) return True print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env)) runner.run(num_episodes=max_episodes, num_timesteps=max_timesteps, max_episode_timesteps=None, episode_finished=episode_finished) agent.save_model(directory='./results/NAF/'+str(datetime.now()).replace(' ', '')+'/model') runner.close() print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
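# The summarizer directory and the save path above are stamped with
# str(datetime.now()).replace(' ', ''), which leaves ':' characters in the path and
# fails on some filesystems (notably Windows). A small sketch of a safer timestamped
# directory helper using strftime; the base directories are the ones assumed above.
import os
from datetime import datetime

def timestamped_dir(base):
    """Create and return a filesystem-safe, timestamped subdirectory of `base`."""
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    path = os.path.join(base, stamp)
    os.makedirs(path, exist_ok=True)
    return path

summary_dir = timestamped_dir('./models')       # e.g. for the summarizer directory
model_dir = timestamped_dir('./results/NAF')    # e.g. for agent.save_model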
class UnittestBase(object): """ Unit-test base class. """ # Unittest num_updates = None num_episodes = None num_timesteps = None # Environment timestep_range = (1, 5) states = dict(bool_state=dict(type='bool', shape=(1, )), int_state=dict(type='int', shape=(2, ), num_values=4), float_state=dict(type='float', shape=(1, 1, 2)), bounded_state=dict(type='float', shape=(), min_value=-0.5, max_value=0.5)) actions = dict(bool_action=dict(type='bool', shape=(1, )), int_action=dict(type='int', shape=(2, ), num_values=4), float_action=dict(type='float', shape=(1, 1)), bounded_action=dict(type='float', shape=(2, ), min_value=-0.5, max_value=0.5)) # Exclude action types exclude_bool_action = False exclude_int_action = False exclude_float_action = False exclude_bounded_action = False # Agent agent = dict(update=4, network=dict(type='auto', size=8, internal_rnn=2), objective='policy_gradient', reward_estimation=dict(horizon=2)) # Tensorforce config require_observe = False require_all = False def start_tests(self, name=None): """ Start unit-test method. """ if name is None: sys.stdout.write('\n{} {}: '.format( datetime.now().strftime('%H:%M:%S'), self.__class__.__name__[4:])) else: sys.stdout.write('\n{} {} ({}): '.format( datetime.now().strftime('%H:%M:%S'), self.__class__.__name__[4:], name)) sys.stdout.flush() def finished_test(self, assertion=None): """ Finished unit-test. """ if assertion is None: assertion = True else: self.assertTrue(expr=assertion) if assertion: sys.stdout.write('.') sys.stdout.flush() def prepare(self, environment=None, timestep_range=None, states=None, actions=None, exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False, exclude_bounded_action=False, require_observe=False, require_all=False, **agent): """ Generic unit-test preparation. """ Layer.layers = None if environment is None: if states is None: states = deepcopy(self.__class__.states) if actions is None: actions = deepcopy(self.__class__.actions) if exclude_bool_action or self.__class__.exclude_bool_action: actions.pop('bool_action') if exclude_int_action or self.__class__.exclude_int_action: actions.pop('int_action') if exclude_float_action or self.__class__.exclude_float_action: actions.pop('float_action') if exclude_bounded_action or self.__class__.exclude_bounded_action: actions.pop('bounded_action') if timestep_range is None: timestep_range = self.__class__.timestep_range environment = UnittestEnvironment( states=states, actions=actions, timestep_range=timestep_range, ) elif timestep_range is not None: raise TensorforceError.unexpected() environment = Environment.create(environment=environment) for key, value in self.__class__.agent.items(): if key not in agent: agent[key] = value if self.__class__.require_all or require_all: config = None elif self.__class__.require_observe or require_observe: config = dict(api_functions=['reset', 'act', 'observe']) else: config = dict(api_functions=['reset', 'act']) agent = Agent.create(agent=agent, environment=environment, config=config) return agent, environment def unittest(self, num_updates=None, num_episodes=None, num_timesteps=None, environment=None, timestep_range=None, states=None, actions=None, exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False, exclude_bounded_action=False, require_observe=False, require_all=False, **agent): """ Generic unit-test. 
""" agent, environment = self.prepare( environment=environment, timestep_range=timestep_range, states=states, actions=actions, exclude_bool_action=exclude_bool_action, exclude_int_action=exclude_int_action, exclude_float_action=exclude_float_action, exclude_bounded_action=exclude_bounded_action, require_observe=require_observe, require_all=require_all, **agent) self.runner = Runner(agent=agent, environment=environment) assert (num_updates is not None) + (num_episodes is not None) + \ (num_timesteps is not None) <= 1 if num_updates is None and num_episodes is None and num_timesteps is None: num_updates = self.__class__.num_updates num_episodes = self.__class__.num_episodes num_timesteps = self.__class__.num_timesteps if num_updates is None and num_episodes is None and num_timesteps is None: num_updates = 2 assert (num_updates is not None) + (num_episodes is not None) + \ (num_timesteps is not None) == 1 evaluation = not any([ require_all, require_observe, self.__class__.require_all, self.__class__.require_observe ]) self.runner.run(num_episodes=num_episodes, num_timesteps=num_timesteps, num_updates=num_updates, max_episode_timesteps=agent.max_episode_timesteps, use_tqdm=False, evaluation=evaluation) self.runner.close() self.finished_test()
def main(args): version = 'v1' episodes = args.episodes visualize = args.visualize config = ffa_v0_fast_env() env = Pomme(**config["env_kwargs"]) env.seed(0) agent = PPOAgent( states=dict(type='float', shape=(11, 11, 12)), actions=dict(type='int', num_actions=env.action_space.n), network=[ # (9, 9, 12) dict(type='conv2d', size=12, window=3, stride=1), # (7, 7, 8) dict(type='conv2d', size=8, window=3, stride=1), # (5, 5, 4) dict(type='conv2d', size=4, window=3, stride=1), # (100) dict(type='flatten'), dict(type='dense', size=64, activation='relu'), dict(type='dense', size=16, activation='relu'), ], batching_capacity=1000, step_optimizer=dict(type='adam', learning_rate=1e-4)) if os.path.exists(os.path.join('models', version, 'checkpoint')): agent.restore_model(directory=os.path.join('models', version)) agents = [] for agent_id in range(3): # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"]))) # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"]))) agents.append( SimpleAgent(config["agent"](agent_id, config["game_type"]))) agent_id += 1 agents.append( TensorforceAgent(config["agent"](agent_id, config["game_type"]))) env.set_agents(agents) env.set_training_agent(agents[-1].agent_id) env.set_init_game_state(None) wrapped_env = WrappedEnv(env, agent, visualize) runner = Runner(agent=agent, environment=wrapped_env) try: runner.run(episodes=episodes, max_episode_timesteps=100) except Exception as e: raise e finally: agent.save_model(directory=os.path.join('models', version, 'agent')) win_count = len( list(filter(lambda reward: reward == 1, runner.episode_rewards))) print('Stats: ') print(f' runner.episode_rewards = {runner.episode_rewards}') print(f' win count = {win_count}') try: runner.close() except AttributeError as e: raise e
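# The network comments above annotate the expected feature-map shapes for the
# (11, 11, 12) observation. A quick arithmetic check of those annotations, assuming
# 'valid' (unpadded) convolutions, i.e. output = (input - window) // stride + 1;
# pure Python, no framework calls.
def conv_out(size, window, stride):
    """Output side length of a valid (unpadded) convolution."""
    return (size - window) // stride + 1

side = 11
for window, stride, channels in [(3, 1, 12), (3, 1, 8), (3, 1, 4)]:
    side = conv_out(side, window, stride)
    print((side, side, channels))   # (9, 9, 12) -> (7, 7, 8) -> (5, 5, 4)

print(side * side * 4)              # flattened size: 100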
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="Id of the Gym environment") parser.add_argument('-a', '--agent', help="Agent configuration file") parser.add_argument('-n', '--network', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('--monitor', help="Save results to this directory") parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) environment = OpenAIGym(gym_id=args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video) if args.agent is not None: with open(args.agent, 'r') as fp: agent = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network is not None: with open(args.network, 'r') as fp: network = json.load(fp=fp) else: network = None logger.info("No network configuration provided.") agent = Agent.from_spec(spec=agent, kwargs=dict(states=environment.states, actions=environment.actions, network=network)) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent) runner = Runner(agent=agent, environment=environment, repeat_actions=1) if args.debug: # TODO: Timestep-based reporting report_episodes = 1 else: report_episodes = 100 logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environment)) def episode_finished(r): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}" .format(r.agent.episode, r.episode_timestep, steps_per_second)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {:0.2f}".format( sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {:0.2f}".format( sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) return True runner.run(timesteps=args.timesteps, episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, deterministic=args.deterministic, episode_finished=episode_finished) runner.close() logger.info("Learning finished. Total episodes: {ep}".format( ep=runner.agent.episode))
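# The episode_finished callback above reports steps per second and running means over
# the last 100/500 episodes, guarding the divisor with min(500, len(...)). A standalone
# sketch of the same reporting math for post-processing a saved reward list; the
# function and argument names are hypothetical.
import numpy as np

def report(episode_rewards, elapsed_seconds, total_timesteps):
    """Return steps/second and the guarded 100- and 500-episode reward means."""
    steps_per_second = total_timesteps / max(elapsed_seconds, 1e-8)
    last_100 = float(np.mean(episode_rewards[-100:])) if episode_rewards else float('nan')
    last_500 = float(np.mean(episode_rewards[-500:])) if episode_rewards else float('nan')
    return steps_per_second, last_100, last_500

print(report([1.0, 2.0, 3.0], elapsed_seconds=1.5, total_timesteps=300))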
def main(): parser = argparse.ArgumentParser(description="Playground Flags.") parser.add_argument("--game", default="pommerman", help="Game to choose.") parser.add_argument("--config", default="PommeFFA-v0", help="Configuration to execute. See env_ids in " "configs.py for options.") parser.add_argument("--agents", default="tensorforce::ppo,test::agents.SimpleAgent," "test::agents.SimpleAgent,test::agents.SimpleAgent", help="Comma delineated list of agent types and docker " "locations to run the agents.") parser.add_argument("--agent_env_vars", help="Comma delineated list of agent environment vars " "to pass to Docker. This is only for the Docker Agent." " An example is '0:foo=bar:baz=lar,3:foo=lam', which " "would send two arguments to Docker Agent 0 and one to" " Docker Agent 3.", default="") parser.add_argument("--record_pngs_dir", default=None, help="Directory to record the PNGs of the game. " "Doesn't record if None.") parser.add_argument("--record_json_dir", default=None, help="Directory to record the JSON representations of " "the game. Doesn't record if None.") parser.add_argument("--render", default=True, help="Whether to render or not. Defaults to True.") parser.add_argument("--game_state_file", default=None, help="File from which to load game state. Defaults to " "None.") args = parser.parse_args() config = args.config record_pngs_dir = args.record_pngs_dir record_json_dir = args.record_json_dir agent_env_vars = args.agent_env_vars game_state_file = args.game_state_file # TODO: After https://github.com/MultiAgentLearning/playground/pull/40 # this is still missing the docker_env_dict parsing for the agents. agents = [ helpers.make_agent_from_string(agent_string, agent_id + 1000) for agent_id, agent_string in enumerate(args.agents.split(",")) ] env = make(config, agents, game_state_file) training_agent = None for agent in agents: if type(agent) == TensorForceAgent: training_agent = agent env.set_training_agent(agent.agent_id) break if args.record_pngs_dir: assert not os.path.isdir(args.record_pngs_dir) os.makedirs(args.record_pngs_dir) if args.record_json_dir: assert not os.path.isdir(args.record_json_dir) os.makedirs(args.record_json_dir) # Create a Proximal Policy Optimization agent agent = training_agent.initialize(env) atexit.register(functools.partial(clean_up_agents, agents)) wrapped_env = WrappedEnv(env, visualize=args.render) runner = Runner(agent=agent, environment=wrapped_env) runner.run(episodes=10, max_episode_timesteps=2000) print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times) try: runner.close() except AttributeError as e: pass
def main(): parser = argparse.ArgumentParser() parser.add_argument('--mode', help="ID of the game mode") parser.add_argument('--hide', dest='hide', action='store_const', const=True, default=False, help="Hide output window") parser.add_argument('-a', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! environment = MazeExplorer(mode_id=args.mode, visible=not args.hide) if args.agent_config is not None: with open(args.agent_config, 'r') as fp: agent_config = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network_spec is not None: with open(args.network_spec, 'r') as fp: network = json.load(fp=fp) else: network = None logger.info("No network configuration provided.") agent = Agent.from_spec(spec=agent_config, kwargs=dict(states=environment.states, actions=environment.actions, network=network)) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError( "Cannot save agent to dir {} ()".format(save_dir)) runner = Runner(agent=agent, environment=environment, repeat_actions=1) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: sps = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}" .format(ep=r.episode, ts=r.timestep, sps=sps)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format( sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format( sum(r.episode_rewards[-100:]) / 100)) return True logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) runner.close() logger.info( "Learning finished. Total episodes: {ep}".format(ep=runner.episode)) environment.close()
def main(argv): logging_basicConfig(level=INFO) logger = getLogger(__file__) logger.setLevel(INFO) environment = OpenAIGym( gym_id='MoveToBeacon-bbueno5000-v0', monitor=FLAGS.monitor, monitor_safe=FLAGS.monitor_safe, monitor_video=FLAGS.monitor_video, visualize=FLAGS.visualize) # if FLAGS.agent_config is not None: # with open(FLAGS.agent_config, 'r') as fp: # agent_config = json.load(fp=fp) # else: # raise TensorForceError( # "No agent configuration provided.") # if FLAGS.network is not None: # with open(FLAGS.network, 'r') as fp: # network = json.load(fp=fp) # else: # network = None # logger.info( # "No network configuration provided.") network_spec = [ dict(type='flatten'), dict(type='dense', size=32), dict(type='dense', size=32) ] agent = PPOAgent( states=environment.states, actions=environment.actions, network=network_spec ) if FLAGS.load: load_dir = path.dirname(FLAGS.load) if not path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format(load_dir)) agent.restore_model(FLAGS.load) if FLAGS.save: save_dir = path.dirname(FLAGS.save) if not path.isdir(save_dir): try: mkdir(save_dir, 0o755) except OSError: raise OSError( "Cannot save agent to dir {} ()".format(save_dir)) if FLAGS.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent) runner = Runner( agent=agent, environment=environment, repeat_actions=1) if FLAGS.debug: report_episodes = 1 else: report_episodes = 100 logger.info( "Starting {agent} for Environment {env}".format( agent=agent, env=environment)) def episode_finished(r, id_): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time() - r.start_time) logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format( r.agent.episode, r.episode_timestep, steps_per_second)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {:0.2f}".format( sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {:0.2f}".format( sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) if FLAGS.save and FLAGS.save_episodes is not None and not r.episode % FLAGS.save_episodes: logger.info("Saving agent to {}".format(FLAGS.save)) r.agent.save_model(FLAGS.save) return True runner.run( num_timesteps=FLAGS.timesteps, num_episodes=FLAGS.num_episodes, max_episode_timesteps=FLAGS.max_episode_timesteps, deterministic=FLAGS.deterministic, episode_finished=episode_finished, testing=FLAGS.test, sleep=FLAGS.sleep) runner.close() logger.info("Learning completed.") logger.info("Total episodes: {ep}".format(ep=runner.agent.episode))
def main(): '''CLI interface to bootstrap taining''' parser = argparse.ArgumentParser(description="Playground Flags.") parser.add_argument("--game", default="pommerman", help="Game to choose.") parser.add_argument("--config", default="PommeFFACompetition-v0", help="Configuration to execute. See env_ids in " "configs.py for options.") parser.add_argument("--agents", default="tensorforce::ppo,test::agents.SimpleAgent," "test::agents.SimpleAgent,test::agents.SimpleAgent", help="Comma delineated list of agent types and docker " "locations to run the agents.") parser.add_argument("--agent_env_vars", help="Comma delineated list of agent environment vars " "to pass to Docker. This is only for the Docker Agent." " An example is '0:foo=bar:baz=lar,3:foo=lam', which " "would send two arguments to Docker Agent 0 and one to" " Docker Agent 3.", default="") parser.add_argument("--record_pngs_dir", default=None, help="Directory to record the PNGs of the game. " "Doesn't record if None.") parser.add_argument("--record_json_dir", default=None, help="Directory to record the JSON representations of " "the game. Doesn't record if None.") parser.add_argument("--render", default=False, action='store_true', help="Whether to render or not. Defaults to False.") parser.add_argument("--game_state_file", default=None, help="File from which to load game state. Defaults to " "None.") parser.add_argument("--checkpoint", default="models/ppo", help="Directory where checkpoint file stored to.") parser.add_argument("--num_of_episodes", default="10", help="Number of episodes") parser.add_argument("--max_timesteps", default="2000", help="Number of steps") args = parser.parse_args() config = args.config record_pngs_dir = args.record_pngs_dir record_json_dir = args.record_json_dir agent_env_vars = args.agent_env_vars game_state_file = args.game_state_file checkpoint = args.checkpoint num_of_episodes = int(args.num_of_episodes) max_timesteps = int(args.max_timesteps) # TODO: After https://github.com/MultiAgentLearning/playground/pull/40 # this is still missing the docker_env_dict parsing for the agents. agents = [ helpers.make_agent_from_string(agent_string, agent_id + 1000) for agent_id, agent_string in enumerate(args.agents.split(",")) ] env = make(config, agents, game_state_file) training_agent = None for agent in agents: if type(agent) == TensorForceAgent: training_agent = agent env.set_training_agent(agent.agent_id) break if args.record_pngs_dir: assert not os.path.isdir(args.record_pngs_dir) os.makedirs(args.record_pngs_dir) if args.record_json_dir: assert not os.path.isdir(args.record_json_dir) os.makedirs(args.record_json_dir) # Create a Proximal Policy Optimization agent agent = training_agent.initialize(env) atexit.register(functools.partial(clean_up_agents, agents)) wrapped_env = WrappedEnv(env, visualize=args.render) runner = Runner(agent=agent, environment=wrapped_env) runner.run(episodes=num_of_episodes, max_episode_timesteps=max_timesteps) print("Stats: ", runner.episode_rewards[-30:], runner.episode_timesteps, runner.episode_times) agent.save_model(checkpoint) rewards = runner.episode_rewards win = rewards.count(1) lose = rewards.count(-1) draw = rewards.count(0) total = win + lose + draw ratio = round((win / total) * 100.0, 2) print("Results ({}%) = Win({}), Lose({}), Draw({})".format( ratio, win, lose, draw)) try: runner.close() except AttributeError as e: pass
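# The script above derives a win percentage by counting rewards of 1, -1 and 0 in
# runner.episode_rewards. A compact sketch of the same bookkeeping with
# collections.Counter, guarding against an empty reward list; purely illustrative.
from collections import Counter

def win_stats(episode_rewards):
    """Summarise +1/-1/0 episode outcomes as win/lose/draw counts and a win ratio."""
    counts = Counter(episode_rewards)
    win, lose, draw = counts[1], counts[-1], counts[0]
    total = win + lose + draw
    ratio = round((win / total) * 100.0, 2) if total else 0.0
    return dict(win=win, lose=lose, draw=draw, ratio=ratio)

print(win_stats([1, -1, 0, 1, 1]))   # {'win': 3, 'lose': 1, 'draw': 1, 'ratio': 60.0}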
def main(): # SET BASIC PARAMETERS start_time = time.time() random_seed = 21 agent_save_period = 500 visualize_period = 1 run_number = 965 load_agent = False agent_filename = '371-P33-27-PPO-2000' to_visualize = False # Set logging level logging.basicConfig(level=logging.INFO) logger = logging.getLogger() logger.setLevel(logging.INFO) # if args.import_modules is not None: # for module in args.import_modules.split(','): # importlib.import_module(name=module) environment = Environment.create(environment='gym', level='EnvTestContinuousR-v2', visualize=to_visualize) # Set random seed for environment environment.environment.env.seed(random_seed) environment.environment.env.set_reward(3) environment.environment.env.set_random(3) environment.environment.env.set_reward_scale(6) # Initialize Agent-Network-Model objects with open( 'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\examples\\configs\\ppo-new3.json', 'r') as fp: agentSpec = json.load(fp=fp) with open( 'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\examples\\configs\\mlp2_network-new.json', 'r') as fp: network = json.load(fp=fp) # agentSpec['update_mode'].update(batch_size=24) # agentSpec['update_mode'].update(frequency=24) #agentSpec['baseline']['sizes'] = [512,512] agentSpec['optimization_steps'] = 9 agentSpec['network']['layers'][0]['size'] = 128 agentSpec['network']['layers'][1]['size'] = 129 agentSpec['critic_network']['layers'][0]['size'] = 126 agentSpec['critic_network']['layers'][1]['size'] = 127 agentSpec['batch_size'] = 13 agentSpec['subsampling_fraction'] = 0.8 agentSpec['critic_optimizer']['num_steps'] = 11 agentSpec['likelihood_ratio_clipping'] = 0.2 # network[0].update(size=512) # network[1].update(size=512) # agentSpec['network']['layers'] = network # agentSpec['critic_network']['layers'] = network agent = Agent.create( max_episode_timesteps=3000, agent=agentSpec, environment=environment, seed=random_seed # kwargs=dict( # states=environment.states, # actions=environment.actions, # network=network, # #random_seed=random_seed ) agent.initialize() # print("Agent memory ", agent.memory['capacity']) # print("Agent baseline steps", agent.baseline_optimizer['num_steps']) # print("Agent optimizer steps", agent.optimizer['num_steps']) if load_agent: agent.restore_model( directory= 'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave', file=agent_filename) runner = Runner(agent=agent, environment=environment) # logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) # Naming variables nNum = str(run_number).zfill(3) task = environment.environment.env.task if task == 'LIFT': nTask = 'L' else: nTask = 'P' nReward = environment.environment.env.reward_level nRandom = environment.environment.env.rand_level nSeed = str(random_seed).zfill(2) nAlg = 'PPO' nName = ("{}-{}{}{}-{}-{}".format(nNum, nTask, nReward, nRandom, nSeed, nAlg)) def episode_finished(r, id_=None): # if r.episode == 1: # r.agent.restore_model('C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave') save_period = 5 if r.episodes % visualize_period == 0: if to_visualize: environment.visualize = True # Set to true to visualize else: environment.visualize = False if r.episodes % save_period == 0: with open( 'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\{}.csv' .format(nName), 'a+') as csv: for reward in r.episode_rewards[-save_period:]: csv.write("{:2.2f}\n".format(reward)) # print("\nSaving, yo!") if r.episodes == 1 or (r.episodes % agent_save_period == 0): logger.info("\nSaving agent to {} at 
episode {}".format( 'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave\\{}' .format(nName), r.episodes)) # r.agent.save( # directory='C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave\\{}{}'.format(nName, r.episodes), # append_timestep=False) return True def episode_finish(r, id_=None): print(r) runner.run( num_episodes=2000, num_timesteps=10000000, max_episode_timesteps=500, num_repeat_actions=1, # Callback callback=episode_finished, callback_episode_frequency=1, callback_timestep_frequency=None, # Tqdm use_tqdm=True, mean_horizon=100, # Evaluation evaluation=False, evaluation_callback=None, evaluation_frequency=None, max_evaluation_timesteps=None, num_evaluation_iterations=0) runner.close() logger.info("Learning finished. Total episodes: {ep}".format( ep=runner.agent.episode))