def __init__( self, states_spec, actions_spec, network_spec, device=None, session_config=None, scope='dqfd', saver_spec=None, summary_spec=None, distributed_spec=None, optimizer=None, discount=0.99, normalize_rewards=False, variable_noise=None, distributions_spec=None, entropy_regularization=None, target_sync_frequency=10000, target_update_weight=1.0, huber_loss=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=32, memory=None, first_update=10000, update_frequency=4, repeat_update=1, expert_margin=0.5, supervised_weight=0.1, demo_memory_capacity=10000, demo_sampling_ratio=0.2 ): """ Deep Q-learning from demonstration (DQFD) agent ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to pre-train from demonstration data in combination with a supervised loss. Args: states_spec: Dict containing at least one state definition. In the case of a single state, keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state is a dict itself with a unique name as its key. actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more. network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments such as activation or regularisation. Full examples are in the examples/configs folder. device: Device string specifying model device. session_config: optional tf.ConfigProto with additional desired session configurations scope: TensorFlow scope, defaults to agent name (e.g. `dqn`). saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`. Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and `evolutionary`. Consult the optimizer test or example configurations for more. discount: Float specifying reward discount factor. normalize_rewards: Boolean flag specifying whether to normalize rewards, default False. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). distributions_spec: Optional dict specifying action distributions to override default distribution choices. Must match action names. entropy_regularization: Optional positive float specifying an entropy regularization value. target_sync_frequency: Interval between optimization calls synchronizing the target network. target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network. huber_loss: Optional flat specifying Huber-loss clipping. preprocessing: Optional list of preprocessors (e.g. 
`image_resize`, `grayscale`) to apply to state. Each preprocessor is a dict containing a type and optional necessary arguments. exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise) and arguments. reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing. batched_observe: Optional int specifying how many observe calls are batched into one session run. Without batching, throughput will be lower because every `observe` triggers a session invocation to update rewards in the graph. batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size. memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`. first_update: Int describing at which time step the first update is performed. Should be larger than batch size. update_frequency: Int specifying number of observe steps to perform until an update is executed. repeat_update: Int specifying how many update steps are performed per update, where each update step implies sampling a batch from the memory and passing it to the model. expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other Q-values. supervised_weight: Weight of supervised loss term. demo_memory_capacity: Int describing capacity of expert demonstration memory. demo_sampling_ratio: Runtime sampling ratio of expert data. """ if network_spec is None: raise TensorForceError("No network_spec provided.") if optimizer is None: self.optimizer = dict( type='adam', learning_rate=1e-3 ) else: self.optimizer = optimizer if memory is None: memory = dict( type='replay', capacity=100000 ) else: self.memory = memory self.network_spec = network_spec self.device = device self.session_config = session_config self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.target_sync_frequency = target_sync_frequency self.target_update_weight = target_update_weight self.huber_loss = huber_loss # DQFD always uses double dqn, which is a required key for a q-model. self.double_q_model = True self.target_sync_frequency = target_sync_frequency self.demo_memory_capacity = demo_memory_capacity self.expert_margin = expert_margin self.supervised_weight = supervised_weight # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p) self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio)) assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \ 'demo_batch_size is positive. (Calculated {} based on current' \ ' parameters)'.format(self.demo_batch_size) # This is the demonstration memory that we will fill with observations before starting # the main training loop super(DQFDAgent, self).__init__( states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe, batch_size=batch_size, memory=memory, first_update=first_update, update_frequency=update_frequency, repeat_update=repeat_update ) self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
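# Illustrative arithmetic (not part of the original source) for the demo batch size formula
# documented above, using the constructor defaults batch_size=32 and demo_sampling_ratio=0.2:
# p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
batch_size = 32
demo_sampling_ratio = 0.2
demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
assert demo_batch_size == 8  # 0.2 * 32 / 0.8 = 8 expert samples mixed into each update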
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    args = parser.parse_args()

    # Network as a list of layers, from the quickstart in the docs
    # (based on mlp2_embedding_network.json).
    network_spec = [
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        },
        {
            "type": "dense",
            "size": 32
            # "activation": "relu"
        }
    ]

    DATAPATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    observedFile = os.path.join(DATAPATH, r"prnio.int")
    infoFile = os.path.join(DATAPATH, r"prnio.cfl")
    environment = PycrysfmlEnvironment(observedFile, infoFile)

    # Get agent configuration
    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
        )
    )

    # Use this line to restore a pre-trained agent
    # agent.restore_model(file="/mnt/storage/deepQmodel_chisq")

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    rewardsLog = []
    steps = []

    def episode_finished(r):
        if r.episode % 10 == 0:
            rewardsLog.append(r.episode_rewards[-1])
            steps.append(r.episode)
        if r.episode % 50 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            with open("/mnt/storage/trainingLog", "a") as log_file:
                log_file.write("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}\n".format(
                    ep=r.episode, ts=r.timestep, sps=sps))
                log_file.write("Episode reward: {}\n".format(r.episode_rewards[-1]))
                log_file.write("Episode timesteps: {}\n".format(r.episode_timestep))
                log_file.write("Average of last 500 rewards: {}\n".format(
                    sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
                log_file.write("Average of last 100 rewards: {}\n".format(
                    sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
            agent.save_model(directory="/mnt/storage/deepQmodel_simpleA_stdreward", append_timestep=False)
        return True

    runner.run(
        timesteps=60000000,
        episodes=5000,
        max_episode_timesteps=1000,
        deterministic=False,
        episode_finished=episode_finished
    )

    # Graph rewards
    plt.scatter(steps, rewardsLog)
    plt.savefig('/mnt/storage/rewardLog_simpleA_stdreward.png')

    runner.close()
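# Hedged sketch (not from the original repository) of the kind of JSON agent configuration
# this script expects via --agent-config; the exact keys depend on the TensorForce version
# and the chosen agent type, so treat these values as placeholders only.
example_agent_config = {
    "type": "ppo_agent",
    "batch_size": 256,
    "step_optimizer": {"type": "adam", "learning_rate": 1e-4},
    "discount": 0.99
}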
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="Id of the Gym environment") parser.add_argument('-a', '--agent', help="Agent configuration file") parser.add_argument('-n', '--network', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") parser.add_argument('-d', '--deterministic', action='store_true', help="Choose actions deterministically") parser.add_argument('-M', '--mode', choices=('tmux', 'child'), default='tmux', help="Starter mode") parser.add_argument('-W', '--num-workers', type=int, default=1, help="Number of worker agents") parser.add_argument('-C', '--child', action='store_true', help="Child process") parser.add_argument('-P', '--parameter-server', action='store_true', help="Parameter server") parser.add_argument('-I', '--task-index', type=int, default=0, help="Task index") parser.add_argument('-K', '--kill', action='store_true', help="Kill runners") parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory") parser.add_argument('-D', '--debug', action='store_true', help="Show debug outputs") args = parser.parse_args() session_name = 'OpenAI-' + args.gym_id shell = '/bin/bash' kill_cmds = [ "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format( 12222 + args.num_workers), "tmux kill-session -t {}".format(session_name), ] if args.kill: os.system("\n".join(kill_cmds)) return 0 if not args.child: # start up child processes target_script = os.path.abspath(inspect.stack()[0][1]) def wrap_cmd(session, name, cmd): if isinstance(cmd, list): cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd) if args.mode == 'tmux': return 'tmux send-keys -t {}:{} {} Enter'.format( session, name, shlex_quote(cmd)) elif args.mode == 'child': return '{} > {}/{}.{}.out 2>&1 & echo kill $! 
>> {}/kill.sh'.format( cmd, args.logdir, session, name, args.logdir) def build_cmd(ps, index): cmd_args = [ # 'CUDA_VISIBLE_DEVICES=', sys.executable, target_script, args.gym_id, '--agent', os.path.join(os.getcwd(), args.agent), '--network', os.path.join(os.getcwd(), args.network), '--num-workers', args.num_workers, '--child', '--task-index', index ] if args.episodes is not None: cmd_args.append('--episodes') cmd_args.append(args.episodes) if args.timesteps is not None: cmd_args.append('--timesteps') cmd_args.append(args.timesteps) if args.max_episode_timesteps is not None: cmd_args.append('--max-episode-timesteps') cmd_args.append(args.max_episode_timesteps) if args.deterministic: cmd_args.append('--deterministic') if ps: cmd_args.append('--parameter-server') if args.debug: cmd_args.append('--debug') return cmd_args if args.mode == 'tmux': cmds = kill_cmds + [ 'tmux new-session -d -s {} -n ps'.format(session_name) ] elif args.mode == 'child': cmds = [ 'mkdir -p {}'.format(args.logdir), 'rm -f {}/kill.sh'.format(args.logdir), 'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir), 'chmod +x {}/kill.sh'.format(args.logdir) ] cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0))) for i in xrange(args.num_workers): name = 'worker{}'.format(i) if args.mode == 'tmux': cmds.append('tmux new-window -t {} -n {} -d {}'.format( session_name, name, shell)) cmds.append( wrap_cmd(session_name, name, build_cmd(ps=False, index=i))) # add one PS call # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell)) print("\n".join(cmds)) os.system("\n".join(cmds)) return 0 ps_hosts = ['127.0.0.1:{}'.format(12222)] worker_hosts = [] port = 12223 for _ in range(args.num_workers): worker_hosts.append('127.0.0.1:{}'.format(port)) port += 1 cluster = {'ps': ps_hosts, 'worker': worker_hosts} cluster_spec = tf.train.ClusterSpec(cluster) environment = OpenAIGym(args.gym_id) logger = logging.getLogger() logger.setLevel(logging.INFO) # log_levels[agent.log_level]) stdout_logger = logging.StreamHandler(sys.stdout) stdout_logger.setLevel(logging.INFO) logger.addHandler(stdout_logger) if args.agent is not None: with open(args.agent, 'r') as fp: agent = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network is not None: with open(args.network, 'r') as fp: network = json.load(fp=fp) else: network = None logger.info("No network configuration provided.") if args.parameter_server: agent['device'] = '/job:ps/task:{}'.format(args.task_index) # '/cpu:0' else: agent['device'] = '/job:worker/task:{}'.format( args.task_index) # '/cpu:0' agent['execution'] = dict( type='distributed', distributed_spec=dict(cluster_spec=cluster_spec, task_index=args.task_index, job='ps' if args.parameter_server else 'worker', protocol='grpc')) agent = Agent.from_spec(spec=agent, kwargs=dict(states=environment.states, actions=environment.actions, network=network)) logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format( gym_id=args.gym_id)) logger.info("Config:") logger.info(agent) runner = Runner(agent=agent, environment=environment, repeat_actions=1) if args.debug: # TODO: Timestep-based reporting report_episodes = 1 else: report_episodes = 100 def episode_finished(r): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {} after overall {} timesteps. 
Steps Per Second {}" .format(r.agent.episode, r.agent.timestep, steps_per_second)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format( sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {}".format( sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) return True runner.run(timesteps=args.timesteps, episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, deterministic=args.deterministic, episode_finished=episode_finished) runner.close()
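# Sketch of the cluster layout the launcher above builds for --num-workers 2: one parameter
# server on port 12222 and workers on consecutive ports starting at 12223 (TensorFlow 1.x assumed).
import tensorflow as tf

example_cluster_spec = tf.train.ClusterSpec({
    'ps': ['127.0.0.1:12222'],
    'worker': ['127.0.0.1:12223', '127.0.0.1:12224']
})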
def main(): parser = argparse.ArgumentParser() parser.add_argument('rom', help="File path of the rom") parser.add_argument('-a', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") parser.add_argument( '-w', '--workers', help="Number of threads to run where the model is shared", type=int, default=16) parser.add_argument('-fs', '--frame-skip', help="Number of frames to repeat action", type=int, default=1) parser.add_argument('-rap', '--repeat-action-probability', help="Repeat action probability", type=float, default=0.0) parser.add_argument('-lolt', '--loss-of-life-termination', help="Loss of life counts as terminal state", action='store_true') parser.add_argument('-lolr', '--loss-of-life-reward', help="Loss of life reward/penalty. EX: -1 to penalize", type=float, default=0.0) parser.add_argument( '-ea', '--epsilon-annealing', help='Create separate epislon annealing schedules per thread', action='store_true') parser.add_argument('-ds', '--display-screen', action='store_true', default=False, help="Display emulator screen") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! 
logger.addHandler(logging.StreamHandler(sys.stdout)) environments = [ ALE(args.rom, frame_skip=args.frame_skip, repeat_action_probability=args.repeat_action_probability, loss_of_life_termination=args.loss_of_life_termination, loss_of_life_reward=args.loss_of_life_reward, display_screen=args.display_screen) for _ in range(args.workers) ] if args.network_spec: with open(args.network_spec, 'r') as fp: network_spec = json.load(fp=fp) else: network_spec = None logger.info("No network configuration provided.") agent_configs = [] if args.agent_config is not None: with open(args.agent_config, 'r') as fp: agent_config = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") for i in range(args.workers): worker_config = deepcopy(agent_config) # Optionally overwrite epsilon final values if "explorations_spec" in worker_config and worker_config[ 'explorations_spec']['type'] == "epsilon_anneal": if args.epsilon_annealing: # epsilon final values are [0.5, 0.1, 0.01] with probabilities [0.3, 0.4, 0.3] epsilon_final = np.random.choice([0.5, 0.1, 0.01], p=[0.3, 0.4, 0.3]) worker_config['explorations_spec'][ "epsilon_final"] = epsilon_final agent_configs.append(worker_config) # Let the first agent create the model # Manually assign model logger.info(agent_configs[0]) agent = Agent.from_spec(spec=agent_configs[0], kwargs=dict(states=environments[0].states, actions=environments[0].actions, network=network_spec)) agents = [agent] for i in range(args.workers - 1): config = agent_configs[i] agent_type = config.pop('type', None) worker = WorkerAgentGenerator(AgentsDictionary[agent_type])( states=environments[0].states, actions=environments[0].actions, network=network_spec, model=agent.model, **config) agents.append(worker) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_configs[0]) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError( "Cannot save agent to dir {} ()".format(save_dir)) def episode_finished(stats): if args.debug: logger.info( "Thread {t}. Finished episode {ep} after {ts} timesteps. Reward {r}" .format(t=stats['thread_id'], ep=stats['episode'], ts=stats['timestep'], r=stats['episode_reward'])) return True def summary_report(r): et = time.time() logger.info('=' * 40) logger.info('Current Step/Episode: {}/{}'.format( r.global_step, r.global_episode)) logger.info('SPS: {}'.format(r.global_step / (et - r.start_time))) reward_list = r.episode_rewards if len(reward_list) > 0: logger.info('Max Reward: {}'.format(np.max(reward_list))) logger.info("Average of last 500 rewards: {}".format( sum(reward_list[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format( sum(reward_list[-100:]) / 100)) logger.info('=' * 40) # Create runners threaded_runner = ThreadedRunner(agents, environments, repeat_actions=1, save_path=args.save, save_episodes=args.save_episodes) logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environments[0])) threaded_runner.run(summary_interval=100, episode_finished=episode_finished, summary_report=summary_report) threaded_runner.close() logger.info("Learning finished. Total episodes: {ep}".format( ep=threaded_runner.global_episode))
def main(): parser = argparse.ArgumentParser() parser.add_argument('rom', help="File path of the rom") parser.add_argument('-a', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") parser.add_argument('-fs', '--frame-skip', help="Number of frames to repeat action", type=int, default=1) parser.add_argument('-rap', '--repeat-action-probability', help="Repeat action probability", type=float, default=0.0) parser.add_argument('-lolt', '--loss-of-life-termination', help="Loss of life counts as terminal state", action='store_true') parser.add_argument('-lolr', '--loss-of-life-reward', help="Loss of life reward/penalty. EX: -1 to penalize", type=float, default=0.0) parser.add_argument('-ds', '--display-screen', action='store_true', default=False, help="Display emulator screen") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! logger.addHandler(logging.StreamHandler(sys.stdout)) environment = ALE(args.rom, frame_skip=args.frame_skip, repeat_action_probability=args.repeat_action_probability, loss_of_life_termination=args.loss_of_life_termination, loss_of_life_reward=args.loss_of_life_reward, display_screen=args.display_screen) if args.agent_config is not None: with open(args.agent_config, 'r') as fp: agent_config = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network_spec is not None: with open(args.network_spec, 'r') as fp: network_spec = json.load(fp=fp) else: network_spec = None logger.info("No network configuration provided.") agent = Agent.from_spec( spec=agent_config, kwargs=dict( states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec ) ) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) runner = Runner( agent=agent, environment=environment, repeat_actions=1 ) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: sps = r.timestep / (time.time() - r.start_time) logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100)) return True logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) runner.close() logger.info("Learning finished. 
Total episodes: {ep}".format(ep=runner.episode))
    environment.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('path', help="Path to Pycolab game definition file") parser.add_argument('-i', '--import-modules', help="Import module(s) required for environment") parser.add_argument('-a', '--agent', help="Agent configuration file") parser.add_argument('-n', '--network', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('--visualize', action='store_true', default=False, help="Enable Pycolab game's visualization") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger() logger.setLevel(logging.INFO) if args.import_modules is not None: for module in args.import_modules.split(','): importlib.import_module(name=module) if args.path is not None: sys.path.append(os.path.dirname(os.path.expanduser(args.path))) game_name = os.path.splitext(os.path.basename(args.path))[0] try: game_env = importlib.import_module(game_name) except: raise TensorForceError( "Could not get game {0} from path {1}".format( game_name, args.path)) environment = DMPycolab(game=game_env.make_game(), ui=game_env.get_ui(), visualize=args.visualize) if args.agent is not None: with open(args.agent, 'r') as fp: agent = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network is not None: with open(args.network, 'r') as fp: network = json.load(fp=fp) else: network = None logger.info("No network configuration provided.") agent = Agent.from_spec(spec=agent, kwargs=dict( states=environment.states, actions=environment.actions, network=network, )) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError( "Cannot save agent to dir {} ()".format(save_dir)) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent) runner = Runner(agent=agent, environment=environment, repeat_actions=1) if args.debug: # TODO: Timestep-based reporting report_episodes = 1 else: report_episodes = 100 logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environment)) def episode_finished(r, id_): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {:d} after {:d} timesteps. 
Steps Per Second {:0.2f}" .format(r.agent.episode, r.episode_timestep, steps_per_second)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {:0.2f}".format( sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {:0.2f}".format( sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) if args.save and args.save_episodes is not None and not r.episode % args.save_episodes: logger.info("Saving agent to {}".format(args.save)) r.agent.save_model(args.save) return True runner.run( num_timesteps=args.timesteps, num_episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, episode_finished=episode_finished, ) runner.close() logger.info("Learning finished. Total episodes: {ep}".format( ep=runner.agent.episode))
def main(): parser = argparse.ArgumentParser() parser.add_argument('gym_id', help="Id of the Gym environment") parser.add_argument('-a', '--agent', help="Agent configuration file") parser.add_argument('-n', '--network', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('--monitor', help="Save results to this directory") parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results") parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)") parser.add_argument('--visualize', action='store_true', default=False, help="Enable OpenAI Gym's visualization") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") parser.add_argument('-te', '--test', action='store_true', default=False, help="Test agent without learning.") parser.add_argument('-sl', '--sleep', type=float, default=None, help="Slow down simulation by sleeping for x seconds (fractions allowed).") parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.") parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.setLevel(logging.INFO) environment = OpenAIGym( gym_id=args.gym_id, monitor=args.monitor, monitor_safe=args.monitor_safe, monitor_video=args.monitor_video, visualize=args.visualize ) if args.agent is not None: with open(args.agent, 'r') as fp: agent = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network is not None: with open(args.network, 'r') as fp: network = json.load(fp=fp) else: network = None logger.info("No network configuration provided.") agent = Agent.from_spec( spec=agent, kwargs=dict( states=environment.states, actions=environment.actions, network=network, ) ) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError("Could not load agent from {}: No such directory.".format(load_dir)) agent.restore_model(args.load) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError("Cannot save agent to dir {} ()".format(save_dir)) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent) runner = Runner( agent=agent, environment=environment, repeat_actions=1 ) if args.debug: # TODO: Timestep-based reporting report_episodes = 1 else: report_episodes = 100 logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)) def episode_finished(r, id_): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time.time() - r.start_time) logger.info("Finished 
episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format( r.agent.episode, r.episode_timestep, steps_per_second )) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {:0.2f}". format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {:0.2f}". format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) if args.save and args.save_episodes is not None and not r.episode % args.save_episodes: logger.info("Saving agent to {}".format(args.save)) r.agent.save_model(args.save) return True runner.run( num_timesteps=args.timesteps, num_episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, deterministic=args.deterministic, episode_finished=episode_finished, testing=args.test, sleep=args.sleep ) runner.close() logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
def __init__(self, states_spec, actions_spec, network_spec, device=None, session_config=None, scope='dqn-nstep', saver_spec=None, summary_spec=None, distributed_spec=None, optimizer=None, discount=0.99, normalize_rewards=False, variable_noise=None, distributions_spec=None, entropy_regularization=None, target_sync_frequency=10000, target_update_weight=1.0, double_q_model=False, huber_loss=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=32, keep_last_timestep=True): """ Creates a DQN n-step agent. Args: states_spec: Dict containing at least one state definition. In the case of a single state, keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state is a dict itself with a unique name as its key. actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more. network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments such as activation or regularisation. Full examples are in the examples/configs folder. device: Device string specifying model device. session_config: optional tf.ConfigProto with additional desired session configurations scope: TensorFlow scope, defaults to agent name (e.g. `dqn`). saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`. Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and `evolutionary`. Consult the optimizer test or example configurations for more. discount: Float specifying reward discount factor. normalize_rewards: Boolean flag specifying whether to normalize rewards, default False. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). distributions_spec: Optional dict specifying action distributions to override default distribution choices. Must match action names. entropy_regularization: Optional positive float specifying an entropy regularization value. target_sync_frequency: Interval between optimization calls synchronizing the target network. target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network. huber_loss: Optional flat specifying Huber-loss clipping. preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each preprocessor is a dict containing a type and optional necessary arguments. exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise) and arguments. 
reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing. batched_observe: Optional int specifying how many observe calls are batched into one session run. Without batching, throughput will be lower because every `observe` triggers a session invocation to update rewards in the graph. batch_size: Int specifying number of samples collected via `observe` before an update is executed. keep_last_timestep: Boolean flag specifying whether the last sample is kept for the next batch, default True. """ if network_spec is None: raise TensorForceError("No network_spec provided.") if optimizer is None: self.optimizer = dict(type='adam', learning_rate=1e-3) else: self.optimizer = optimizer self.network_spec = network_spec self.device = device self.session_config = session_config self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.target_sync_frequency = target_sync_frequency self.target_update_weight = target_update_weight self.double_q_model = double_q_model self.huber_loss = huber_loss super(DQNNstepAgent, self).__init__(states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe, batch_size=batch_size, keep_last_timestep=keep_last_timestep)
def update(self, states, internals, actions, terminal, reward, return_loss_per_instance=False):
    fetches = [self.optimization]
    # Optionally fetch loss per instance
    if return_loss_per_instance:
        fetches.append(self.loss_per_instance)

    terminal = np.asarray(terminal)
    batched = (terminal.ndim == 1)
    if batched:
        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            feed_dict = {state_input: states[name][0] for name, state_input in self.states_input.items()}
            feed_dict.update({state_input: states[name][1] for name, state_input in self.next_states_input.items()})
        else:
            feed_dict = {state_input: states[name] for name, state_input in self.states_input.items()}
        feed_dict.update({internal_input: internals[n] for n, internal_input in enumerate(self.internals_input)})
        feed_dict.update({action_input: actions[name] for name, action_input in self.actions_input.items()})
        feed_dict[self.terminal_input] = terminal
        feed_dict[self.reward_input] = reward
    else:
        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            raise TensorForceError("Unbatched version not covered by fix.")
        else:
            feed_dict = {state_input: (states[name],) for name, state_input in self.states_input.items()}
            feed_dict.update({internal_input: (internals[n],) for n, internal_input in enumerate(self.internals_input)})
            feed_dict.update({action_input: (actions[name],) for name, action_input in self.actions_input.items()})
            feed_dict[self.terminal_input] = (terminal,)
            feed_dict[self.reward_input] = (reward,)

    feed_dict[self.deterministic_input] = True
    feed_dict[self.update_input] = True

    fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

    if return_loss_per_instance:
        return fetched[1]
def get_batch(self, batch_size, next_states=False): """ Samples a batch of the specified size according to priority. Args: batch_size: The batch size next_states: A boolean flag indicating whether 'next_states' values should be included Returns: A dict containing states, actions, rewards, terminals, internal states (and next states) """ if batch_size > len(self.observations): raise TensorForceError("Batch size is larger than number of observations in memory.") states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items()} internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config] actions = {name: np.zeros((batch_size,) + tuple(action['shape']), dtype=util.np_dtype(action['type'])) for name, action in self.actions_spec.items()} terminal = np.zeros((batch_size,), dtype=util.np_dtype('bool')) reward = np.zeros((batch_size,), dtype=util.np_dtype('float')) if next_states: next_states = {name: np.zeros((batch_size,) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items()} next_internals = [np.zeros((batch_size,) + shape, dtype) for shape, dtype in self.internals_config] self.batch_indices = list() not_sampled_index = self.none_priority_index sum_priorities = sum(priority for priority, _ in self.observations if priority is not None) for n in xrange(batch_size): if not_sampled_index < len(self.observations): _, observation = self.observations[not_sampled_index] index = not_sampled_index not_sampled_index += 1 elif sum_priorities / self.capacity < util.epsilon: index = randrange(self.none_priority_index) while index in self.batch_indices: index = randrange(self.none_priority_index) _, observation = self.observations[index] else: while True: sample = random() for index, (priority, observation) in enumerate(self.observations): sample -= priority / sum_priorities if sample < 0.0 or index >= self.none_priority_index: break if index not in self.batch_indices: break for name, state in states.items(): state[n] = observation[0][name] for k, internal in enumerate(internals): internal[n] = observation[1][k] for name, action in actions.items(): action[n] = observation[2][name] terminal[n] = observation[3] reward[n] = observation[4] if next_states: for name, next_state in next_states.items(): next_state[n] = observation[5][name] for k, next_internal in enumerate(next_internals): next_internal[n] = observation[6][k] self.batch_indices.append(index) if next_states: return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, next_states=next_states, next_internals=next_internals) else: return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward)
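# Toy illustration (independent of the memory class above) of drawing an index in proportion
# to its priority, the same scheme the sampling loop in get_batch implements over
# self.observations; priorities are assumed to be non-negative floats.
import random

def sample_proportional_index(priorities):
    threshold = random.random() * sum(priorities)
    cumulative = 0.0
    for index, priority in enumerate(priorities):
        cumulative += priority
        if cumulative >= threshold:
            return index
    return len(priorities) - 1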
def __init__( self, states_spec, actions_spec, network_spec, device=None, session_config=None, scope='ppo', saver_spec=None, summary_spec=None, distributed_spec=None, discount=0.99, normalize_rewards=False, variable_noise=None, distributions_spec=None, entropy_regularization=1e-2, baseline_mode=None, baseline=None, baseline_optimizer=None, gae_lambda=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=1000, keep_last_timestep=True, likelihood_ratio_clipping=None, step_optimizer=None, optimization_steps=10 ): # random_sampling=True # Sampling strategy for replay memory """ Creates a proximal policy optimization agent (PPO), ([Schulman et al., 2017] (https://openai-public.s3-us-west-2.amazonaws.com/blog/2017-07/ppo/ppo-arxiv.pdf). Args: states_spec: Dict containing at least one state definition. In the case of a single state, keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state is a dict itself with a unique name as its key. actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more. network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments such as activation or regularisation. Full examples are in the examples/configs folder. device: Device string specifying model device. session_config: optional tf.ConfigProto with additional desired session configurations scope: TensorFlow scope, defaults to agent name (e.g. `dqn`). saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. discount: Float specifying reward discount factor. normalize_rewards: Boolean flag specifying whether to normalize rewards, default False. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). distributions_spec: Optional dict specifying action distributions to override default distribution choices. Must match action names. entropy_regularization: Optional positive float specifying an entropy regularization value. baseline_mode: String specifying baseline mode, `states` for a separate baseline per state, `network` for sharing parameters with the training network. baseline: Optional dict specifying baseline type (e.g. `mlp`, `cnn`), and its layer sizes. Consult examples/configs for full example configurations. baseline_optimizer: Optional dict specifying an optimizer and its parameters for the baseline following the same conventions as the main optimizer. gae_lambda: Optional float specifying lambda parameter for generalized advantage estimation. preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. 
Each preprocessor is a dict containing a type and optional necessary arguments. exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise) and arguments. reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing. batched_observe: Optional int specifying how many observe calls are batched into one session run. Without batching, throughput will be lower because every `observe` triggers a session invocation to update rewards in the graph. batch_size: Int specifying number of samples collected via `observe` before an update is executed. keep_last_timestep: Boolean flag specifying whether last sample is kept, default True. likelihood_ratio_clipping: Optional clipping of likelihood ratio between old and new policy. step_optimizer: Optimizer dict specification for optimizer used in each PPO update step, defaults to Adam if None. optimization_steps: Int specifying number of optimization steps to execute on the collected batch using the step optimizer. ` """ if network_spec is None: raise TensorForceError("No network_spec provided.") self.network_spec = network_spec self.device = device self.session_config = session_config self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.baseline_mode = baseline_mode self.baseline = baseline self.baseline_optimizer = baseline_optimizer self.gae_lambda = gae_lambda self.likelihood_ratio_clipping = likelihood_ratio_clipping if step_optimizer is None: step_optimizer = dict( type='adam', learning_rate=1e-4 ) self.optimizer = dict( type='multi_step', optimizer=step_optimizer, num_steps=optimization_steps ) super(PPOAgent, self).__init__( states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe, batch_size=batch_size, keep_last_timestep=keep_last_timestep )
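# Hedged usage sketch (state and action shapes are made up) showing how the constructor above
# might be called; argument names follow the signature in this file, values are placeholders.
example_ppo_agent = PPOAgent(
    states_spec=dict(shape=(8,), type='float'),
    actions_spec=dict(type='int', num_actions=4),
    network_spec=[dict(type='dense', size=32), dict(type='dense', size=32)],
    batch_size=512,
    step_optimizer=dict(type='adam', learning_rate=1e-4)
)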
def __init__(self, states_spec, actions_spec, network_spec, device=None, scope='ppo', saver_spec=None, summary_spec=None, distributed_spec=None, discount=0.99, normalize_rewards=False, variable_noise=None, distributions_spec=None, entropy_regularization=1e-2, baseline_mode=None, baseline=None, baseline_optimizer=None, gae_lambda=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=1000, keep_last_timestep=True, likelihood_ratio_clipping=None, step_optimizer=None, optimization_steps=10): # random_sampling=True # Sampling strategy for replay memory """ Creates a proximal policy optimization agent (PPO), ([Schulman et al., 2017] (https://openai-public.s3-us-west-2.amazonaws.com/blog/2017-07/ppo/ppo-arxiv.pdf). Args: states_spec: actions_spec: network_spec: device: scope: saver_spec: summary_spec: distributed_spec: discount: normalize_rewards: variable_noise: distributions_spec: entropy_regularization: baseline_mode: baseline: baseline_optimizer: gae_lambda: preprocessing: exploration: reward_preprocessing: batched_observe: batch_size: keep_last_timestep: likelihood_ratio_clipping: step_optimizer: optimization_steps: """ if network_spec is None: raise TensorForceError("No network_spec provided.") self.network_spec = network_spec self.device = device self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.baseline_mode = baseline_mode self.baseline = baseline self.baseline_optimizer = baseline_optimizer self.gae_lambda = gae_lambda self.likelihood_ratio_clipping = likelihood_ratio_clipping if step_optimizer is None: step_optimizer = dict(type='adam', learning_rate=1e-4) self.optimizer = dict(type='multi_step', optimizer=step_optimizer, num_steps=optimization_steps) super(PPOAgent, self).__init__(states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe, batch_size=batch_size, keep_last_timestep=keep_last_timestep)
def tf_apply(self, x, update=False): if util.rank(x) != 2: raise TensorForceError( 'Invalid input rank for linear layer: {}, must be 2.'.format(util.rank(x)) ) if self.size is None: # If size is None than Output Matches Input, required for Skip Connections self.size = x.shape[1].value weights_shape = (x.shape[1].value, self.size) if self.weights_init is None: stddev = min(0.1, sqrt(2.0 / (x.shape[1].value + self.size))) self.weights_init = tf.random_normal_initializer(mean=0.0, stddev=stddev, dtype=tf.float32) elif isinstance(self.weights_init, float): if self.weights_init == 0.0: self.weights_init = tf.zeros_initializer(dtype=tf.float32) else: self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) elif isinstance(self.weights_init, list): self.weights_init = np.asarray(self.weights_init, dtype=np.float32) if self.weights_init.shape != weights_shape: raise TensorForceError( 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) ) self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) elif isinstance(self.weights_init, np.ndarray): if self.weights_init.shape != weights_shape: raise TensorForceError( 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) ) self.weights_init = tf.constant_initializer(value=self.weights_init, dtype=tf.float32) elif isinstance(self.weights_init, tf.Tensor): if util.shape(self.weights_init) != weights_shape: raise TensorForceError( 'Weights shape {} does not match expected shape {} '.format(self.weights_init.shape, weights_shape) ) bias_shape = (self.size,) if isinstance(self.bias_init, bool): if self.bias_init: self.bias_init = tf.zeros_initializer(dtype=tf.float32) else: self.bias_init = None elif isinstance(self.bias_init, float): if self.bias_init == 0.0: self.bias_init = tf.zeros_initializer(dtype=tf.float32) else: self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) elif isinstance(self.bias_init, list): self.bias_init = np.asarray(self.bias_init, dtype=np.float32) if self.bias_init.shape != bias_shape: raise TensorForceError( 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) ) self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) elif isinstance(self.bias_init, np.ndarray): if self.bias_init.shape != bias_shape: raise TensorForceError( 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) ) self.bias_init = tf.constant_initializer(value=self.bias_init, dtype=tf.float32) elif isinstance(self.bias_init, tf.Tensor): if util.shape(self.bias_init) != bias_shape: raise TensorForceError( 'Bias shape {} does not match expected shape {} '.format(self.bias_init.shape, bias_shape) ) if isinstance(self.weights_init, tf.Tensor): self.weights = self.weights_init else: self.weights = tf.get_variable( name='W', shape=weights_shape, dtype=tf.float32, initializer=self.weights_init ) x = tf.matmul(a=x, b=self.weights) if self.bias_init is None: self.bias = None else: if isinstance(self.bias_init, tf.Tensor): self.bias = self.bias_init else: self.bias = tf.get_variable(name='b', shape=bias_shape, dtype=tf.float32, initializer=self.bias_init) x = tf.nn.bias_add(value=x, bias=self.bias) return x
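# Worked example (not part of the layer class) of the default weight-initialization scale
# computed above: for a linear layer mapping 128 inputs to 64 outputs,
# sqrt(2 / (128 + 64)) is about 0.102, so the stddev is clipped to the 0.1 cap.
from math import sqrt

fan_in, fan_out = 128, 64
stddev = min(0.1, sqrt(2.0 / (fan_in + fan_out)))
assert stddev == 0.1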
def __init__(self, states_spec, actions_spec, config, **kwargs): # States and actions specifications self.states_spec = states_spec self.actions_spec = actions_spec # Discount factor self.discount = config.discount # Reward normalization assert isinstance(config.normalize_rewards, bool) self.normalize_rewards = config.normalize_rewards # Variable noise assert config.variable_noise is None or config.variable_noise > 0.0 self.variable_noise = config.variable_noise # TensorFlow summaries self.summary_labels = set(config.summary_labels or ()) # Variables and summaries self.variables = dict() self.all_variables = dict() self.summaries = list() if not config.local_model or not config.replica_model: # If not local_model mode or not internal global model self.default_graph = tf.Graph().as_default() self.graph = self.default_graph.__enter__() if config.cluster_spec is None: if config.parameter_server or config.replica_model or config.local_model: raise TensorForceError( "Invalid config value for distributed mode.") self.device = config.device self.global_model = None elif config.parameter_server: if config.replica_model or config.local_model: raise TensorForceError( "Invalid config value for distributed mode.") self.device = config.device self.global_model = None elif config.replica_model: self.device = tf.train.replica_device_setter( worker_device=config.device, cluster=config.cluster_spec) self.global_model = None elif config.local_model: if config.replica_model: raise TensorForceError( "Invalid config value for distributed mode.") self.device = config.device global_config = config.copy() global_config.set(key='replica_model', value=True) self.global_model = self.__class__(states_spec=states_spec, actions_spec=actions_spec, config=global_config, **kwargs) else: raise TensorForceError( "Invalid config value for distributed mode.") with tf.device(device_name_or_function=self.device): # Timestep and episode # TODO: various modes !!! 
if self.global_model is None: # TODO: Variables seem to re-initialize in the beginning every time a runner starts self.timestep = tf.get_variable(name='timestep', dtype=tf.int32, initializer=0, trainable=False) self.episode = tf.get_variable(name='episode', dtype=tf.int32, initializer=0, trainable=False) else: self.timestep = self.global_model.timestep self.episode = self.global_model.episode with tf.name_scope(name=config.scope): def custom_getter(getter, name, registered=False, **kwargs): variable = getter( name=name, **kwargs) # Top-level, hence no 'registered' if not registered and not name.startswith('optimization'): self.all_variables[name] = variable if kwargs.get('trainable', True): self.variables[name] = variable if 'variables' in self.summary_labels: summary = tf.summary.histogram(name=name, values=variable) self.summaries.append(summary) return variable # Create placeholders, tf functions, internals, etc self.initialize(custom_getter=custom_getter) # Input tensors states = self.get_states(states=self.state_inputs) internals = [ tf.identity(input=internal) for internal in self.internal_inputs ] actions = self.get_actions(actions=self.action_inputs) terminal = tf.identity(input=self.terminal_input) reward = self.get_reward(states=states, internals=internals, terminal=terminal, reward=self.reward_input) # Stop gradients for input preprocessing states = { name: tf.stop_gradient(input=state) for name, state in states.items() } actions = { name: tf.stop_gradient(input=action) for name, action in actions.items() } reward = tf.stop_gradient(input=reward) # Optimizer if config.optimizer is None: self.optimizer = None elif config.local_model and not config.replica_model: # If local_model mode and not internal global model self.optimizer = GlobalOptimizer( optimizer=config.optimizer) else: self.optimizer = Optimizer.from_spec(spec=config.optimizer) # Create output fetch operations self.create_output_operations(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, deterministic=self.deterministic) if config.local_model and config.replica_model: # If local_model mode and internal global model return # Local and global initialize operations if config.local_model: init_op = tf.variables_initializer( var_list=(self.global_model.get_variables( include_non_trainable=True))) local_init_op = tf.variables_initializer( var_list=(self.get_variables(include_non_trainable=True))) else: init_op = tf.variables_initializer(var_list=(self.get_variables( include_non_trainable=True))) local_init_op = None # Summary operation if len(self.get_summaries()) > 0: summary_op = tf.summary.merge(inputs=self.get_summaries()) else: summary_op = None # TODO: MonitoredSession or so? 
self.supervisor = tf.train.Supervisor( is_chief=(config.task_index == 0), init_op=init_op, local_init_op=local_init_op, logdir=config.model_directory, summary_op=summary_op, global_step=self.timestep, save_summaries_secs=config.summary_frequency, save_model_secs=config.save_frequency # checkpoint_basename='model.ckpt' # session_manager=None ) # tf.ConfigProto(device_filters=['/job:ps', '/job:worker/task:{}/cpu:0'.format(self.task_index)]) if config.parameter_server: self.server = tf.train.Server( server_or_cluster_def=config.cluster_spec, job_name='ps', task_index=config.task_index, # config=tf.ConfigProto(device_filters=["/job:ps"]) # config=tf.ConfigProto( # inter_op_parallelism_threads=2, # log_device_placement=True # ) ) # Param server does nothing actively self.server.join() elif config.cluster_spec is not None: self.server = tf.train.Server( server_or_cluster_def=config.cluster_spec, job_name='worker', task_index=config.task_index, # config=tf.ConfigProto(device_filters=["/job:ps"]) # config=tf.ConfigProto( # inter_op_parallelism_threads=2, # log_device_placement=True # ) ) self.managed_session = self.supervisor.managed_session( master=self.server.target, start_standard_services=True) self.session = self.managed_session.__enter__() else: self.managed_session = self.supervisor.managed_session( start_standard_services=True) self.session = self.managed_session.__enter__()
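# Hedged sketch (not part of the source): the distributed branches above read a handful of
# config fields. The dicts below illustrate plausible values for a one-worker,
# one-parameter-server setup; the host/port strings and device names are assumptions.
import tensorflow as tf

example_cluster_spec = tf.train.ClusterSpec(dict(
    ps=['localhost:12222'],
    worker=['localhost:12223']
))

# Worker process: local_model=True builds a local graph plus an internal global replica model.
example_worker_config = dict(
    cluster_spec=example_cluster_spec,
    parameter_server=False,
    replica_model=False,
    local_model=True,
    task_index=0,
    device='/job:worker/task:0/cpu:0'
)

# Parameter-server process: the constructor starts a tf.train.Server and blocks in server.join().
example_ps_config = dict(
    cluster_spec=example_cluster_spec,
    parameter_server=True,
    replica_model=False,
    local_model=False,
    task_index=0,
    device='/job:ps/task:0/cpu:0'
)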
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a', '--agent', default='DQNAgent')
    parser.add_argument('-c', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-config', help="Network configuration file")
    parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=2000 * 60, help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")

    args = parser.parse_args()

    env = OpenAIUniverse(args.gym_id)
    env.configure(remotes=1)

    default = dict(
        repeat_actions=1,
        actions=env.actions,
        states=env.states,
        max_episode_length=args.max_timesteps
    )

    if args.agent_config:
        config = Configuration.from_json(args.agent_config)
    else:
        config = Configuration()
    config.default(default)

    if args.network_config:
        network_config = Configuration.from_json(args.network_config).network_layers
    else:
        if config.network_layers:
            network_config = config.network_layers
        else:
            raise TensorForceError("Error: No network configuration provided.")

    if args.debug:
        print("Configuration:")
        print(config)

    logger = logging.getLogger(__name__)
    logger.setLevel(log_levels[config.log_level])

    # No state preprocessing stack is configured in this example.
    stack = None

    agent = create_agent(args.agent, config, network_config)

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(config)

    runner = Runner(agent, env, preprocessor=stack, repeat_actions=config.repeat_actions)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}.".format(save_dir))
        runner.save_model(args.save, args.save_episodes)

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    # The monitor command-line options are disabled above, so there is no monitor to close here.
    env.close()
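# Hedged sketch (not part of the source): the --network-config file read above is accessed as
# Configuration.from_json(...).network_layers, i.e. a JSON object with a 'network_layers' list
# of layer dicts. Layer types and sizes here are placeholder assumptions.
example_network_config = dict(
    network_layers=[
        dict(type='dense', size=64, activation='relu'),
        dict(type='dense', size=64, activation='relu')
    ]
)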
def main(): parser = argparse.ArgumentParser() parser.add_argument( '-P', '--port', default=6025, help= "Port on which the UE4 Game listens on for incoming RL-client connections" ) parser.add_argument('-H', '--host', default=None, help="Hostname of the UE4 Game (default: localhost)") parser.add_argument('-a', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes") parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps") parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode") parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") parser.add_argument('-R', '--random-test-run', action="store_true", help="Do a quick random test run on the env") args = parser.parse_args() # logging.basicConfig(filename="logfile.txt", level=logging.INFO) logging.basicConfig(stream=sys.stderr) logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # We have to connect this remote env to get the specs. # We also discretize axis-mappings b/c we will use a deep q-network. # Use num_ticks==6 to match Nature paper by Mnih et al. # ("human cannot press fire button with more than 10Hz", dt=1/60) # TODO: Need to build in capturing and concat'ing last 4 images (plus 8-bit conversion!) into 1 input state signal. # TODO: Use pre-processor for that. environment = UE4Environment(host=args.host, port=args.port, connect=True, discretize_actions=True, num_ticks=6) environment.seed(200) # Do a quick random test-run with image capture of the first n images -> then exit after 1000 steps. if args.random_test_run: # Reset the env. s = environment.reset() img = Image.fromarray( s, "RGB" if len(environment.states["shape"]) == 3 else "L") # Save first received image as a sanity-check. 
img.save("reset.png") for i in range(1000): s, is_terminal, r = environment.execute(actions=random.choice( range(environment.actions["num_actions"]))) if i < 10: img = Image.fromarray(s, "RGB") img.save("{:03d}.png".format(i)) logging.debug("i={} r={} term={}".format(i, r, is_terminal)) if is_terminal: environment.reset() quit() if args.agent_config is not None: with open(args.agent_config, 'r') as fp: agent_config = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network_spec is not None: with open(args.network_spec, 'r') as fp: network_spec = json.load(fp=fp) else: network_spec = None logger.info("No network configuration provided.") agent = Agent.from_spec(spec=agent_config, kwargs=dict(states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec)) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) runner = Runner(agent=agent, environment=environment, repeat_actions=1) if args.debug: # TODO: Timestep-based reporting report_episodes = 1 else: report_episodes = 100 logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environment)) def episode_finished(r): if r.episode % report_episodes == 0: steps_per_second = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {} after {} timesteps. Steps Per Second {}". format(r.agent.episode, r.episode_timestep, steps_per_second)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format( sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards)))) logger.info("Average of last 100 rewards: {}".format( sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards)))) return True runner.run(timesteps=args.timesteps, episodes=args.episodes, max_episode_timesteps=args.max_episode_timesteps, deterministic=args.deterministic, episode_finished=episode_finished) runner.close() logger.info("Learning finished. Total episodes: {ep}".format( ep=runner.agent.episode))
def main():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
    )
    logger.addHandler(console_handler)

    parser = argparse.ArgumentParser()
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    args = parser.parse_args()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    # Flatten the observation vector before it is fed into the network.
    agent_config['states_preprocessing'] = [{'type': 'flatten'}]

    logger.info("Start training")

    environment = OpenSim(env_id=1, visualize=True)

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec,
        )
    )

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    def episode_finished(r):
        if r.episode % 100 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(
                ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Episode timesteps: {}".format(r.episode_timestep))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=6000000, episodes=2, max_episode_timesteps=10000, episode_finished=episode_finished)

    # Final evaluation rollout after training.
    terminal = False
    state = environment.reset()
    while not terminal:
        action = agent.act(state)
        state, terminal, reward = environment.execute(action)

    runner.close()
def __init__( self, states_spec, actions_spec, batched_observe=1000, scope='learning_agent', # parameters specific to LearningAgents summary_spec=None, network_spec=None, discount=0.99, device=None, session_config=None, saver_spec=None, distributed_spec=None, optimizer=None, variable_noise=None, states_preprocessing_spec=None, explorations_spec=None, reward_preprocessing_spec=None, distributions_spec=None, entropy_regularization=None ): """ Initializes the learning agent. Args: summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments such as activation or regularisation. Full examples are in the examples/configs folder. discount (float): The reward discount factor. device: Device string specifying model device. session_config: optional tf.ConfigProto with additional desired session configurations saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`. Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and `evolutionary`. Consult the optimizer test or example configurations for more. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). states_preprocessing_spec: Optional list of states preprocessors to apply to state (e.g. `image_resize`, `greyscale`). explorations_spec: Optional dict specifying action exploration type (epsilon greedy or Gaussian noise). reward_preprocessing_spec: Optional dict specifying reward preprocessing. distributions_spec: Optional dict specifying action distributions to override default distribution choices. Must match action names. entropy_regularization: Optional positive float specifying an entropy regularization value. 
""" # TensorFlow summaries & Configuration Meta Parameter Recorder options self.summary_spec = summary_spec if self.summary_spec is None: self.summary_labels = set() else: self.summary_labels = set(self.summary_spec.get('labels', ())) self.meta_param_recorder = None # if 'configuration' in self.summary_labels or 'print_configuration' in self.summary_labels: if any(k in self.summary_labels for k in ['configuration', 'print_configuration']): self.meta_param_recorder = MetaParameterRecorder(inspect.currentframe()) if 'meta_dict' in self.summary_spec: # Custom Meta Dictionary passed self.meta_param_recorder.merge_custom(self.summary_spec['meta_dict']) if 'configuration' in self.summary_labels: # Setup for TensorBoard population self.summary_spec['meta_param_recorder_class'] = self.meta_param_recorder if 'print_configuration' in self.summary_labels: # Print to STDOUT (TODO: optimize output) self.meta_param_recorder.text_output(format_type=1) if network_spec is None: raise TensorForceError("No network_spec provided.") self.network_spec = network_spec self.discount = discount self.device = device self.session_config = session_config self.saver_spec = saver_spec self.distributed_spec = distributed_spec if optimizer is None: self.optimizer = dict( type='adam', learning_rate=1e-3 ) else: self.optimizer = optimizer self.variable_noise = variable_noise self.states_preprocessing_spec = states_preprocessing_spec self.explorations_spec = explorations_spec self.reward_preprocessing_spec = reward_preprocessing_spec self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization super(LearningAgent, self).__init__( states_spec=states_spec, actions_spec=actions_spec, batched_observe=batched_observe, scope=scope )
def get_batch(self, batch_size, next_states=False): """ Samples a batch of the specified size according to priority. Args: batch_size: The batch size next_states: A boolean flag indicating whether 'next_states' values should be included Returns: A dict containing states, actions, rewards, terminals, internal states (and next states) """ if batch_size > len(self.observations): raise TensorForceError( "Requested batch size is larger than observations in memory: increase config.first_update." ) # Init empty states states = { name: np.zeros((batch_size, ) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items() } internals = [ np.zeros((batch_size, ) + shape, dtype) for shape, dtype in self.internals_spec ] actions = { name: np.zeros((batch_size, ) + tuple(action['shape']), dtype=util.np_dtype(action['type'])) for name, action in self.actions_spec.items() } terminal = np.zeros((batch_size, ), dtype=util.np_dtype('bool')) reward = np.zeros((batch_size, ), dtype=util.np_dtype('float')) if next_states: next_states = { name: np.zeros((batch_size, ) + tuple(state['shape']), dtype=util.np_dtype(state['type'])) for name, state in self.states_spec.items() } next_internals = [ np.zeros((batch_size, ) + shape, dtype) for shape, dtype in self.internals_spec ] # Start with unseen observations unseen_indices = list( xrange(self.none_priority_index + self.observations._capacity - 1, len(self.observations) + self.observations._capacity - 1)) self.batch_indices = unseen_indices[:batch_size] # Get remaining observations using weighted sampling remaining = batch_size - len(self.batch_indices) if remaining: samples = self.observations.sample_minibatch(remaining) sample_indices = [i for i, o in samples] self.batch_indices += sample_indices # Shuffle np.random.shuffle(self.batch_indices) # Collect observations for n, index in enumerate(self.batch_indices): observation, _ = self.observations._memory[index] for name, state in states.items(): state[n] = observation[0][name] for k, internal in enumerate(internals): internal[n] = observation[1][k] for name, action in actions.items(): action[n] = observation[2][name] terminal[n] = observation[3] reward[n] = observation[4] if next_states: for name, next_state in next_states.items(): next_state[n] = observation[5][name] for k, next_internal in enumerate(next_internals): next_internal[n] = observation[6][k] if next_states: return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, next_states=next_states, next_internals=next_internals) else: return dict(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward)
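# Hedged usage sketch (not part of the source): consuming the dict returned by get_batch above.
# The 'memory' instance, its prior contents and the single state named 'state' with shape (4,)
# are assumptions for illustration.
batch = memory.get_batch(batch_size=32, next_states=True)
assert batch['states']['state'].shape == (32, 4)
assert batch['terminal'].shape == (32,) and batch['reward'].shape == (32,)
assert batch['next_states']['state'].shape == (32, 4)
# batch['internals'] and batch['next_internals'] are lists with one array per internal state.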
def main():
    parser = argparse.ArgumentParser()

    # N.b. if run from within lab, the working directory is something like lab/bazel-out/../../tensorforce.
    # Hence, relative paths will not work without first fetching the path of this run file.
    parser.add_argument('-id', '--level-id', default='tests/demo_map', help="DeepMind Lab level id")
    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    parser.add_argument('-n', '--network-spec', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=1000, help="Number of episodes")
    parser.add_argument('-t', '--max-timesteps', type=int, default=200, help="Maximum number of timesteps per episode")
    parser.add_argument('-m', '--monitor', help="Save results to this directory")
    parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D', '--debug', action='store_true', default=True, help="Show debug outputs")

    # Redirect output to file
    sys.stdout = open('lab_output.txt', 'w')

    args = parser.parse_args()

    environment = DeepMindLab(args.level_id)

    path = os.path.dirname(__file__)
    if args.agent_config:
        # Use absolute path
        with open(os.path.join(path, args.agent_config), 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if not args.network_spec:
        raise TensorForceError("No network configuration provided.")
    else:
        with open(os.path.join(path, args.network_spec), 'r') as fp:
            network_spec = json.load(fp=fp)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # configurable!!!

    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network_spec
        )
    )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError("Cannot save agent to dir {}.".format(save_dir))

    report_episodes = max(1, args.episodes // 1000)

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.total_timesteps / (time.time() - r.start_time)
            logger.info("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}".format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(np.mean(r.episode_rewards[-500:])))
            logger.info("Average of last 100 rewards: {}".format(np.mean(r.episode_rewards[-100:])))
        return True

    logger.info("Starting {agent} for Lab environment '{env}'".format(agent=agent, env=environment))
    runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished)
    runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode + 1))
    environment.close()
def tf_observe_timestep(self, states, internals, actions, terminal, reward): """ Args: states (): internals (): actions (): terminal (): reward (): Returns: """ # Store timestep in memory stored = self.memory.store( states=states, internals=internals, actions=actions, terminal=terminal, reward=reward ) # Periodic optimization with tf.control_dependencies(control_inputs=(stored,)): unit = self.update_mode['unit'] batch_size = self.update_mode['batch_size'] frequency = self.update_mode.get('frequency', batch_size) first_update = self.update_mode.get('first_update', 0) if unit == 'timesteps': # Timestep-based batch optimize = tf.logical_and( x=tf.equal(x=(self.timestep % frequency), y=0), y=tf.logical_and( x=tf.greater_equal(x=self.timestep, y=batch_size), y=tf.greater_equal(x=self.timestep, y=first_update) ) ) batch = self.memory.retrieve_timesteps(n=batch_size) elif unit == 'episodes': # Episode-based batch optimize = tf.logical_and( x=tf.equal(x=(self.episode % frequency), y=0), y=tf.logical_and( # Only update once per episode increment. x=tf.greater(x=tf.count_nonzero(input_tensor=terminal), y=0), y=tf.logical_and( x=tf.greater_equal(x=self.episode, y=batch_size), y=tf.greater_equal(x=self.episode, y=first_update) ) ) ) batch = self.memory.retrieve_episodes(n=batch_size) elif unit == 'sequences': # Timestep-sequence-based batch sequence_length = self.update_mode.get('length', 8) optimize = tf.logical_and( x=tf.equal(x=(self.timestep % frequency), y=0), y=tf.logical_and( x=tf.greater_equal(x=self.timestep, y=(batch_size + sequence_length - 1)), y=tf.greater_equal(x=self.timestep, y=first_update) ) ) batch = self.memory.retrieve_sequences(n=batch_size, sequence_length=sequence_length) else: raise TensorForceError("Invalid update unit: {}.".format(unit)) # Do not calculate gradients for memory-internal operations. batch = util.map_tensors( fn=(lambda tensor: tf.stop_gradient(input=tensor)), tensors=batch ) return tf.cond( pred=optimize, true_fn=(lambda: self.fn_optimization(**batch)), false_fn=tf.no_op )
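# Hedged sketch (not part of the source): update_mode dicts matching the three branches above.
# Batch sizes, frequencies and the sequence length are arbitrary illustration values.
example_update_modes = dict(
    timestep_based=dict(unit='timesteps', batch_size=64, frequency=4, first_update=1000),
    episode_based=dict(unit='episodes', batch_size=10, frequency=10),   # frequency defaults to batch_size
    sequence_based=dict(unit='sequences', batch_size=32, frequency=4, length=8)
)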
def __init__(self, config, model=None): """Initializes the reinforcement learning agent. Args: config (Configuration): configuration object containing at least `states`, `actions`, `preprocessing` and 'exploration`. model (Model): optional model instance. If not supplied, a new model is created. """ assert self.__class__.name is not None and self.__class__.model is not None config.default(Agent.default_config) self.logger = logging.getLogger(__name__) self.logger.setLevel(util.log_levels[config.log_level]) # states config and preprocessing self.preprocessing = dict() if 'shape' in config.states: # only one state config.states = dict(state=config.states) self.unique_state = True if config.preprocessing is not None: config.preprocessing = dict(state=config.preprocessing) else: self.unique_state = False for name, state in config.states: state.default(dict(type='float')) if isinstance(state.shape, int): state.shape = (state.shape, ) if config.preprocessing is not None and name in config.preprocessing: preprocessing = Preprocessing.from_config( config=config.preprocessing[name]) self.preprocessing[name] = preprocessing state.shape = preprocessing.processed_shape(shape=state.shape) # actions config and exploration self.exploration = dict() if 'continuous' in config.actions: # only one action config.actions = dict(action=config.actions) if config.exploration is not None: config.exploration = dict(action=config.exploration) self.unique_action = True else: self.unique_action = False for name, action in config.actions: if action.continuous: action.default(dict(shape=(), min_value=None, max_value=None)) else: action.default(dict(shape=())) if isinstance(action.shape, int): action.shape = (action.shape, ) if config.exploration is not None and name in config.exploration: self.exploration[name] = Exploration.from_config( config=config.exploration[name]) self.states_config = config.states self.actions_config = config.actions if model is None: self.model = self.__class__.model(config) else: if not isinstance(model, self.__class__.model): raise TensorForceError( "Supplied model class `{}` does not match expected agent model class `{}`" .format( type(model).__name__, self.__class__.model.__name__)) self.model = model not_accessed = config.not_accessed() if not_accessed: self.logger.warning("Configuration values not accessed: {}".format( ', '.join(not_accessed))) self.episode = -1 self.timestep = 0 self.reset()
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', default=False, help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    parser.add_argument('--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs")
    parser.add_argument('--job', type=str, default=None, help="For distributed mode: The job type of this agent.")
    parser.add_argument('--task', type=int, default=0, help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(
        gym_id=args.gym_id,
        monitor=args.monitor,
        monitor_safe=args.monitor_safe,
        monitor_video=args.monitor_video
    )

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    # TEST
    agent["execution"] = dict(
        type="distributed",
        distributed_spec=dict(
            job=args.job,
            task_index=args.task,
            # parameter_server=(args.job == "ps"),
            cluster_spec=dict(
                ps=["192.168.2.107:22222"],
                worker=["192.168.2.107:22223"]
            )
        )
    ) if args.job else None
    # END: TEST

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network,
        )
    )

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError("Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    runner.run(
        num_timesteps=args.timesteps,
        num_episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished
    )
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.agent.episode))
def tf_apply(self, x, update):
    inputs_to_merge = list()
    for name in self.inputs:
        # Previous input, by name or "*", as in a normal network_spec.
        # Not using named_tensors directly as there could be unintended outcomes.
        if name == "*" or name == "previous":
            inputs_to_merge.append(x)
        elif name in self.named_tensors:
            inputs_to_merge.append(self.named_tensors[name])
        else:
            # Failed to find key in available inputs; list the available names in the error.
            keys = list(self.named_tensors)
            raise TensorForceError(
                'ComplexNetwork input "{}" doesn\'t exist. Available inputs: {}'.format(name, keys)
            )

    # Cast inputs to the most precise dtype present so TensorFlow does not raise an error on mixed types.
    # Quick & dirty promotion order: bool=0, int32=10, int64=20, float32=30, float64=40.
    cast_type_level = 0
    cast_type_dict = {'bool': 0, 'int32': 10, 'int64': 20, 'float32': 30, 'float64': 40}
    cast_type_func_dict = {0: tf.identity, 10: tf.to_int32, 20: tf.to_int64, 30: tf.to_float, 40: tf.to_double}

    # Scan inputs for the maximum cast type.
    for tensor in inputs_to_merge:
        key = str(tensor.dtype.name)
        if key in cast_type_dict:
            if cast_type_dict[key] > cast_type_level:
                cast_type_level = cast_type_dict[key]
        else:
            raise TensorForceError('Network spec "input" doesn\'t support dtype {}'.format(key))

    # Add casting where needed.
    for index, tensor in enumerate(inputs_to_merge):
        key = str(tensor.dtype.name)
        if cast_type_dict[key] < cast_type_level:
            inputs_to_merge[index] = cast_type_func_dict[cast_type_level](tensor)

    input_tensor = tf.concat(values=inputs_to_merge, axis=self.axis)
    return input_tensor


class Output(Layer):
    """
    Output layer. Used by ComplexLayeredNetwork to register a tensor under a name for use
    with Input layers. Acts as an input-to-output passthrough.
    """

    def __init__(self, output, scope='output', summary_labels=()):
        """
        Output layer.

        Args:
            output: A string naming the tensor; it will be added to the available inputs.
        """
        self.output = output
        super(Output, self).__init__(scope=scope, summary_labels=summary_labels)

    def tf_apply(self, x, update):
        self.named_tensors[self.output] = x
        return x


class ComplexLayeredNetwork(LayerBasedNetwork):
    """
    Complex network consisting of a sequence of layers, which can be created from a
    specification dict.
    """

    def __init__(self, complex_layers_spec, scope='layered-network', summary_labels=()):
        """
        Complex layered network.

        Args:
            complex_layers_spec: List of layer specification dicts
        """
        super(ComplexLayeredNetwork, self).__init__(scope=scope, summary_labels=summary_labels)
        self.complex_layers_spec = complex_layers_spec
        self.Inputs = dict()

        layer_counter = Counter()
        for branch_spec in self.complex_layers_spec:
            for layer_spec in branch_spec:
                if isinstance(layer_spec['type'], str):
                    name = layer_spec['type']
                else:
                    name = 'layer'
                scope = name + str(layer_counter[name])
                layer_counter[name] += 1

                layer = Layer.from_spec(
                    spec=layer_spec,
                    kwargs=dict(scope=scope, summary_labels=summary_labels)
                )
                # Link the shared named-tensor dictionary into the layer.
                layer.tf_tensors(named_tensors=self.Inputs)
                self.add_layer(layer=layer)

    def tf_apply(self, x, internals, update, return_internals=False):
        if isinstance(x, dict):
            self.Inputs.update(x)
            if len(x) == 1:
                x = next(iter(x.values()))

        internal_outputs = list()
        index = 0
        for layer in self.layers:
            layer_internals = [internals[index + n] for n in range(layer.num_internals)]
            index += layer.num_internals
            x = layer.apply(x, update, *layer_internals)

            if not isinstance(x, tf.Tensor):
                internal_outputs.extend(x[1])
                x = x[0]

        if return_internals:
            return x, internal_outputs
        else:
            return x

    @staticmethod
    def from_json(filename):  # TODO: not tested
        """
        Creates a complex layered network from a JSON file.

        Args:
            filename: Path to configuration

        Returns:
            A ComplexLayeredNetwork with layers generated from the JSON
        """
        path = os.path.join(os.getcwd(), filename)
        with open(path, 'r') as fp:
            config = json.load(fp=fp)
        return ComplexLayeredNetwork(complex_layers_spec=config)
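# Hedged sketch (not part of the source): a two-branch complex_layers_spec. The Input/Output spec
# keys ('inputs', 'axis', 'output') are inferred from the attributes read above; the layer types
# and sizes are placeholder assumptions.
example_complex_layers_spec = [
    [  # Branch 1: embed the 'image' state and register the result under a name.
        dict(type='input', inputs=['image']),
        dict(type='conv2d', size=32, window=8, stride=4),
        dict(type='flatten'),
        dict(type='output', output='image_embedding')
    ],
    [  # Branch 2: concatenate the registered embedding with the 'measurements' state.
        dict(type='input', inputs=['measurements', 'image_embedding'], axis=1),
        dict(type='dense', size=64),
        dict(type='output', output='final')
    ]
]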
def create_tf_operations(self, config): """ Creates generic TensorFlow operations and placeholders required for models. Args: config: Model configuration which must contain entries for states and actions. Returns: """ self.action_taken = dict() self.internal_inputs = list() self.internal_outputs = list() self.internal_inits = list() # Placeholders with tf.variable_scope('placeholder'): # States self.state = dict() for name, state in config.states.items(): self.state[name] = tf.placeholder( dtype=util.tf_dtype(state.type), shape=(None, ) + tuple(state.shape), name=name) # Actions self.action = dict() self.discrete_actions = [] self.continuous_actions = [] for name, action in config.actions: if action.continuous: if not self.__class__.allows_continuous_actions: raise TensorForceError( "Error: Model does not support continuous actions." ) self.action[name] = tf.placeholder( dtype=util.tf_dtype('float'), shape=(None, ), name=name) else: if not self.__class__.allows_discrete_actions: raise TensorForceError( "Error: Model does not support discrete actions.") self.action[name] = tf.placeholder( dtype=util.tf_dtype('int'), shape=(None, ), name=name) # Reward & terminal self.reward = tf.placeholder(dtype=tf.float32, shape=(None, ), name='reward') self.terminal = tf.placeholder(dtype=tf.bool, shape=(None, ), name='terminal') # Deterministic action flag self.deterministic = tf.placeholder(dtype=tf.bool, shape=(), name='deterministic') # Optimizer if config.optimizer is not None: learning_rate = config.learning_rate with tf.variable_scope('optimization'): optimizer = util.function(config.optimizer, optimizers) args = config.optimizer_args or () kwargs = config.optimizer_kwargs or {} self.optimizer = optimizer(learning_rate, *args, **kwargs) else: self.optimizer = None
def __init__( self, states_spec, actions_spec, device=None, scope='constant', saver_spec=None, summary_spec=None, distributed_spec=None, discount=0.99, normalize_rewards=False, variable_noise=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, action_values=None ): """ Initializes a constant agent which returns a constant action of the provided shape. Args: states_spec: Dict containing at least one state definition. In the case of a single state, keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state is a dict itself with a unique name as its key. actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more. device: Device string specifying model device. scope: TensorFlow scope, defaults to agent name (e.g. `dqn`). saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. discount: Float specifying reward discount factor. normalize_rewards: Boolean flag specifying whether to normalize rewards, default False. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). preprocessing: Optional list of preprocessors (e.g. `image_resize`, `grayscale`) to apply to state. Each preprocessor is a dict containing a type and optional necessary arguments. exploration: Optional dict specifying exploration type (epsilon greedy strategies or Gaussian noise) and arguments. reward_preprocessing: Optional dict specifying reward preprocessor using same syntax as state preprocessing. batched_observe: Optional int specifying how many observe calls are batched into one session run. Without batching, throughput will be lower because every `observe` triggers a session invocation to update rewards in the graph. action_values: Action value specification, must match actions_spec names """ if action_values is None: raise TensorForceError("No action_values for constant model provided.") self.optimizer = None self.device = device self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.action_values = action_values super(ConstantAgent, self).__init__( states_spec, actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe )
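# Hedged sketch (not part of the source): constructing a ConstantAgent whose action_values keys
# match the action names. The import path and the specs are assumptions for illustration.
from tensorforce.agents import ConstantAgent

constant_agent = ConstantAgent(
    states_spec=dict(shape=(4,), type='float'),
    actions_spec=dict(action=dict(type='int', num_actions=3)),
    action_values=dict(action=1)  # always emit action index 1
)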
def __init__( self, states_spec, actions_spec, network_spec, device=None, session_config=None, scope='trpo', saver_spec=None, summary_spec=None, distributed_spec=None, discount=0.99, variable_noise=None, states_preprocessing_spec=None, explorations_spec=None, reward_preprocessing_spec=None, distributions_spec=None, entropy_regularization=None, baseline_mode=None, baseline=None, baseline_optimizer=None, gae_lambda=None, batched_observe=1000, batch_size=1000, keep_last_timestep=True, likelihood_ratio_clipping=None, learning_rate=1e-3, cg_max_iterations=20, cg_damping=1e-3, cg_unroll_loop=False ): """ Creates a Trust Region Policy Optimization ([Schulman et al., 2015](https://arxiv.org/abs/1502.05477)) agent. Args: states_spec: Dict containing at least one state definition. In the case of a single state, keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state is a dict itself with a unique name as its key. actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions` for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more. network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments such as activation or regularisation. Full examples are in the examples/configs folder. device: Device string specifying model device. session_config: optional tf.ConfigProto with additional desired session configurations scope: TensorFlow scope, defaults to agent name (e.g. `dqn`). saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies if a model is initially loaded (set to True) from a file `file`. summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store summaries, `steps` or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels. distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model` Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow cluster spec. discount: Float specifying reward discount factor. variable_noise: Experimental optional parameter specifying variable noise (NoisyNet). states_preprocessing_spec: Optional list of states preprocessors to apply to state (e.g. `image_resize`, `grayscale`). explorations_spec: Optional dict specifying action exploration type (epsilon greedy or Gaussian noise). reward_preprocessing_spec: Optional dict specifying reward preprocessing. distributions_spec: Optional dict specifying action distributions to override default distribution choices. Must match action names. entropy_regularization: Optional positive float specifying an entropy regularization value. baseline_mode: String specifying baseline mode, `states` for a separate baseline per state, `network` for sharing parameters with the training network. baseline: Optional dict specifying baseline type (e.g. `mlp`, `cnn`), and its layer sizes. Consult examples/configs for full example configurations. baseline_optimizer: Optional dict specifying an optimizer and its parameters for the baseline following the same conventions as the main optimizer. 
gae_lambda: Optional float specifying lambda parameter for generalized advantage estimation. batched_observe: Optional int specifying how many observe calls are batched into one session run. Without batching, throughput will be lower because every `observe` triggers a session invocation to update rewards in the graph. batch_size: Int specifying number of samples collected via `observe` before an update is executed. keep_last_timestep: Boolean flag specifying whether last sample is kept, default True. likelihood_ratio_clipping: Optional clipping of likelihood ratio between old and new policy. learning_rate: Learning rate which may be interpreted differently according to optimizer, e.g. a natural gradient optimizer interprets the learning rate as the max kl-divergence between old and updated policy. cg_max_iterations: Int > 0 specifying conjugate gradient iterations, typically 10-20 are sufficient to find effective approximate solutions. cg_damping: Conjugate gradient damping value to increase numerical stability. cg_unroll_loop: Boolean indicating whether loop unrolling in TensorFlow is to be used which seems to impact performance negatively at this point, default False. """ if network_spec is None: raise TensorForceError("No network_spec provided.") self.optimizer = dict( type='optimized_step', optimizer=dict( type='natural_gradient', learning_rate=learning_rate, cg_max_iterations=cg_max_iterations, cg_damping=cg_damping, cg_unroll_loop=cg_unroll_loop, ), ls_max_iterations=10, ls_accept_ratio=0.9, ls_mode='exponential', ls_parameter=0.5, ls_unroll_loop=False ) self.network_spec = network_spec self.device = device self.session_config = session_config self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.variable_noise = variable_noise self.states_preprocessing_spec = states_preprocessing_spec self.explorations_spec = explorations_spec self.reward_preprocessing_spec = reward_preprocessing_spec self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.baseline_mode = baseline_mode self.baseline = baseline self.baseline_optimizer = baseline_optimizer self.gae_lambda = gae_lambda self.likelihood_ratio_clipping = likelihood_ratio_clipping super(TRPOAgent, self).__init__( states_spec=states_spec, actions_spec=actions_spec, batched_observe=batched_observe, batch_size=batch_size, keep_last_timestep=keep_last_timestep )
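# Hedged sketch (not part of the source): instantiating the TRPO agent with a state baseline and
# generalized advantage estimation. Import path, layer sizes and hyperparameters are assumptions.
from tensorforce.agents import TRPOAgent

trpo_agent = TRPOAgent(
    states_spec=dict(shape=(8,), type='float'),
    actions_spec=dict(type='int', num_actions=4),
    network_spec=[dict(type='dense', size=64), dict(type='dense', size=64)],
    batch_size=1000,
    learning_rate=1e-2,  # interpreted by the natural gradient step as a max KL divergence
    baseline_mode='states',
    baseline=dict(type='mlp', sizes=[64, 64]),
    baseline_optimizer=dict(type='multi_step', optimizer=dict(type='adam', learning_rate=1e-3), num_steps=5),
    gae_lambda=0.97
)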
def __init__( self, states_spec, actions_spec, network_spec, device=None, scope='dqn', saver_spec=None, summary_spec=None, distributed_spec=None, optimizer=None, discount=0.99, normalize_rewards=False, variable_noise=None, distributions_spec=None, entropy_regularization=None, target_sync_frequency=10000, target_update_weight=1.0, double_q_model=False, huber_loss=None, preprocessing=None, exploration=None, reward_preprocessing=None, batched_observe=1000, batch_size=32, memory=None, first_update=10000, update_frequency=4, repeat_update=1 ): """ Creates a Deep-Q agent. Args: states_spec: actions_spec: network_spec: device: scope: saver_spec: summary_spec: distributed_spec: optimizer: discount: normalize_rewards: variable_noise: distributions_spec: entropy_regularization: target_sync_frequency: target_update_weight: double_q_model: huber_loss: preprocessing: exploration: reward_preprocessing: batched_observe: batch_size: memory: first_update: update_frequency: repeat_update: """ if network_spec is None: raise TensorForceError("No network_spec provided.") if optimizer is None: self.optimizer = dict( type='adam', learning_rate=1e-3 ) else: self.optimizer = optimizer if memory is None: memory = dict( type='replay', capacity=100000 ) else: self.memory = memory self.network_spec = network_spec self.device = device self.scope = scope self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec self.discount = discount self.normalize_rewards = normalize_rewards self.variable_noise = variable_noise self.distributions_spec = distributions_spec self.entropy_regularization = entropy_regularization self.target_sync_frequency = target_sync_frequency self.target_update_weight = target_update_weight self.double_q_model = double_q_model self.huber_loss = huber_loss super(DQNAgent, self).__init__( states_spec=states_spec, actions_spec=actions_spec, preprocessing=preprocessing, exploration=exploration, reward_preprocessing=reward_preprocessing, batched_observe=batched_observe, batch_size=batch_size, memory=memory, first_update=first_update, update_frequency=update_frequency, repeat_update=repeat_update )
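# Hedged sketch (not part of the source): instantiating the DQN agent with an explicit replay
# memory and target-network settings. Import path and all values are assumptions for illustration.
from tensorforce.agents import DQNAgent

dqn_agent = DQNAgent(
    states_spec=dict(shape=(4,), type='float'),
    actions_spec=dict(type='int', num_actions=2),
    network_spec=[dict(type='dense', size=64), dict(type='dense', size=64)],
    memory=dict(type='replay', capacity=100000),
    batch_size=32,
    first_update=10000,
    update_frequency=4,
    target_sync_frequency=10000,
    double_q_model=True,
    huber_loss=1.0
)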
def tf_apply(self, x, update): if self.beta_learn: self.beta = tf.get_variable( name='beta', shape=(), dtype=tf.float32, initializer=tf.ones_initializer() ) if self.max is not None: x = tf.minimum(x=(self.beta * x), y=self.max) if self.min is not None: x = tf.maximum(x=(self.beta * x), y=self.min) if self.name == 'elu': x = tf.nn.elu(features=(self.beta * x)) elif self.name == 'none': x = tf.identity(input=(self.beta * x)) elif self.name == 'relu': x = tf.nn.relu(features=(self.beta * x)) if 'relu' in self.summary_labels: non_zero = tf.cast(x=tf.count_nonzero(input_tensor=x), dtype=tf.float32) size = tf.cast(x=tf.reduce_prod(input_tensor=tf.shape(input=x)), dtype=tf.float32) tf.contrib.summary.scalar(name='relu', tensor=(non_zero / size)) elif self.name == 'selu': # https://arxiv.org/pdf/1706.02515.pdf x = tf.nn.selu(features=(self.beta * x)) elif self.name == 'sigmoid': x = tf.sigmoid(x=(self.beta * x)) elif self.name == 'swish': # https://arxiv.org/abs/1710.05941 x = tf.sigmoid(x=(self.beta * x)) * x elif self.name == 'lrelu' or self.name == 'leaky_relu': if self.alpha is None: # Default alpha value for leaky_relu self.alpha = 0.2 x = tf.nn.leaky_relu(features=(self.beta * x), alpha=self.alpha) elif self.name == 'crelu': x = tf.nn.crelu(features=(self.beta * x)) elif self.name == 'softmax': x = tf.nn.softmax(logits=(self.beta * x)) elif self.name == 'softplus': x = tf.nn.softplus(features=(self.beta * x)) elif self.name == 'softsign': x = tf.nn.softsign(features=(self.beta * x)) elif self.name == 'tanh': x = tf.nn.tanh(x=(self.beta * x)) else: raise TensorForceError('Invalid non-linearity: {}'.format(self.name)) if 'beta' in self.summary_labels: tf.contrib.summary.scalar(name='beta', tensor=self.beta) return x
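# Hedged sketch (not part of the source): nonlinearity layer specs matching the branches above,
# as they might appear inside a network spec. The spec keys ('name', 'alpha') mirror the
# attributes read in tf_apply; whether and how a learnable beta is requested is an assumption.
example_nonlinearities = [
    dict(type='nonlinearity', name='relu'),
    dict(type='nonlinearity', name='leaky_relu', alpha=0.2),
    dict(type='nonlinearity', name='swish'),
    dict(type='nonlinearity', name='tanh')
]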
def main(): parser = argparse.ArgumentParser() parser.add_argument('--mode', help="ID of the game mode") parser.add_argument('--hide', dest='hide', action='store_const', const=True, default=False, help="Hide output window") parser.add_argument('-a', '--agent-config', help="Agent configuration file") parser.add_argument('-n', '--network-spec', default=None, help="Network specification file") parser.add_argument('-e', '--episodes', type=int, default=50000, help="Number of episodes") parser.add_argument('-t', '--max-timesteps', type=int, default=2000, help="Maximum number of timesteps per episode") parser.add_argument('-s', '--save', help="Save agent to this dir") parser.add_argument('-se', '--save-episodes', type=int, default=100, help="Save agent every x episodes") parser.add_argument('-l', '--load', help="Load agent from this dir") parser.add_argument('-D', '--debug', action='store_true', default=False, help="Show debug outputs") args = parser.parse_args() logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) # configurable!!! environment = MazeExplorer(mode_id=args.mode, visible=not args.hide) if args.agent_config is not None: with open(args.agent_config, 'r') as fp: agent_config = json.load(fp=fp) else: raise TensorForceError("No agent configuration provided.") if args.network_spec is not None: with open(args.network_spec, 'r') as fp: network_spec = json.load(fp=fp) else: network_spec = None logger.info("No network configuration provided.") agent = Agent.from_spec(spec=agent_config, kwargs=dict(states_spec=environment.states, actions_spec=environment.actions, network_spec=network_spec)) if args.load: load_dir = os.path.dirname(args.load) if not os.path.isdir(load_dir): raise OSError( "Could not load agent from {}: No such directory.".format( load_dir)) agent.restore_model(args.load) if args.debug: logger.info("-" * 16) logger.info("Configuration:") logger.info(agent_config) if args.save: save_dir = os.path.dirname(args.save) if not os.path.isdir(save_dir): try: os.mkdir(save_dir, 0o755) except OSError: raise OSError( "Cannot save agent to dir {} ()".format(save_dir)) runner = Runner(agent=agent, environment=environment, repeat_actions=1) report_episodes = args.episodes // 1000 if args.debug: report_episodes = 1 def episode_finished(r): if r.episode % report_episodes == 0: sps = r.timestep / (time.time() - r.start_time) logger.info( "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}" .format(ep=r.episode, ts=r.timestep, sps=sps)) logger.info("Episode reward: {}".format(r.episode_rewards[-1])) logger.info("Average of last 500 rewards: {}".format( sum(r.episode_rewards[-500:]) / 500)) logger.info("Average of last 100 rewards: {}".format( sum(r.episode_rewards[-100:]) / 100)) return True logger.info("Starting {agent} for Environment '{env}'".format( agent=agent, env=environment)) runner.run(args.episodes, args.max_timesteps, episode_finished=episode_finished) runner.close() logger.info( "Learning finished. Total episodes: {ep}".format(ep=runner.episode)) environment.close()