def __init__(self, env, env_type, stochastic):
    """
    Constructor that uses the environment to construct the policy network and then
    builds the agent model around it.

    Parameters
    ----------
    env : gym.Env
        The environment the agent needs to interact with.
    env_type : str
        The type of environment.
    stochastic : bool
        Whether the agent samples its actions stochastically instead of acting
        deterministically.
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    self.stochastic = stochastic

    # pick the policy builder that matches the environment type
    if env_type == 'atari':
        policy = build_policy(env, 'cnn')
    elif env_type == "ChessWrapper":
        policy = build_policy(env, 'mlp', {'num_layers': 5})
    else:
        policy = build_policy(env, 'mlp')

    # construct the agent model from the policy builder
    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=1, nbatch_train=1, nsteps=1,
                               ent_coef=0., vf_coef=0., max_grad_norm=0.)
    self.model = make_model()
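# A minimal sketch (not part of the original class) of how an agent built this way is
# typically queried for actions; the method name `act` and its signature are assumptions.
# baselines' ppo2 Model exposes `step`, which returns (actions, values, states, neglogpacs).
def act(self, observation):
    # step expects a batch of observations and samples an action from the policy
    actions, values, states, neglogpacs = self.model.step(observation)
    return actions[0]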
def __init__(self, env, env_type, path, stochastic=False, gpu=True): from baselines.common.policies import build_policy from baselines.ppo2.model import Model self.graph = tf.Graph() if gpu: config = tf.ConfigProto() config.gpu_options.allow_growth = True else: config = tf.ConfigProto(device_count={'GPU': 0}) self.sess = tf.Session(graph=self.graph, config=config) with self.graph.as_default(): with self.sess.as_default(): if isinstance(env.observation_space, gym.spaces.Dict): ob_space = env.observation_space.spaces['ob_flattened'] else: ob_space = env.observation_space ac_space = env.action_space if env_type == 'atari': policy = build_policy(env, 'cnn') elif env_type in ['mujoco', 'robosuite']: policy = build_policy(env, 'mlp') else: assert False, ' not supported env_type' make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=1, nsteps=1, ent_coef=0., vf_coef=0., max_grad_norm=0.) self.model = make_model() self.model_path = path self.model.load(path) if env_type in ['mujoco', 'robosuite']: with open(path + '.env_stat.pkl', 'rb') as f: import pickle s = pickle.load(f) self.ob_rms = s['ob_rms'] #self.ret_rms = s['ret_rms'] self.clipob = 10. self.epsilon = 1e-8 else: self.ob_rms = None self.stochastic = stochastic
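# Hedged sketch (not in the original class): how the restored `ob_rms` statistics are
# typically applied to raw observations before calling the model, mirroring baselines'
# VecNormalize. The method name `preprocess` is an assumption; `ob_rms` is expected to be a
# RunningMeanStd object with `.mean` and `.var`, and numpy is assumed to be imported as np.
def preprocess(self, obs):
    if self.ob_rms is None:
        return obs
    return np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                   -self.clipob, self.clipob)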
def main(network, env, **network_kwargs): policy = build_policy(env, network, **network_kwargs) # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space nenvs = env.num_envs # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path)
def init_wrapper(environment, network_type, number_steps, entropy_coefficient, vf_coefficient, gradient_clipping, learning_rate, alpha, epsilon, total_timesteps, learning_rate_schedule='constant', **network_kwargs): policy = build_policy(environment, network_type, **network_kwargs) model = Model(policy=policy, env=environment, nsteps=number_steps, ent_coef=entropy_coefficient, vf_coef=vf_coefficient, max_grad_norm=gradient_clipping, lr=learning_rate, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=learning_rate_schedule) return {'policy': policy, 'model': model}
def test(config, env): ob_space = env.observation_space ac_space = env.action_space tf.reset_default_graph() gpu_opts = tf.GPUOptions(allow_growth=True) tf_config = tf.ConfigProto( inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_opts, ) with tf.Session(config=tf_config) as sess: policy = build_policy(env, 'cnn', estimate_q=True) model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=config.number_of_environments, nsteps=config.number_of_steps, ent_coef=config.entropy_weight, q_coef=config.critic_weight, gamma=config.discount_factor, max_grad_norm=config.max_grad_norm, lr=config.learning_rate, rprop_alpha=config.rmsp_decay, rprop_epsilon=config.rmsp_epsilon, total_timesteps=config.timesteps, lrschedule='linear', c=config.clipping_factor, trust_region=True, alpha=config.momentum, delta=config.trust_region_delta) tf_util.load_variables(config.load_path, sess=sess) return make_rollouts(config, env, model)
def test_lstm_example(): import tensorflow as tf from baselines.common import policies, models, cmd_util from baselines.common.vec_env.dummy_vec_env import DummyVecEnv # create vectorized environment venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) with tf.Session() as sess: # build policy based on lstm network with 128 units policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) # initialize tensorflow variables sess.run(tf.global_variables_initializer()) # prepare environment variables ob = venv.reset() state = policy.initial_state done = [False] step_counter = 0 # run a single episode until the end (i.e. until done) while True: action, _, state, _ = policy.step(ob, S=state, M=done) ob, reward, done, _ = venv.step(action) step_counter += 1 if done: break assert step_counter > 5
def test(config, env): ob_space = env.observation_space ac_space = env.action_space tf.reset_default_graph() gpu_opts = tf.GPUOptions(allow_growth=True) tf_config = tf.ConfigProto( inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_opts, ) with tf.Session(config=tf_config) as sess: nenvs = env.num_envs nbatch = nenvs * config.number_of_steps nbatch_train = nbatch // 4 policy = build_policy(env, 'cnn') model = Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=config.number_of_steps, ent_coef=config.entropy_weight, vf_coef=config.critic_weight, max_grad_norm=config.max_grad_norm, comm=None, mpi_rank_weight=1 ) model.load(config.load_path) return make_rollouts(config, env, model)
def test(config, env): ob_space = env.observation_space ac_space = env.action_space tf.reset_default_graph() gpu_opts = tf.GPUOptions(allow_growth=True) tf_config = tf.ConfigProto( inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_opts, ) with tf.Session(config=tf_config) as sess: config.batch_size = 2 config.number_of_steps = 2 policy = build_policy(env, 'cnn') model = Model(policy=policy, env=env, nsteps=config.number_of_steps, ent_coef=config.entropy_weight, vf_coef=config.critic_weight, max_grad_norm=config.max_grad_norm, lr=config.learning_rate, alpha=config.rmsp_decay, epsilon=config.discount_factor, total_timesteps=config.timesteps, lrschedule='linear') model.load(config.load_path) return make_rollouts(config, env, model)
def play(): env_args = dict() network_kwargs = dict(nlstm=512) # create vectorized environment pysc2_env_vec = SubprocVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)]) policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs) nenvs = pysc2_env_vec.num_envs # Calculate the batch_size nsteps=256 nminibatches=1 nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches ent_coef=0.0 vf_coef=0.5 max_grad_norm=0.5 make_model = lambda : ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model = make_model() model.load("2170_ppo_cnn_lstm_512_easy") ob = pysc2_env_vec.reset() state = model.initial_state done = [False] step_counter = 0 # run a single episode until the end (i.e. until done) while True: #print(step_counter) action, _, state, _ = model.step(ob, S=state, M=done) ob, reward, done, _ = pysc2_env_vec.step(action) step_counter += 1
def save_lucid_model(config, params, *, model_path, metadata_path): config = config.copy() config.pop("num_envs") library = config.get("library", "baselines") venv = create_env(1, **config) arch = get_arch(**config) with tf.Graph().as_default(), tf.Session() as sess: observation_space = venv.observation_space observations_placeholder = tf.placeholder(shape=(None, ) + observation_space.shape, dtype=tf.float32) if library == "baselines": from baselines.common.policies import build_policy with tf.variable_scope("ppo2_model", reuse=tf.AUTO_REUSE): policy_fn = build_policy(venv, arch) policy = policy_fn( nbatch=None, nsteps=1, sess=sess, observ_placeholder=(observations_placeholder * 255), ) pd = policy.pd vf = policy.vf else: raise ValueError(f"Unsupported library: {library}") load_params(params, sess=sess) Model.save( model_path, input_name=observations_placeholder.op.name, output_names=[pd.logits.op.name, vf.op.name], image_shape=observation_space.shape, image_value_range=[0.0, 1.0], ) metadata = { "policy_logits_name": pd.logits.op.name, "value_function_name": vf.op.name, "env_name": config.get("env_name"), "gae_gamma": config.get("gamma"), "gae_lambda": config.get("lambda"), } env = venv while hasattr(env, "env") and (not hasattr(env, "combos")): env = env.env if hasattr(env, "combos"): metadata["action_combos"] = env.combos else: metadata["action_combos"] = None save_joblib(metadata, metadata_path) return { "model_bytes": read(model_path, cache=False, mode="rb"), **metadata }
def create_policy(env, network, value_network='copy', **network_kwargs): policy_fn = build_policy(env, network, value_network, **network_kwargs) ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) return policy_fn(observ_placeholder=ob)
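# Illustrative use of create_policy above (a sketch, not from the original code; the
# environment, network choice and keyword arguments are assumptions):
import gym
import tensorflow as tf
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

with tf.Session() as sess:
    venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    pi = create_policy(venv, 'mlp', value_network='copy', num_layers=2, num_hidden=64)
    sess.run(tf.global_variables_initializer())
    actions, values, states, neglogpacs = pi.step(venv.reset())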
def create_model(network, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
                 max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99,
                 gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000,
                 c=10.0, trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs):
    set_global_seeds(seed)

    env = HaliteEnv()
    # if not isinstance(env, VecFrameStack):
    #     env = VecFrameStack(env, env.nstack)
    # network = 'halite_net'  # not yet, for now let's pretend the halite layer is the only input

    policy = build_policy(env, network, estimate_q=True, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nstack = env.nstack

    model_params = {
        "policy": policy, "ob_space": ob_space, "ac_space": ac_space, "nenvs": nenvs,
        "nsteps": nsteps, "ent_coef": ent_coef, "q_coef": q_coef, "gamma": gamma,
        "max_grad_norm": max_grad_norm, "lr": lr, "rprop_alpha": rprop_alpha,
        "rprop_epsilon": rprop_epsilon, "total_timesteps": total_timesteps,
        "lrschedule": lrschedule, "c": c, "trust_region": trust_region,
        "alpha": alpha, "delta": delta
    }
    model = Model(**model_params)

    return env, policy, nenvs, ob_space, ac_space, nstack, model
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, **network_kwargs): set_global_seeds(seed) if network == 'cnn': network_kwargs['one_dim_bias'] = True policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) nbatch = nenvs*nsteps tstart = time.time() coord = tf.train.Coordinator() enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) for update in range(1, total_timesteps//nbatch+1): obs, states, rewards, masks, actions, values = runner.run() policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) model.old_obs = obs nseconds = time.time()-tstart fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("eprewmean", np.nan if len(rewards) == 0 else np.mean(rewards)) logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) print('Saving to', savepath) model.save(savepath) coord.request_stop() coord.join(enqueue_threads) return model
def learn(
        network,
        env,
        save_path,
        seed=None,
        nsteps=10,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        # log_interval=100,
        log_interval=10,
        load_path=None,
        **network_kwargs):
    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    set_global_seeds(seed)
    assert save_path is not None

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)

    # Calculate the batch_size (nsteps is set to 1 here)
    nbatch = nenvs * nsteps

    observation = []
    action = []
    for update in range(1, total_timesteps // nbatch + 1):
        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, output = runner.run()
        observation.append(obs)
        print('times', update)

    obs = np.concatenate(observation)
    # compute fisher matrix
    FM = model.compute_fisher(obs, plot_diffs=True, disp_freq=10)
    # FM = model.compute_exact_fisher(obs, plot_diffs=True, disp_freq=10)
    joblib.dump(FM, save_path)
def main(): def make_env(): obs_type = retro.Observations.IMAGE # retro.Observations.RAM env = retro.make(game='Pitfall-Atari2600', state=retro.State.DEFAULT, scenario='scenario', record='.', players=1, obs_type=obs_type) env = wrap_deepmind_retro(env) return env base_dirname = os.path.join(currentdir, "results") #dir_name = "pitfall_ppo2_rl_baseline1" dir_name = "pitfall_ppo2testing_D191211_073544" dir_name = os.path.join(base_dirname, dir_name) load_path = os.path.join(dir_name, 'models/00781') venv = SubprocVecEnv([make_env] * 1) #Vectorized network = 'cnn' policy = build_policy(venv, network) nenvs = venv.num_envs # Get the nb of env # Get state_space and action_space ob_space = venv.observation_space ac_space = venv.action_space # Instantiate the model object model_fn = Model nsteps = 2048 nbatch = nenvs * nsteps nminibatches = 4 nbatch_train = nbatch // nminibatches model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=2048, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5) model.load(load_path) # Instantiate the runner object runner = Runner(env=venv, model=model, nsteps=nsteps, gamma=0.99, lam=0.95) # run the Runner and record video total_timesteps = int(1e4) nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): print("progress: ", update, "/", nupdates) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( )
def main(): numOfTests = 40 env_args = { 'episode_life': False, 'clip_rewards': False, 'crop': True, 'rotate': True } env = VecFrameStack( make_vec_env("gvgai-zelda-lvl0-v0", numOfTests, 43, wrapper_kwargs=env_args), 4) policy = build_policy(env, "cnn") model = Model(policy=policy, env=env, nsteps=5) model.load('logs/test_4*5_r1_right/checkpoints/260000') nh, nw, nc = env.observation_space.shape result = dict() for j in range(201, 601): # obs = np.zeros((numOfTests, nh, nw, nc), dtype=np.uint8) done = np.array([False] * numOfTests) env.venv.set_level( "GVGAI_GYM/gym_gvgai/envs/games/zelda_v0/zelda_lvl{}.txt".format( j)) obs = env.reset() infos = [False] * numOfTests # dones = [False] * numOfTests while not all(done): actions, values, state, _ = model.step(obs) obs, rewards, dones, info = env.step(actions) done[np.where(dones != False)] = True for i in np.where(dones != False)[0].tolist(): if not infos[i]: # print(info) del info[i]["grid"] del info[i]["ascii"] infos[i] = info[i] # print(np.where(dones!=False)[0]) # print(done) # print(infos) # print(dones) win = [1 if (i['winner'] == 'PLAYER_WINS') else 0 for i in infos] # score = [i['episode']['r'] for i in infos] # steps = [i['episode']['l'] for i in infos] # time = [i['episode']['t'] for i in infos] print("level {}".format(j), win) result[j] = infos env.close() with open("result_4*5_r1_right_200~600", "wb") as f: pickle.dump(result, f)
def __init__(self, env, env_type, stochastic=False): ob_space = env.observation_space ac_space = env.action_space if env_type == 'atari': policy = build_policy(env, 'cnn') elif env_type == 'mujoco': policy = build_policy(env, 'mlp') make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=1, nsteps=1, ent_coef=0., vf_coef=0., max_grad_norm=0.) self.model = make_model() self.stochastic = stochastic
def make_leg_model(leg, env): leg_env = gym.make('PhantomXLeg-v0') leg_env.set_info(env.info) leg_env.leg_name = leg policy = build_policy(leg_env, defaults['network'], **alg_kwargs) model = ppo2.Model(policy=policy, ob_space=leg_env.observation_space, ac_space=leg_env.action_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=defaults['nsteps'], ent_coef=defaults['ent_coef'], vf_coef=defaults['vf_coef'], max_grad_norm=defaults['max_grad_norm']) model.load('' + leg + '/checkpoints/05000') return model
def get_step_fn(config, params, *, num_envs, full_resolution): config = config.copy() config.pop("num_envs") library = config.get("library", "baselines") venv = create_env(num_envs, **config) arch = get_arch(**config) with tf.Graph().as_default(), tf.Session() as sess: if library == "baselines": from baselines.common.policies import build_policy with tf.variable_scope("ppo2_model", reuse=tf.AUTO_REUSE): policy_fn = build_policy(venv, arch) policy = policy_fn(nbatch=venv.num_envs, nsteps=1, sess=sess) stepdata = { "ob": venv.reset(), "state": policy.initial_state, "first": np.ones((venv.num_envs, ), bool), } if full_resolution: stepdata["ob_full"] = np.stack( [info["rgb"] for info in venv.env.get_info()], axis=0) def step_fn(): result = { "ob": stepdata["ob"], "first": stepdata["first"].astype(bool) } if full_resolution: result["ob_full"] = stepdata["ob_full"] result["ac"], _, stepdata["state"], _ = policy.step( stepdata["ob"], S=stepdata["state"], M=stepdata["first"].astype(float), ) ( stepdata["ob"], result["reward"], stepdata["first"], result["info"], ) = venv.step(result["ac"]) if full_resolution: stepdata["ob_full"] = np.stack( [info["rgb"] for info in result["info"]], axis=0) return result else: raise ValueError(f"Unsupported library: {library}") load_params(params, sess=sess) yield step_fn
def train(args,extra_args): env_type, env_id = get_env_type(args) print('env_type: {}'.format(env_type)) total_timesteps = int(args.num_timesteps) seed = args.seed set_global_seeds(seed) #workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() #set_global_seeds(workerseed) learn = get_learn_function(args.alg) alg_kwargs = get_learn_function_defaults(args.alg, env_type) alg_kwargs.update(extra_args) env = build_env(args,normalize_ob=False,normalize_ret=False) if args.save_video_interval != 0: env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"), record_video_trigger=lambda x: x % args.save_video_interval == 0, video_length=args.save_video_length) if args.network: alg_kwargs['network'] = args.network else: if alg_kwargs.get('network') is None: alg_kwargs['network'] = get_default_network(env_type) #timesteps_per_batch=1024 #timesteps_per_batch=2048 beta = -1 if beta < 0: #print(alg_kwargs) nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch'] # Automatically compute beta based on initial entropy and number of iterations policy = build_policy(env, alg_kwargs['network'], value_network='copy', normalize_observations=alg_kwargs['normalize_observations'], copos=True) ob = observation_placeholder(env.observation_space) sess = U.single_threaded_session() sess.__enter__() with tf.variable_scope("tmp_pi"): tmp_pi = policy(observ_placeholder=ob) sess.run(tf.global_variables_initializer()) tmp_ob = np.zeros((1,) + env.observation_space.shape) entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob}) #beta = 2 * entropy / nr_episodes beta = 0 print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes)) print("Constantly set beta: " + str(beta)) print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs)) model=learn(env=env, seed=seed, beta=beta, total_timesteps=total_timesteps, **alg_kwargs) return model, env
def __init__(self, checkpoint_path): player_base.PlayerBase.__init__(self) self._action_set = 'default' self._player_prefix = 'player_0' config = tf.ConfigProto() config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) with tf.variable_scope(self._player_prefix): with tf.variable_scope('ppo2_model'): policy_fn = build_policy(DummyEnv(self._action_set), 'mlp', num_layers=5, num_hidden=128) self._policy = policy_fn(nbatch=1, sess=self._sess) _load_variables(checkpoint_path, self._sess, prefix=self._player_prefix + '/') saver = tf.train.Saver() saver.save(self._sess, "/home/alex/Dropbox/projects/python/kaggle/football/saved_models/simple_ppo2/simple_ppo2")
def learn(
    *,
    network,
    env,
    total_timesteps,
    seed=None,
    **network_kwargs,
):
    # set up a runnable policy
    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    ob_space = env.observation_space
    ac_space = env.action_space

    # initialize the gradient descent policy directly
    return GradientDescent(ob_space, ac_space)
def test(env, load_path, img_path, display_steps=500): with tf.Session() as sess: policy = build_policy(env, a2c_discrete_cnn) with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): model = policy(1, 1, sess) tf_util.load_variables(load_path, sess=sess) def display_actor(obs): actions = model.step([obs])[0] return actions[0] if img_path is None: show(env, display_actor, display_steps) else: save_images(env, display_actor, display_steps, img_path, 'img_')
def __init__(self, player_config, env_config): player_base.PlayerBase.__init__(self, player_config) self._action_set = 'default' config = tf.ConfigProto() config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) self._player_prefix = 'player_{}'.format(player_config['index']) stacking = 4 if player_config.get('stacked', True) else 1 policy = player_config.get('policy', 'cnn') self._stacker = ObservationStacker(stacking) with tf.variable_scope(self._player_prefix): with tf.variable_scope('ppo2_model'): policy_fn = build_policy(DummyEnv(self._action_set, stacking), policy) self._policy = policy_fn(nbatch=1, sess=self._sess) _load_variables(player_config['checkpoint'], self._sess, prefix=self._player_prefix + '/')
def demonstrate(network, env, nsteps, mvs, load_path, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, mpi_rank_weight=1, comm=None, gamma=0.99, lam=0.95): policy = build_policy(env, network) model = Model(policy=policy, nbatch_act=1, nbatch_train=None, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) print('Model has been successfully loaded from {0}'.format(load_path)) else: print( 'No model has been loaded. Neural network with random weights is used.' ) # Instantiate the runner object and episode buffer runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, mvs=mvs) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( render=True) print('Demo completed! Reward: {0}'.format(epinfos[0]['r'])) print('\nPress Ctrl+C to stop the demo...')
def prelearn(network, env, trainX, trainY, testX, testY, seed=None, lr=3e-4): set_global_seeds(seed) policy = build_policy(env, network) # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Instantiate the model object (that creates act_model and train_model) from baselines.ppo2.model import Model model_fn = Model batch_size = 128 ndata = len(trainX) nepochs = 10 # Set up model with some dummy arguments model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1, nbatch_train=batch_size, nsteps=8192, ent_coef=0.00, vf_coef=0.03, max_grad_norm=0.5) # train for _ in range(nepochs): for start in range(0, ndata, batch_size): end = start + batch_size obs = trainX[start:end] actions = trainY[start:end] model.pretrain(obs, actions, lr) # validate with MSE pred_actions = [] for o in testX: pred_actions.append(model.evaluate(o)) sse = 0 for pred_action, action in zip(pred_actions, testY): sse += (action[0] - float(pred_action[0]))**2 mse = sse / len(pred_actions) print(type(mse)) print("Validation loss (mse): " + str(mse)) logdir = logger.get_dir() model.save(osp.join(logdir, 'pretrained_model.pkl'))
def runner(leg, env): leg_env = gym.make('PhantomXLeg-v0') leg_env.set_info(env.info) leg_env.leg_name = leg policy = build_policy(leg_env, defaults['network'], **alg_kwargs) model = ppo2.Model(policy=policy, ob_space=leg_env.observation_space, ac_space=leg_env.action_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=defaults['nsteps'], ent_coef=defaults['ent_coef'], vf_coef=defaults['vf_coef'], max_grad_norm=defaults['max_grad_norm']) model.load('' + leg + '/checkpoints/05000') obs = leg_env.reset() ep_reward = 0 rewards = [] episode = 0 step = 0 while True: step += 1 action, value_estimate, next_state, neglogp = model.step(obs) obs, reward, done, _ = leg_env.step(action[0]) ep_reward += reward if done: leg_env.reset() episode += 1 print(step) print(ep_reward) rewards.append(ep_reward) step = 0 ep_reward = 0 if episode >= 100: break f = open(filename, "w+") f.write("Variance: " + str(np.var(rewards))) rewards = np.array(rewards, dtype=float) f.write(",Median: " + str(statistics.median(rewards))) f.write(",Mean: " + str(np.mean(rewards))) f.close() while True: time.sleep(2) print("DONE")
def train(env_id, seed, policy, load_path, num_episodes, frame_skip, no_render): env = make_neyboy_env(env_id, 1, seed, allow_early_resets=True, frame_skip=frame_skip, save_video=True) env = VecFrameStack(env, 4) policy = build_policy(env, policy) ob_space = env.observation_space ac_space = env.action_space ent_coef = .01 vf_coef = 0.5 max_grad_norm = 0.5 model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=env.num_envs, nbatch_train=0, nsteps=0, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model.load(load_path) for _ in range(num_episodes): if not no_render: env.render() observation, done = env.reset(), False if not no_render: env.render() episode_rew = 0 score = 0 while not done: if not no_render: env.render() action, _, _, _ = model.step(observation) observation, reward, done, info = env.step(action) episode_rew += reward score = info[0] print('Episode reward={}, info={}'.format(episode_rew, score))
def load_model(venv, load_path, network, **network_kwargs): policy = build_policy(venv, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(policy=policy, env=venv, nsteps=0, ent_coef=0, vf_coef=0, max_grad_norm=0, lr=0, alpha=0, epsilon=0, total_timesteps=0, lrschedule='linear') model.load(load_path) return model
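# Possible usage of load_model above (hedged; the environment, checkpoint path and network
# are assumptions, and the path must point at an existing checkpoint). The returned
# a2c-style Model exposes `step`, which can drive a rollout:
# venv = VecFrameStack(make_vec_env('BreakoutNoFrameskip-v4', 'atari', num_env=1, seed=0), 4)
# model = load_model(venv, 'checkpoints/00100', 'cnn')
# obs = venv.reset()
# actions, values, states, neglogpacs = model.step(obs)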
def __init__(self, player_config, env_config): player_base.PlayerBase.__init__(self, player_config) self._action_set = (env_config['action_set'] if 'action_set' in env_config else 'default') config = tf.ConfigProto() config.gpu_options.allow_growth = True self._sess = tf.Session(config=config) self._player_prefix = 'player_{}'.format(player_config['index']) with tf.variable_scope(self._player_prefix): with tf.variable_scope('ppo2_model'): policy_fn = build_policy(DummyEnv(self._action_set), 'mlp', num_layers=5, num_hidden=128) self._policy = policy_fn(nbatch=1, sess=self._sess) _load_variables(player_config['checkpoint'], self._sess, prefix=self._player_prefix + '/')
def learn( network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make random number sequence in the algorithm reproducible. By default is None which means seed from system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs*nsteps # Start total timer tstart = time.time() for update in range(1, total_timesteps//nbatch+1): # Get mini batch of experiences obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) nseconds = time.time()-tstart # Calculate the fps (frame per second) fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: # Calculates whether the value function is a good predictor of the returns (ev close to 1) # or if it's just worse than predicting nothing (ev <= 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
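# Illustrative call of the A2C learn() entrypoint above (a sketch; the environment id and
# hyperparameters are assumptions, not taken from the original code):
from baselines.common.cmd_util import make_vec_env
venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
model = learn(network='cnn', env=venv, seed=0, nsteps=5, total_timesteps=int(1e6))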
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates whether the value function is a good predictor of the returns (ev close to 1) # or if it's just worse than predicting nothing (ev <= 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) return model
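# Illustrative call of the PPO2 learn() entrypoint above (a sketch; the environment id and
# hyperparameters are assumptions, not taken from the original code):
from baselines.common.cmd_util import make_vec_env
venv = make_vec_env('CartPole-v1', 'classic_control', num_env=4, seed=0)
model = learn(network='mlp', env=venv, total_timesteps=int(1e5), nsteps=128,
              nminibatches=4, lr=3e-4, cliprange=0.2)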
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1, load_path=None, **network_kwargs): ''' Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) Train an agent with given network architecture on a given environment using ACER. Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) (default: 20) nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension (last image dimension) (default: 4) total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. 
Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) rprop_alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting factor (default: 0.99) log_interval: int, number of updates between logging events (default: 100) buffer_size: int, size of the replay buffer (default: 50k) replay_ratio: int, how many (on average) batches of data to sample from the replay buffer per batch taken from the environment (default: 4) replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k) c: float, importance weight clipping factor (default: 10) trust_region: bool, whether or not the algorithm estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True) delta: float, max KL divergence between the old policy and updated policy (default: 1) alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) load_path: str, path to load the model from (default: None) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' print("Running Acer Simple") print(locals()) set_global_seeds(seed) if not isinstance(env, VecFrameStack): env = VecFrameStack(env, 1) policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nstack = env.nstack model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta) runner = Runner(env=env, model=model, nsteps=nsteps) if replay_ratio > 0: buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size) else: buffer = None nbatch = nenvs*nsteps acer = Acer(runner, model, buffer, log_interval) acer.tstart = time.time() for acer.steps in range(0, total_timesteps, nbatch): #nbatch samples, 1 on_policy call and multiple off-policy calls acer.call(on_policy=True) if replay_ratio > 0 and buffer.has_atleast(replay_start): n = np.random.poisson(replay_ratio) for _ in range(n): acer.call(on_policy=False) # no simulation steps in this return model
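# Illustrative call of the ACER learn() entrypoint above (a sketch; the environment id and
# hyperparameters are assumptions). ACER expects a vectorized, frame-stacked Atari-style
# environment:
from baselines.common.cmd_util import make_vec_env
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
venv = VecFrameStack(make_vec_env('BreakoutNoFrameskip-v4', 'atari', num_env=4, seed=0), 4)
model = learn(network='cnn', env=venv, seed=0, nsteps=20, total_timesteps=int(1e6))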
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class) timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimize value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list)
get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # nothing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before update atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient.
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None, lrschedule='linear', load_path=None, is_async=True, **network_kwargs): set_global_seeds(seed) if network == 'cnn': network_kwargs['one_dim_bias'] = True policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space make_model = lambda : Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs, nsteps =nsteps, ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef= vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip, lrschedule=lrschedule, is_async=is_async) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) nbatch = nenvs*nsteps tstart = time.time() coord = tf.train.Coordinator() if is_async: enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True) else: enqueue_threads = [] for update in range(1, total_timesteps//nbatch+1): obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) model.old_obs = obs nseconds = time.time()-tstart fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("policy_loss", float(policy_loss)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): savepath = osp.join(logger.get_dir(), 'checkpoint%.5i'%update) print('Saving to', savepath) model.save(savepath) coord.request_stop() coord.join(enqueue_threads) return model