def test(config, env):
    ob_space = env.observation_space
    ac_space = env.action_space

    tf.reset_default_graph()
    gpu_opts = tf.GPUOptions(allow_growth=True)
    tf_config = tf.ConfigProto(
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
        gpu_options=gpu_opts,
    )
    with tf.Session(config=tf_config) as sess:
        nenvs = env.num_envs
        nbatch = nenvs * config.number_of_steps
        nbatch_train = nbatch // 4

        policy = build_policy(env, 'cnn')
        model = Model(policy=policy,
                      ob_space=ob_space,
                      ac_space=ac_space,
                      nbatch_act=nenvs,
                      nbatch_train=nbatch_train,
                      nsteps=config.number_of_steps,
                      ent_coef=config.entropy_weight,
                      vf_coef=config.critic_weight,
                      max_grad_norm=config.max_grad_norm,
                      comm=None,
                      mpi_rank_weight=1)
        model.load(config.load_path)

        return make_rollouts(config, env, model)
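# A minimal usage sketch for test() above, assuming config is any object that
# carries the attributes the function reads (number_of_steps, entropy_weight,
# critic_weight, max_grad_norm, load_path). The SimpleNamespace values and the
# checkpoint path are illustrative placeholders, not part of the original code.
from types import SimpleNamespace

def run_test_example(env):
    config = SimpleNamespace(number_of_steps=128,
                             entropy_weight=0.01,
                             critic_weight=0.5,
                             max_grad_norm=0.5,
                             load_path='checkpoints/00100')  # hypothetical path
    return test(config, env)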
def __init__(self, env, env_type, stochastic):
    """
    Construct the policy network from the environment and build the agent.

    Parameters
    ----------
    env : gym.Env
        The environment the agent interacts with.
    env_type : str
        The type of environment.
    stochastic : bool
        Whether the agent behaves stochastically (i.e. samples actions)
        rather than acting deterministically.
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    self.stochastic = stochastic

    # Pick the policy builder that matches the environment type.
    if env_type == 'atari':
        policy = build_policy(env, 'cnn')
    elif env_type == "ChessWrapper":
        policy = build_policy(env, 'mlp', {'num_layers': 5})
    else:
        policy = build_policy(env, 'mlp')

    # Construct the agent model using the policy builder.
    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=1,
                               nbatch_train=1,
                               nsteps=1,
                               ent_coef=0.,
                               vf_coef=0.,
                               max_grad_norm=0.)
    self.model = make_model()
def demonstrate(network, env, nsteps, mvs, load_path,
                ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
                mpi_rank_weight=1, comm=None, gamma=0.99, lam=0.95):
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  nbatch_act=1,
                  nbatch_train=None,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm,
                  comm=comm,
                  mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        print('No model has been loaded. Neural network with random weights is used.')

    # Instantiate the runner object and episode buffer
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, mvs=mvs)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(render=True)
    print('Demo completed! Reward: {0}'.format(epinfos[0]['r']))
    print('\nPress Ctrl+C to stop the demo...')
def __init__(self, env, env_type, path, stochastic=False, gpu=True):
    from baselines.common.policies import build_policy
    from baselines.ppo2.model import Model

    self.graph = tf.Graph()

    if gpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})

    self.sess = tf.Session(graph=self.graph, config=config)

    with self.graph.as_default():
        with self.sess.as_default():
            if isinstance(env.observation_space, gym.spaces.Dict):
                ob_space = env.observation_space.spaces['ob_flattened']
            else:
                ob_space = env.observation_space
            ac_space = env.action_space

            if env_type == 'atari':
                policy = build_policy(env, 'cnn')
            elif env_type in ['mujoco', 'robosuite']:
                policy = build_policy(env, 'mlp')
            else:
                assert False, 'not supported env_type'

            make_model = lambda: Model(policy=policy,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       nbatch_act=1,
                                       nbatch_train=1,
                                       nsteps=1,
                                       ent_coef=0.,
                                       vf_coef=0.,
                                       max_grad_norm=0.)
            self.model = make_model()

            self.model_path = path
            self.model.load(path)

    if env_type in ['mujoco', 'robosuite']:
        with open(path + '.env_stat.pkl', 'rb') as f:
            import pickle
            s = pickle.load(f)
        self.ob_rms = s['ob_rms']
        # self.ret_rms = s['ret_rms']
        self.clipob = 10.
        self.epsilon = 1e-8
    else:
        self.ob_rms = None

    self.stochastic = stochastic
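# Hedged sketch (not in the original snippet): how the ob_rms statistics loaded
# above are typically applied to raw observations before calling the model,
# mirroring the observation filter used by baselines' VecNormalize. The method
# name obs_filter is hypothetical.
import numpy as np

def obs_filter(self, obs):
    if self.ob_rms is None:
        return obs
    # Standardize with the running mean/var saved at training time, then clip.
    return np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                   -self.clipob, self.clipob)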
def __init__(self, env, env_type, stochastic=False):
    ob_space = env.observation_space
    ac_space = env.action_space

    if env_type == 'atari':
        policy = build_policy(env, 'cnn')
    elif env_type == 'mujoco':
        policy = build_policy(env, 'mlp')
    else:
        assert False, 'not supported env_type'

    make_model = lambda: Model(policy=policy,
                               ob_space=ob_space,
                               ac_space=ac_space,
                               nbatch_act=1,
                               nbatch_train=1,
                               nsteps=1,
                               ent_coef=0.,
                               vf_coef=0.,
                               max_grad_norm=0.)
    self.model = make_model()
    self.stochastic = stochastic
def __init__(self, env, env_type, path, stochastic=False, gpu=True):
    from baselines.common.policies import build_policy
    from baselines.ppo2.model import Model

    self.graph = tf.Graph()

    if gpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})

    self.sess = tf.Session(graph=self.graph, config=config)

    with self.graph.as_default():
        with self.sess.as_default():
            ob_space = env.observation_space
            ac_space = env.action_space

            if env_type == 'atari':
                policy = build_policy(env, 'cnn')
            else:
                assert False, 'not supported env_type'

            make_model = lambda: Model(policy=policy,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       nbatch_act=1,
                                       nbatch_train=1,
                                       nsteps=1,
                                       ent_coef=0.,
                                       vf_coef=0.,
                                       max_grad_norm=0.)
            self.model = make_model()

            self.model_path = path
            self.model.load(path)

    self.stochastic = stochastic
def main7():
    retro.data.add_custom_integration("custom")

    def wrap_deepmind_n64(env, reward_scale=1 / 100.0, frame_stack=1, grayscale=False):
        env = MaxAndSkipEnv(env, skip=4)
        env = WarpFrame(env, width=150, height=100, grayscale=grayscale)
        env = FrameStack(env, frame_stack)
        env = ScaledFloatFrame(env)
        env = RewardScaler(env, scale=reward_scale)
        return env

    def make_env():
        retro.data.add_custom_integration("custom")
        env = retro.n64_env.N64Env(game="SuperSmashBros-N64",
                                   use_restricted_actions=retro.Actions.MULTI_DISCRETE,
                                   inttype=retro.data.Integrations.CUSTOM,
                                   obs_type=retro.Observations.IMAGE)
        env = wrap_deepmind_n64(env)
        return env

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    nenvs = 2
    # env = DummyVecEnv([make_env] * nenvs)
    env = SubprocVecEnv([make_env] * nenvs)

    network_name = "impala_cnn_lstm"
    policy = build_policy(env, network_name)
    recurrent = "lstm" in network_name

    ob_space = env.observation_space
    ac_space = env.action_space
    nsteps = 10
    nminibatches = 2
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=0.01,
                  vf_coef=0.5,
                  max_grad_norm=0.5,
                  comm=None,
                  mpi_rank_weight=1)
    runner = Runner(env=env, model=model, nsteps=10, gamma=.99, lam=.95)

    env.reset()
    start = time.time()  # start the wall-clock timer reported at the end
    num_steps = 20000
    action = [np.array([0, 0, 0]), np.array([0, 0, 0])]
    for i in range(num_steps):
        sys.stdout.write(f"\r{i+1} / {num_steps}")
        action = [env.action_space.sample() for _ in range(nenvs)]
        obs, reward, dones, info = env.step(action)
        # env.reset(dones)
        # env.render()
        if i % 50 == 0:
            if recurrent:
                fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 12))
            else:
                fig, axs = plt.subplots(nrows=4, ncols=2, figsize=(20, 12))
            for env_index in range(nenvs):
                if recurrent:
                    axs[env_index].imshow(obs[env_index, :, :, :])
                else:
                    for j in range(4):
                        row = env_index * 2 + j // 2
                        col = j % 2
                        print(row)
                        print(col)
                        axs[row, col].imshow(obs[env_index, :, :, j])
            plt.show()
            plt.close()

    end = time.time()
    print(end - start)
    return env
def __init__(self, env, env_type, nenv=4, batch_size=64, gpu=True):
    from baselines.common.policies import build_policy
    from baselines.ppo2.model import Model

    self.graph = tf.Graph()

    if gpu:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        config = tf.ConfigProto(device_count={'GPU': 0})

    self.sess = tf.Session(graph=self.graph, config=config)

    with self.graph.as_default():
        with self.sess.as_default():
            ob_space = env.observation_space
            ac_space = env.action_space

            if env_type == 'atari':
                policy = build_policy(env, 'cnn')
                target_action = tf.placeholder(tf.int32, [batch_size])
            elif env_type == 'mujoco':
                policy = build_policy(env, 'mlp')
                target_action = tf.placeholder(tf.float32, [batch_size, ac_space.shape[0]])
            else:
                assert False, 'not supported env_type'

            make_model = lambda: Model(policy=policy,
                                       ob_space=ob_space,
                                       ac_space=ac_space,
                                       nbatch_act=nenv,
                                       nbatch_train=batch_size,
                                       nsteps=1,
                                       ent_coef=0.,
                                       vf_coef=0.,
                                       max_grad_norm=0.)
            self.model = make_model()

            self.inp = self.model.train_model.X  # This is also a placeholder
            self.target_action = target_action
            self.ac_logits = self.model.train_model.pi

            if env_type == 'atari':
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.ac_logits, labels=self.target_action)
            elif env_type == 'mujoco':
                # Squared error between predicted action and the target action placeholder.
                loss = tf.reduce_sum((self.ac_logits - self.target_action)**2, axis=1)
            self.loss = tf.reduce_mean(loss, axis=0)

            policy_optim = tf.train.AdamOptimizer(1e-4)
            policy_params = tf.trainable_variables('ppo2_model/pi')
            self.update_op = policy_optim.minimize(self.loss, var_list=policy_params)

            # Value Fn Optimization
            self.R = R = tf.placeholder(tf.float32, [None])
            self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
            self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

            vpred = self.model.train_model.vf
            vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE)
            # Unclipped value
            vf_losses1 = tf.square(vpred - R)
            # Clipped value
            vf_losses2 = tf.square(vpredclipped - R)
            self.vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

            value_optim = tf.train.AdamOptimizer(1e-4)
            value_params = tf.trainable_variables('ppo2_model/vf')
            self.value_update_op = value_optim.minimize(self.vf_loss, var_list=value_params)

            ################ Miscellaneous
            self.init_op = tf.group(tf.global_variables_initializer(),
                                    tf.local_variables_initializer())
            self.sess.run(self.init_op)
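# Hedged usage sketch (hypothetical helper, not in the original class): one
# gradient step on the cloning loss defined in __init__ above. batch_obs and
# batch_actions are assumed to match the shapes of self.inp and
# self.target_action respectively.
def train_policy_step(self, batch_obs, batch_actions):
    loss, _ = self.sess.run([self.loss, self.update_op],
                            feed_dict={self.inp: batch_obs,
                                       self.target_action: batch_actions})
    return loss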
def main():
    num_envs = 64
    learning_rate = 5e-4
    ent_coef = .01
    gamma = .999
    lam = .95
    nsteps = 256
    nminibatches = 8
    ppo_epochs = 3
    clip_range = .2
    total_timesteps = 1_000_000  # now counts steps in testing runs
    use_vf_clipping = True

    # From random_ppo.py
    max_grad_norm = 0.5
    vf_coef = 0.5
    L2_WEIGHT = 10e-4
    FM_COEFF = 0.002
    REAL_THRES = 0.1

    parser = argparse.ArgumentParser(description='Process procgen testing arguments.')
    parser.add_argument('--env_name', type=str, default='fruitbot')
    parser.add_argument('--distribution_mode', type=str, default='easy',
                        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1000)
    # default start_level is set past the training levels so we test on unseen levels!
    parser.add_argument('--start_level', type=int, default=1000)
    parser.add_argument('--run_id', '-id', type=int, default=0)
    parser.add_argument('--load_id', type=int, default=0)
    parser.add_argument('--nrollouts', '-nroll', type=int, default=0)
    args = parser.parse_args()

    args.total_timesteps = total_timesteps
    if args.nrollouts:
        total_timesteps = int(args.nrollouts * num_envs * nsteps)
    run_ID = 'run_' + str(args.run_id).zfill(2)
    run_ID += '_load{}'.format(args.load_id)

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    mpi_rank_weight = 0
    num_levels = args.num_levels

    log_comm = comm.Split(0, 0)
    format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else []

    logpath = join(LOG_DIR, run_ID)
    if not os.path.exists(logpath):
        os.system("mkdir -p %s" % logpath)

    fpath = join(logpath, 'args_{}.json'.format(run_ID))
    with open(fpath, 'w') as fh:
        json.dump(vars(args), fh, indent=4, sort_keys=True)
    print("\nSaved args at:\n\t{}\n".format(fpath))

    logger.configure(dir=logpath, format_strs=format_strs)

    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=args.env_name,
                      num_levels=num_levels,
                      start_level=args.start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)
    venv = VecNormalize(venv=venv, ob=False)

    logger.info("creating tf session")
    setup_mpi_gpus()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.compat.v1.Session(config=config)
    sess.__enter__()

    logger.info("Testing")
    # Modified based on random_ppo.learn
    env = venv
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nrollouts = total_timesteps // nbatch

    network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256)
    policy = build_policy(env, network)
    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id)
    model.load(LOAD_PATH)
    logger.info("Model params loaded from save")

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    epinfobuf10 = deque(maxlen=10)
    epinfobuf100 = deque(maxlen=100)
    # tfirststart = time.time()  # Not doing timing yet
    # active_ep_buf = epinfobuf100

    mean_rewards = []
    datapoints = []
    for rollout in range(1, nrollouts + 1):
        logger.info('collecting rollouts {}...'.format(rollout))
        # different from random_ppo!
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()

    env.close()

    print("Rewards history: ", mean_rewards)
    return mean_rewards
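# safemean is used above but not defined in this excerpt. A sketch of the
# helper it is assumed to correspond to (as commonly defined in baselines-style
# scripts): return nan for empty buffers instead of triggering a numpy warning.
def safemean(xs):
    return np.nan if len(xs) == 0 else np.mean(xs)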
from baselines.common.cmd_util import make_vec_env

frame_stack_size = 4
env = make_vec_env('AssaultNoFrameskip-v0', 'atari', 1, 0)
env = VecFrameStack(env, frame_stack_size)

ob_space = env.observation_space
ac_space = env.action_space

network_type = 'cnn'
policy_network_fn = get_network_builder(network_type)()
network = policy_network_fn(ob_space.shape)

model = Model(ac_space=ac_space,
              policy_network=network,
              ent_coef=0.0,
              vf_coef=0.5,
              max_grad_norm=0.5)

ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, '../models/PPO22', max_to_keep=None)
ckpt.restore(manager.latest_checkpoint)

obs = env.reset()
state = model.initial_state
episode_reward = 0
while True:
    if state is not None:
        actions, _, state, _ = model.step(obs)
                   stacked=True,
                   include_rendering=args.render)
env.observation_space = Box(low=0, high=255, shape=(72, 96, 16))

policy = build_policy(env=env, policy_network='cnn')

# Get state_space and action_space
ob_space = env.observation_space
ac_space = env.action_space

# Instantiate the model object (that creates act_model and train_model)
model = Model(policy=policy,
              ob_space=env.observation_space,
              ac_space=Discrete(19),
              nbatch_act=1,
              nbatch_train=16,
              nsteps=128,
              ent_coef=0.01,
              vf_coef=0.05,
              max_grad_norm=0.5,
              comm=None,
              mpi_rank_weight=1)
model.load(args.checkpoint)
# model = PPO2.load(args.checkpoint)
# player = get_player(args=args)

for _ in range(args.how_many):
    obs = env.reset()
    cnt = 1
    dones = [False]
    states = model.initial_state
    done = False
def eval(args):
    logdir = str(Path(args.logbase_path) / args.env_id)
    env = gym.make(args.env_id)

    valid_agents = []
    models = sorted(Path(args.learners_path).glob('?????'))
    for path in models:
        if path.name > args.max_chkpt:
            continue
        agent = PPO2Agent(env, args.env_type, str(path), stochastic=args.stochastic)
        valid_agents.append(agent)

    test_agents = []
    for i, path in enumerate(models):
        if i % 10 == 0:
            agent = PPO2Agent(env, args.env_type, str(path), stochastic=args.stochastic)
            test_agents.append(agent)

    gt_dataset = GTDataset(env)
    gt_dataset.prebuilt(valid_agents, -1)

    gt_dataset_test = GTDataset(env)
    gt_dataset_test.prebuilt(test_agents, -1)

    models = []
    for i in range(args.num_models):
        with tf.variable_scope('model_%d' % i):
            models.append(Model(args.include_action,
                                env.observation_space.shape[0],
                                env.action_space.shape[0],
                                steps=args.steps))

    ### Initialize Parameters
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Training configuration
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    sess.run(init_op)

    for i, model in enumerate(models):
        model.saver.restore(sess, logdir + '/model_%d.ckpt' % (i))

        print('model %d' % i)
        obs, acs, r = gt_dataset.trajs
        r_hat = model.get_reward(obs, acs)

        obs, acs, r_test = gt_dataset_test.trajs
        r_hat_test = model.get_reward(obs, acs)

        fig, axes = plt.subplots(1, 2)
        axes[0].plot(r, r_hat, 'o')
        axes[1].plot(r_test, r_hat_test, 'o')
        fig.savefig('model_%d.png' % i)
        imgcat(fig)
        plt.close(fig)

        np.savez('model_%d.npz' % i,
                 r=r, r_hat=r_hat, r_test=r_test, r_hat_test=r_hat_test)
def train(args):
    logdir = Path(args.log_dir)

    if logdir.exists():
        c = input('log dir already exists. remove it and train a new preference model? [Y/etc]? ')
        if c in ['YES', 'yes', 'Y']:
            import shutil
            shutil.rmtree(str(logdir))
        else:
            print('good bye')
            return

    logdir.mkdir(parents=True)
    with open(str(logdir / 'args.txt'), 'w') as f:
        f.write(str(args))
    logdir = str(logdir)

    env = gym.make(args.env_id)

    train_agents = [RandomAgent(env.action_space)] if args.random_agent else []

    models = sorted([p for p in Path(args.learners_path).glob('?????')
                     if int(p.name) <= args.max_chkpt])
    for path in models:
        agent = PPO2Agent(env, args.env_type, str(path), stochastic=args.stochastic)
        train_agents.append(agent)

    if args.preference_type == 'gt':
        dataset = GTDataset(env)
    elif args.preference_type == 'gt_traj':
        dataset = GTTrajLevelDataset(env)
    elif args.preference_type == 'gt_traj_no_steps':
        dataset = GTTrajLevelNoStepsDataset(env, args.max_steps)
    elif args.preference_type == 'gt_traj_no_steps_noise':
        dataset = GTTrajLevelNoSteps_Noise_Dataset(env, args.max_steps, args.traj_noise)
    elif args.preference_type == 'gt_traj_no_steps_n_mix':
        dataset = GTTrajLevelNoSteps_N_Mix_Dataset(env, args.N, args.max_steps)
    elif args.preference_type == 'time':
        dataset = LearnerDataset(env, args.min_margin)
    else:
        assert False, 'specify preference type'

    dataset.prebuilt(train_agents, args.min_length)

    models = []
    for i in range(args.num_models):
        with tf.variable_scope('model_%d' % i):
            models.append(Model(args.include_action,
                                env.observation_space.shape[0],
                                env.action_space.shape[0],
                                steps=args.steps,
                                num_layers=args.num_layers,
                                embedding_dims=args.embedding_dims))

    ### Initialize Parameters
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Training configuration
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.InteractiveSession(config=config)
    sess.run(init_op)

    for i, model in enumerate(models):
        D = dataset.sample(args.D, args.steps, include_action=args.include_action)

        if D is None:
            model.train_with_dataset(dataset, 64,
                                     include_action=args.include_action,
                                     debug=True)
        else:
            model.train(D,
                        l2_reg=args.l2_reg,
                        noise_level=args.noise,
                        debug=True)

        model.saver.save(sess, logdir + '/model_%d.ckpt' % (i),
                         write_meta_graph=False)
def learn(env, nenvs, network, password,
          total_timesteps=1e6, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10,
          nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
          save_path=None, load_path=None, **network_kwargs):

    set_global_seeds(seed)
    save_dir = save_path

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    model = Model(policy=policy,
                  ob_space=ob_space,
                  ac_space=ac_space,
                  nbatch_act=nenvs,
                  nbatch_train=nbatch_train,
                  nsteps=nsteps,
                  ent_coef=ent_coef,
                  vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = ProcessRunner(env=env, model=model, n_env=nenvs, n_steps=nsteps,
                           gamma=gamma, lam=lam, password=password, verbose=0,
                           **network_kwargs)

    epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    if save_interval is None:
        save_interval = nupdates // 5

    for update in range(1, nupdates + 1):
        logger.log("# " + "=" * 78)
        logger.log("# Iteration %i / %i" % (update, nupdates))
        logger.log("# " + "=" * 78)

        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Get minibatch
        policy_param = get_session().run(tf.trainable_variables('ppo2_model/pi'))
        valfn_param = get_session().run(tf.trainable_variables('ppo2_model/vf'))
        obs, rewards, returns, masks, actions, values, neglogpacs, action_mean, states, epinfos, dataset_total_rew = runner.run(policy_param, valfn_param)  # pylint: disable=E0632

        ## !! TEST !!
        # with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        #     da_, v_, nglp_, mean_, std_, logstd_ = policy().step_debug(obs, actions)
        # if not ((np.isclose(da_, action_mean, atol=5e-7)).all()):
        #     print(da_ - action_mean)
        #     print("action no match")
        # if not ((np.isclose(v_, values, atol=5e-7)).all()):
        #     print(v_ - values)
        #     print("value no match")
        # if not ((np.isclose(nglp_, neglogpacs, atol=5e-7)).all()):
        #     print(nglp_ - neglogpacs)
        #     print("neglogp no match")
        # __import__('ipdb').set_trace()
        # print("Debugging!")
        ## !! TEST !!

        epinfobuf.extend(epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('dataset_rew', dataset_total_rew / nenvs)
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('serial_num_dones', int(masks.sum() / nenvs))
            logger.logkv('total_num_dones', masks.sum())
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if (update == nupdates and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)) \
                or \
                (save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir()
                 and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            save_path = osp.join(checkdir, '%.5i' % update)
            print('Saving TF model to', save_path)
            model.save(save_path)
            save_dataset(save_path, nsteps, obs, rewards, returns, masks, actions, values)
            save_model_to_yaml(save_path, **network_kwargs)

        ## !! TEST !! ##
        # with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        #     __import__('ipdb').set_trace()
        #     while(True):
        #         da_, v_, nglp_, mean_, std_, logstd_ = policy().step_debug(obs, actions)
        ## !! TEST !! ##

    return model
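# constfn is used in learn() above but not defined in this excerpt. A sketch of
# the helper it is assumed to correspond to (as in baselines' ppo2): wrap a
# constant value in a callable so lr and cliprange can always be called with
# the remaining-progress fraction.
def constfn(val):
    def f(_):
        return val
    return f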
load_path = 'ppo2_lstm_slow.h5'
# load_path = 'ppo2_base_delayed2.h5'
# load_path = 'models15/ppo_model_1.h5'
# model_i = 3
model_i = ''
# load_path = 'models/%s.h5' % model_i

max_ticks = int(60 * 3 * (1 / 0.016))
env = HaxballProcPoolVecEnv(num_fields=nenvs, max_ticks=max_ticks)

policy = build_policy(env=env, policy_network='lstm', nlstm=nlstm)
# policy = build_policy(env=env, policy_network='lstm', nlstm=512)  # num_layers=4, num_hidden=256)

nbatch = nenvs * nsteps
nbatch_train = nbatch // nminibatches
model = PPOModel(policy=policy,
                 ob_space=env.observation_space,
                 ac_space=env.action_space,
                 nbatch_act=nenvs,
                 nbatch_train=nbatch_train,
                 nsteps=nsteps,
                 ent_coef=0.05,  # 0.005
                 vf_coef=0.5,  # vf_coef=0.0
                 max_grad_norm=0.5)

if load_path is not None and os.path.exists(load_path):
    model.load(load_path)

# model = StaticModel()
# model = RandomModel(action_space=env.action_space)
# model = PazzoModel(action_space=env.action_space)
# model = StaticModel(default_action=7, action_space=env.action_space)
# model = StaticModel(action_space=env.action_space)
# nbatch = 100 * 12
# nbatch_train = nbatch // 4
# model = PPOModel(policy=policy, nsteps=12, ent_coef=0.05, ob_space=env.observation_space,
#                  ac_space=env.action_space, nbatch_act=100, nbatch_train=nbatch_train,
#                  vf_coef=0.5, max_grad_norm=0.5)

size = width, height = 900, 520
center = (width // 2, height // 2 + 30)
black = 105, 150, 90