def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5

def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    def seeded_env_fn():
        env = env_fn()
        env.seed(0)
        return env

    np.random.seed(0)
    env = DummyVecEnv([seeded_env_fn])
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
        tf.set_random_seed(0)
        model = learn_fn(env)
        sum_rew = 0
        done = True
        for i in range(n_trials):
            if done:
                obs = env.reset()
                state = model.initial_state
            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)
            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)
        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)

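# Hedged usage sketch (not part of the original file): one way simple_test might be
# driven with a concrete learner. IdentityEnv, the import path, and the hyperparameters
# below are assumptions for illustration only, not the authors' exact setup.
def _example_simple_test_usage():
    from baselines.ppo2 import ppo2
    from baselines.common.tests.envs.identity_env import IdentityEnv

    simple_test(
        env_fn=lambda: IdentityEnv(10),
        learn_fn=lambda env: ppo2.learn(network='mlp', env=env, seed=0,
                                        total_timesteps=2048, nsteps=128,
                                        nminibatches=2),
        min_reward_fraction=0.8,
    )
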
def test_serialization(learn_fn, network_fn):
    '''
    Test if the trained model can be serialized
    '''

    if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']:
        # TODO make acktr work with recurrent policies
        # and test
        # github issue: https://github.com/openai/baselines/issues/660
        return

    def make_env():
        env = MnistEnv(episode_len=100)
        env.seed(10)
        return env

    env = DummyVecEnv([make_env])
    ob = env.reset().copy()
    learn = get_learn_function(learn_fn)

    kwargs = {}
    kwargs.update(network_kwargs[network_fn])
    kwargs.update(learn_kwargs[learn_fn])

    learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)

    with tempfile.TemporaryDirectory() as td:
        model_path = os.path.join(td, 'serialization_test_model')

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=100)
            model.save(model_path)
            mean1, std1 = _get_action_stats(model, ob)
            variables_dict1 = _serialize_variables()

        with tf.Graph().as_default(), make_session().as_default():
            model = learn(total_timesteps=0, load_path=model_path)
            mean2, std2 = _get_action_stats(model, ob)
            variables_dict2 = _serialize_variables()

        for k, v in variables_dict1.items():
            np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
                                       err_msg='saved and loaded variable {} value mismatch'.format(k))

        np.testing.assert_allclose(mean1, mean2, atol=0.5)
        np.testing.assert_allclose(std1, std2, atol=0.5)

def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.compat.v1.Session(config=_sess_config).as_default():
        model = learn_fn(env)

        # roll out n_trials full episodes and average the per-episode returns
        observations, actions, rewards = rollout(env, model, n_trials)
        rewards = [sum(r) for r in rewards]

        avg_rew = sum(rewards) / n_trials
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)

def test_microbatches():
    def env_fn():
        env = gym.make('CartPole-v0')
        env.seed(0)
        return env

    learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)

    env_ref = DummyVecEnv([env_fn])
    sess_ref = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_ref)
    vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}

    env_test = DummyVecEnv([env_fn])
    sess_test = make_session(make_default=True, graph=tf.Graph())
    learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
    vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}

    for v in vars_ref:
        np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=1e-3)

def evaluation(policy, num_times):
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    def make_env():
        df = pd.read_csv('dataset/btc_indexed2_test.csv')
        env = trading_env.make(
            env_id='training_v1', obs_data_len=1, step_len=1,
            df=df, fee=0.003, max_position=5, deal_col_name='close',
            return_transaction=True, sample_days=30, normalize_reward=False,
            feature_names=['open', 'high', 'low', 'close', 'volume'])
        env = wrapper.LogPriceFilterWrapper(env)
        return env

    env = DummyVecEnv([make_env] * 8)
    rewards = []
    for i in range(num_times):
        episode_reward = np.zeros(8)
        ob = env.reset()
        lstm_state = policy.initial_state
        not_done = [1] * 8
        while True:
            action, _, lstm_state, _ = policy.step(ob, lstm_state, not_done)
            ob, reward, done, _ = env.step(action)
            #print(reward)
            #print(reward.shape)
            episode_reward += reward
            print(done)
            if done.all():
                break
            not_done = np.invert(done).astype(int)
        print("evaluation ", i, ": ", episode_reward)
        rewards.append(list(episode_reward))
    print("evaluation: mu:", np.mean(rewards), "std:", np.std(rewards))

def main():
    config = tf.ConfigProto()

    # Avoid warning message errors
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # Allowing GPU memory growth
    config.gpu_options.allow_growth = True

    with tf.Session(config=config):
        model.play(policy=policies.A2CPolicy, env=DummyVecEnv([env.make_train_0]))

def create_environment(self):
    envs = [
        make_env(i, args, True, self.gan_file)
        for i in range(self.num_processes)
    ]
    envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs, gamma=args.gamma)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    return envs, obs_shape

def train(env_id, num_timesteps, seed, pol, cur, vis, model):
    from baselines.common import set_global_seeds
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import HierPolicy, HierPolicy2, MlpPolicy, RandomWalkPolicy
    import gym
    import gym_program
    import tensorflow as tf
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    hier = pol in ('hier1', 'hier2')

    def make_env():
        set_global_seeds(seed)
        env = gym.make(env_id)
        env.set_curiosity(cur, model)
        env.set_hier(hier)
        env.set_visualize(vis)
        env = bench.Monitor(env, logger.get_dir())
        env.seed(seed)
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    if pol == 'hier1':
        policy = HierPolicy
    elif pol == 'hier2':
        policy = HierPolicy2
    elif pol == 'mlp':
        policy = MlpPolicy
    elif pol == 'random_walk':
        # a random-walk policy needs no training
        pol = RandomWalkPolicy
        pol(env)
        return

    ppo2.learn(policy=policy, env=env, pol=pol, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0,
               lr=1e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)

def play():
    env_args = dict()
    network_kwargs = dict(nlstm=512)

    # create vectorized environment
    pysc2_env_vec = DummyVecEnv([partial(make_sc2env, id=i, **env_args) for i in range(1)])

    policy = policies.build_policy(pysc2_env_vec, "cnn_lstm", **network_kwargs)
    nenvs = pysc2_env_vec.num_envs

    # Calculate the batch_size
    nsteps = 256
    nminibatches = 1
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    ent_coef = 0.0
    vf_coef = 0.5
    max_grad_norm = 0.5

    make_model = lambda: ppo_model(policy=policy, ob_space=(64, 64, 3), ac_space=65,
                                   nbatch_act=nenvs, nbatch_train=nbatch_train,
                                   nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                                   max_grad_norm=max_grad_norm)
    model = make_model()
    model.load("4860_ppo_cnn_lstm_512_medium")

    images = []
    ob = pysc2_env_vec.reset()
    state = model.initial_state
    done = [False]
    step_counter = 0

    # run episodes indefinitely, saving a gif of every other frame each time one finishes
    while True:
        #print(step_counter)
        images.append(ob)
        action, _, state, _ = model.step(ob, S=state, M=done)
        ob, _, done, stats = pysc2_env_vec.step(action)
        step_counter += 1
        if done[0]:
            imageio.mimsave(str(stats[0]["final_reward"]) + "_" + str(difficulty) + '.gif',
                            [np.array(img[0]) for i, img in enumerate(images) if i % 2 == 0],
                            fps=4)
            images = []

def run_baselines(env, seed, log_dir): """Create baselines model and training. Args: env (dict): Environment of the task. seed (int): Random positive integer for the trial. log_dir (str): Log dir path. Returns: str: Path to output csv file """ ncpu = max(multiprocessing.cpu_count() // 2, 1) config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=ncpu, inter_op_parallelism_threads=ncpu) tf.compat.v1.Session(config=config).__enter__() # Set up logger for baselines configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard']) baselines_logger.info('rank {}: seed={}, logdir={}'.format( 0, seed, baselines_logger.get_dir())) env = DummyVecEnv([ lambda: bench.Monitor( env, baselines_logger.get_dir(), allow_early_resets=True) ]) env = VecNormalize(env) set_global_seeds(seed) policy = MlpPolicy nbatch = env.num_envs * hyper_parameters['batch_size'] training_batch_number = nbatch // hyper_parameters['training_batch_size'] # import pdb; pdb.set_trace() # use AdamOptimizer as optimizer and choose value function same with policy ppo2.learn(policy=policy, env=env, nsteps=hyper_parameters['batch_size'], lam=hyper_parameters['gae_lambda'], gamma=hyper_parameters['discount'], ent_coef=hyper_parameters['policy_ent_coeff'], nminibatches=training_batch_number, noptepochs=hyper_parameters['training_epochs'], max_grad_norm=None, lr=hyper_parameters['learning_rate'], cliprange=hyper_parameters['lr_clip_range'], total_timesteps=hyper_parameters['batch_size'] * hyper_parameters['n_epochs']) # yapf: disable # noqa: E501 return osp.join(log_dir, 'progress.csv')
def ppo():
    def make_env():
        env = SawyerEnvWrapper(DownEnv(for_her=False))
        return env

    tf.Session().__enter__()
    env = VecNormalize(DummyVecEnv([make_env]))

    policy = MlpPolicy
    model = ppo2.learn(policy=policy,
                       env=env,
                       nsteps=4000,
                       nminibatches=1,
                       lam=0.95,
                       gamma=0.99,
                       noptepochs=10,
                       log_interval=1,
                       ent_coef=0.0,
                       lr=3e-4,
                       cliprange=0.2,
                       total_timesteps=1e8)
    return model

def make_env(env, seed, device):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    def _thunk():
        env_ = gym.make(env)
        env_.seed(seed)
        return env_

    envs = DummyVecEnv([_thunk])
    envs = VecNormalize(envs, ret=False)
    envs = VecPyTorch(envs, device)
    return envs

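# Hedged usage sketch (not from the original source): building a single normalized,
# torch-ready environment with the helper above. The environment id and device choice
# are assumptions for illustration only.
def _example_make_env_usage():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    envs = make_env('Pendulum-v0', seed=1, device=device)
    obs = envs.reset()  # VecPyTorch returns observations as torch tensors on `device`
    return obs
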
def PrepareParallelEnv(env_id, seed, num_processes, gamma, log_dir, device, allow_early_resets):
    envs = [
        PrepareMujocoEnv(env_id, seed, i, log_dir, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    envs = VecNormalize(envs, gamma=gamma)
    envs = VecPyTorch(envs, device)

    return envs

def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 env_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None,
                 initializer=None,
                 force_dummy=False,
                 obs_type='original',
                 fixed_num_of_contact=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank, initializer=None):
        return lambda: make_env(env_id=env_id,
                                env_type=env_type,
                                mpi_rank=mpi_rank,
                                subrank=rank,
                                seed=seed,
                                reward_scale=reward_scale,
                                gamestate=gamestate,
                                flatten_dict_observations=flatten_dict_observations,
                                wrapper_kwargs=wrapper_kwargs,
                                env_kwargs=env_kwargs,
                                logger_dir=logger_dir,
                                initializer=initializer,
                                obs_type=obs_type,
                                fixed_num_of_contact=fixed_num_of_contact)

    set_global_seeds(seed)
    if not force_dummy and num_env > 1:
        return SubprocVecEnv([
            make_thunk(i + start_index, initializer=initializer)
            for i in range(num_env)
        ])
    else:
        return DummyVecEnv([
            make_thunk(i + start_index, initializer=None)
            for i in range(num_env)
        ])

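# Hedged usage sketch (not from the original source): a typical call to make_vec_env.
# force_dummy=True keeps every environment in-process via DummyVecEnv, which is handy
# for debugging; the environment id and type are example values only.
def _example_make_vec_env_usage():
    return make_vec_env('HalfCheetah-v2', 'mujoco', num_env=2, seed=0, force_dummy=True)
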
def make_vec_envs(env_name,
                  scene_path,
                  seed,
                  num_processes,
                  gamma,
                  log_dir,
                  device,
                  allow_early_resets,
                  initial_policies,
                  num_frame_stack=None,
                  show=False,
                  no_norm=False,
                  pose_estimator=None,
                  image_ips=None,
                  init_control=True):
    envs = [
        make_env(env_name, scene_path, seed, i, log_dir, allow_early_resets,
                 show, init_control)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    envs = wrap_initial_policies(envs, device, initial_policies)

    if pose_estimator is not None:
        envs = SimImageObsVecEnvWrapper(envs)

    if len(envs.observation_space.shape) == 1 and not no_norm:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if pose_estimator is not None:
        envs = wrap_initial_policies(envs, device, image_ips)
        envs = PoseEstimatorVecEnvWrapper(envs, device, *pose_estimator,
                                          abs_to_rel=True)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif not pose_estimator and len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs

def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, device,
                  allow_early_resets, num_frame_stack=None, rank=0, signature='',
                  max_steps=None, env_group_spec=None):
    """
    Make vectorised environments for parallelized experience sampling.
    """
    # Should environments all be the same for each learner, or differ across processes
    # for the same learner?
    heterogeneous_envs = not (env_group_spec is not None
                              and env_group_spec[1] == num_processes)

    if env_group_spec is None or env_group_spec[0] == 1:
        # No grouping of environment processes for each agent.
        envs = [
            make_env(env_name, seed + num_processes * rank,
                     (rank * num_processes) + i, log_dir, allow_early_resets,
                     signature, max_steps, heterogeneous=heterogeneous_envs)
            for i in range(num_processes)
        ]
    else:
        # We have environments grouped such that environments differ even for the same learner.
        envs = []
        counter = 0
        for _ in range(env_group_spec[0]):
            envs += [
                make_env(env_name, seed + num_processes * rank,
                         (rank * num_processes) + counter + i, log_dir,
                         allow_early_resets, signature, max_steps,
                         heterogeneous=False)
                for i in range(env_group_spec[1])
            ]
            seed += env_group_spec[1]
            counter += env_group_spec[1]

    # Allow dummy environment wrapper if no parallelisation required.
    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    # Ensure environments are compatible with the PyTorch agents.
    envs = VecPyTorch(envs, device)

    # Frame stacking for visual environments.
    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs

def train(env_id, num_timesteps, seed):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2
    from baselines.ppo2.policies import MlpPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        if env_id == 'toy':
            #env = continuous_gridworld.ContinuousGridworld('', max_steps=1000,
            #                                               obstacle_mode=continuous_gridworld.NO_OBJECTS)
            from toy_environment import room_obstacle_list
            env = gridworld.Gridworld(
                obstacle_list_generator=room_obstacle_list.obstacle_list)
        elif env_id == 'navigate':
            env = NavigateEnv(use_camera=False, continuous_actions=True,
                              neg_reward=True, max_steps=500)
        elif env_id == 'arm2pos':
            #env = Arm2PosEnv(continuous=False, max_steps=500)
            pass
        else:
            env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
               lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
               ent_coef=0.0,
               lr=3e-4,
               cliprange=0.2,
               total_timesteps=num_timesteps)

def train(model_name, num_processes, max_grad_norm, num_env_steps, log_dir,
          epoch, env_name, save_dir, use_linear_clip_decay):
    records = []
    envs = [make_env(rank=i) for i in range(num_processes)]
    replaybuffer = Buffer()

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    try:
        state_shape = envs.observation_space.shape[0]
        action_shape = envs.action_space.shape[0]
        model = model_dict[model_name](state_shape, action_shape)
        compute_loss = loss_dict[model_name]
        optimizer = torch.optim.Adam(model.parameters())

        state = envs.reset()
        returns = 0
        for t in range(num_env_steps // num_processes):
            action, log_prob = model.act(state)
            next_state, reward, done, info = envs.step(to_np(action))
            returns += reward
            replaybuffer.store(zip(state, to_np(action), to_np(log_prob),
                                   reward, next_state, 1 - done))
            for i, d in enumerate(done):
                if d:
                    records.append((t * num_processes + i, returns[i]))
                    if i == 0:
                        print(returns[0])
                    returns[i] = 0
            state = next_state

            # update roughly every 500 environment steps
            if t % (500 // num_processes) == (500 // num_processes - 1):
                for _ in range(epoch):
                    optimizer.zero_grad()
                    loss = compute_loss(replaybuffer.sample(), model)
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                    optimizer.step()
                if model_name == 'PPO' or model_name == 'DPPO':
                    replaybuffer.clear()

            if t % (num_env_steps // num_processes // 10) == 0:
                i = t // (num_env_steps // num_processes // 10)
                torch.save(model.state_dict(),
                           os.path.join(save_dir, model_name, env_name,
                                        model_name + str(i) + '.pt'))
            if use_linear_clip_decay:
                update_linear_schedule(optimizer, t * num_processes)

        torch.save(model.state_dict(),
                   os.path.join(save_dir, model_name, env_name,
                                model_name + '_Final.pt'))
        timesteps, sumofrewards = zip(*records)
        savemat(os.path.join(save_dir, model_name, env_name, 'returns.mat'),
                {'timesteps': timesteps, 'returns': sumofrewards})
    except Exception as e:
        traceback.print_exc()
    finally:
        envs.close()

def testing(model):
    """
    We'll use this function to calculate the score on test levels for each saved model,
    to generate the video version, and to generate the map version.
    """
    test_env = DummyVecEnv([sonic_env.make_test])

    # Get state_space and action_space
    ob_space = test_env.observation_space
    ac_space = test_env.action_space

    # Play
    total_score = 0
    trial = 0

    # We make 3 trials
    for trial in range(3):
        obs = test_env.reset()
        done = False
        score = 0

        while not done:
            # Get the action
            action, value, _ = model.step(obs)

            # Take action in env and look at the results
            obs, reward, done, info = test_env.step(action)
            score += reward[0]

        total_score += score
        trial += 1

    test_env.close()

    # Divide the score by the number of trials
    total_test_score = total_score / 3
    return total_test_score

def make_vec_envs(env_name, rep_type, resolution, seed, scenario, num_processes,
                  gamma, log_dir, device, allow_early_resets, num_frame_stack=None,
                  patch_count=3, reverse_green=False):
    envs = [
        make_env(env_name, rep_type, resolution, seed, scenario, i, log_dir,
                 allow_early_resets, patch_count=patch_count,
                 reverse_green=reverse_green)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        #envs = ShmemVecEnv(envs, context='fork')
        envs = DummyVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == -1:  # problematic, original 3
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs

def train(env_id, num_timesteps, seed, lr, lr_q, cliprangeq):
    from baselines.common import set_global_seeds
    from baselines.common.vec_env.vec_normalize import VecNormalize
    from baselines.ppo2 import ppo2_implicit
    from baselines.ppo2.policies import ImplicitMLPPolicy
    import gym
    import tensorflow as tf
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    ncpu = 1
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    def make_env():
        env = gym.make(env_id)
        env = bench.Monitor(env, logger.get_dir())
        return env

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = ImplicitMLPPolicy

    # build call back
    arg = {}
    arg['seed'] = seed
    arg['env'] = env_id
    arg['lr'] = lr
    arg['lrq'] = lr_q
    arg['cliprangeq'] = cliprangeq
    callback = CALLBACK(arg)

    cliprangeq = linear_schedule(maxrate=cliprangeq, minrate=0.001)

    ppo2_implicit.learn(policy=policy, env=env, nsteps=2048, nminibatches=32,
                        lam=0.95, gamma=0.99, noptepochs=10, log_interval=1,
                        ent_coef=0.0,
                        lr=lr,
                        lr_q=lr_q,
                        cliprangeq=cliprangeq,
                        total_timesteps=num_timesteps,
                        callback=callback)

def test_identity(learn_func):
    '''
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    '''
    np.random.seed(0)
    np_random.seed(0)
    random.seed(0)

    env = DummyVecEnv([lambda: IdentityEnv(10)])

    with tf.Graph().as_default(), tf.Session().as_default():
        tf.set_random_seed(0)
        model = learn_func(env)

        N_TRIALS = 1000
        sum_rew = 0
        obs = env.reset()
        for i in range(N_TRIALS):
            obs, rew, done, _ = env.step(model.step(obs)[0])
            sum_rew += rew

        assert sum_rew > 0.9 * N_TRIALS

def _make_env(env_fn, nenv):
    def _env(rank):
        def _thunk():
            return env_fn(rank=rank)
        return _thunk

    if nenv > 1:
        env = SubprocVecEnv([_env(i) for i in range(nenv)])
    else:
        env = DummyVecEnv([_env(0)])

    env = SuccessWrapper(env)
    tstart = 0
    return VecMonitor(env, max_history=100, tstart=tstart, tbX=True)

def make_vec_envs(args, device="cpu"):
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir, args.default_ind,
                 args.num_envs, args.basepath)
        for i in range(args.num_processes)
    ]

    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)
    return envs

def run_baselines(env, seed, log_dir):
    '''
    Create baselines model and training.

    Replace the ppo and its training with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return: Path to the output progress.csv file.
    '''
    ncpu = max(multiprocessing.cpu_count() // 2, 1)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    tf.Session(config=config).__enter__()

    # Set up logger for baselines
    configure(dir=log_dir, format_strs=['stdout', 'log', 'csv', 'tensorboard'])
    baselines_logger.info('rank {}: seed={}, logdir={}'.format(
        0, seed, baselines_logger.get_dir()))

    def make_env():
        monitor = bench.Monitor(
            env, baselines_logger.get_dir(), allow_early_resets=True)
        return monitor

    env = DummyVecEnv([make_env])
    env = VecNormalize(env)

    set_global_seeds(seed)
    policy = MlpPolicy
    ppo2.learn(policy=policy,
               env=env,
               nsteps=2048,
               nminibatches=32,
               lam=0.95,
               gamma=0.99,
               noptepochs=10,
               log_interval=1,
               ent_coef=0.0,
               lr=1e-3,
               vf_coef=0.5,
               max_grad_norm=None,
               cliprange=0.2,
               total_timesteps=int(1e6))

    return osp.join(log_dir, 'progress.csv')

def train(args):
    # exploration score type
    if 'MiniGrid' in args.env:
        args.score_type = 'discrete'
        args.train_rl = True
        policy_fn = MlpPolicy
    elif args.env == 'MiniWorld-MazeS5-v0':
        args.score_type = 'continious'
        args.train_rl = True
        policy_fn = CnnPolicy
    else:
        # MuJoCo
        args.score_type = 'continious'
        if args.disable_rapid:
            args.train_rl = True
        else:
            args.train_rl = False
        policy_fn = MlpPolicy

    # Make the environment
    def _make_env():
        env = make_env(args.env)
        env.seed(args.seed)
        return env

    env = DummyVecEnv([_make_env])
    if not 'MiniGrid' in args.env and not args.env == 'MiniWorld-MazeS5-v0':
        # MuJoCo
        env = VecNormalize(env)

    # Initialize the buffer
    ranking_buffer = RankingBuffer(ob_space=env.observation_space,
                                   ac_space=env.action_space,
                                   args=args)

    # Start training
    learn(policy_fn, env, ranking_buffer, args)
    env.close()

def make_vec_envs_domains(env_name, seed, num_processes, gamma, log_dir, device,
                          allow_early_resets, num_envs1, num_envs2,
                          num_frame_stack=None, env_kwargs1=None, env_kwargs2=None):
    # Environments from domain 1
    num_envs_domain1 = num_envs1  # int(num_processes/2)
    num_envs_domain2 = num_envs2  # int(num_processes/2)
    envs1 = [
        make_env(env_name, seed, i, log_dir, allow_early_resets, env_kwargs1)
        for i in range(num_envs_domain1)
    ]

    # Environments from domain 2
    envs2 = [
        make_env(env_name, seed, i, log_dir, allow_early_resets, env_kwargs2)
        for i in range(num_envs_domain2)
    ]

    # Concatenate envs
    envs = envs1 + envs2

    if len(envs) > 1:
        envs = ShmemVecEnv_DR(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if num_frame_stack is not None:
        envs = VecPyTorchFrameStack(envs, num_frame_stack, device)
    elif len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs

def make_vec_env(env_id, seed): """ Create environment """ env = gym.make(env_id) env.seed(seed) def make_env(env): return lambda: env env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), '0'), allow_early_resets=True) set_global_seeds(seed) return DummyVecEnv([make_env(env)])
def make_vec_envs_custom(constants, device, env_lambda):
    # Construct envs
    envs = [
        env_lambda
        for i in range(constants["num_processes"])
    ]

    # Multiple processes
    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    # Put on gpu whatever can be
    envs = VecPyTorch(envs, device)

    return envs

def make_vec_env(env_id, seed): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ env = gym.make(env_id) env.seed(seed) def make_thunk(env): return lambda: env env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), '0'), allow_early_resets=True) set_global_seeds(seed) return DummyVecEnv([make_thunk(env)])
def make_vec_random_env(num_envs: int, mk_config: Union[MkConfig, Dict]) -> VectorEnv:
    # Move import here in case we don't have `baselines` installed:
    # TODO: Use the "native" vectorized envs from gym rather than those from baselines.
    # The only thing we'd lose is the ability to render the envs, which isn't part of
    # gym at the time of writing. One potential solution would be to use a fork of gym
    # which adds this support for rendering the envs.
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
    from baselines.common.vec_env.shmem_vec_env import ShmemVecEnv

    env_func = partial(make_env, mk_config=mk_config)
    if num_envs == 1:
        return DummyVecEnv([env_func for _ in range(num_envs)])
    return ShmemVecEnv([env_func for _ in range(num_envs)])

def make_vec_envs(args, seed, num_processes, gamma, device):
    envs = [make_env(args, seed, i) for i in range(num_processes)]

    if len(envs) > 1:
        envs = ShmemVecEnv(envs, context='fork')
    else:
        envs = DummyVecEnv(envs)

    envs = VecPyTorch(envs, device)
    return envs

def build_env(args):
    envs = [
        make_env(args.env_name, args.seed, i, args.log_dir)
        for i in range(args.num_processes)
    ]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        envs = VecNormalize(envs)

    return envs