def Train():
    setup_utils.setup_and_load(use_cmd_line_args=False, set_seed=3, num_levels=1,
                               use_black_white=True, frame_stack=4)
    # env = make("platform", num_envs=8)
    env = make("platform", num_envs=128)
    env = CourierWrapper(env, False)
    env = MyReward(env)
    env = StickAct(env, 0.5)
    env = VecMonitor(env)

    learning_rate = 5e-4
    clip_range = 0.2
    n_timesteps = int(1e8)
    hyperparams = {
        'nsteps': 256,
        'noptepochs': 4,
        'nminibatches': 8,
        'lr': learning_rate,
        'cliprange': clip_range,
        'vf_coef': 0.5,
        'ent_coef': 0.01
    }
    act = ppo2.learn(
        network=MyPolicy,
        env=env,
        total_timesteps=n_timesteps,
        **hyperparams,
        save_interval=100,
        log_interval=20,
        # value_network="copy"
    )
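# `CourierWrapper`, `MyReward`, `StickAct`, and `MyPolicy` are project-specific and not
# defined in this snippet. Purely as an illustration (not the original implementation),
# a sticky-action wrapper in the spirit of `StickAct(env, 0.5)` could be sketched on
# baselines' VecEnvWrapper like this, assuming one discrete action per env:
import numpy as np
from baselines.common.vec_env import VecEnvWrapper

class StickyActionsSketch(VecEnvWrapper):
    """With probability p, repeat each env's previous action instead of the new one."""
    def __init__(self, venv, p):
        super().__init__(venv)
        self.p = p
        self.last_actions = None

    def reset(self):
        self.last_actions = None
        return self.venv.reset()

    def step_async(self, actions):
        actions = np.asarray(actions)
        if self.last_actions is not None:
            # For each env, keep the previous action with probability p
            stick = np.random.rand(self.num_envs) < self.p
            actions = np.where(stick, self.last_actions, actions)
        self.last_actions = actions
        self.venv.step_async(actions)

    def step_wait(self):
        return self.venv.step_wait()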
def __call__(self, env_maker, seed=None, monitor_file=None):
    """
    :param env_maker: instance of roam_learning.robot_env.EnvMaker
    :param seed: int that is used to generate seeds for the vectorized envs
    :param monitor_file: path to a .csv file to log episode rewards, lengths, etc. of the vectorized envs
    :return: instance of either DummyVecEnv, SubprocVecEnv or ShmemVecEnv
    """
    # Create a list of env makers, one per env, each with its own seed
    if seed is not None:
        assert isinstance(seed, int)
        # Hash the user-provided seed once; per-env seeds are fixed offsets from it.
        # (Hashing inside the loop would re-hash the already-hashed value on every
        # iteration, shifting the base the offsets are applied to.)
        seed = hash_seed(seed)
    env_makers = []
    for i in range(self.nenvs):
        env_makers.append(deepcopy(env_maker))
        if seed is not None:
            env_makers[i].set_seed(seed + i)
    # Create the vectorized envs
    envs = self.vec_env_wrapper(env_makers)
    # Monitor the envs before normalization so raw episode rewards are logged
    if monitor_file is not None:
        envs = VecMonitor(envs, filename=monitor_file)
    if self.normalize_obs or self.normalize_ret:
        envs = VecNormalize(envs, ob=self.normalize_obs, ret=self.normalize_ret, use_tf=True)
    return envs
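# Hypothetical usage of the factory above (the class name `RobotVecEnvFactory` and its
# constructor arguments are assumptions; only __call__ appears in the original snippet).
# `vec_env_wrapper` is expected to turn a list of env makers into a vectorized env.
factory = RobotVecEnvFactory(nenvs=8,
                             vec_env_wrapper=SubprocVecEnv,
                             normalize_obs=True,
                             normalize_ret=False)
envs = factory(env_maker, seed=0, monitor_file="train_monitor.csv")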
def Eval():
    def EnvFunc(iSeed):
        def InnerFunc():
            oEnv = Env()
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        # ppo2 calls this with the remaining-training fraction ("progress"),
        # which goes from 1.0 down to 0.0 over the course of training
        def func(progress):
            return progress * initial_value
        return func

    learning_rate = linear_schedule(5e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(0)
    hyperparams = {'nsteps': 256, 'noptepochs': 8, 'nminibatches': 4,
                   'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.01}

    num_env = 1
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecNormalize(env, ob=True, ret=False)
    env = VecMonitor(env)

    # total_timesteps=0: build the model and load the checkpoint without training
    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        load_path="baselineLog/ppobaseliens-2019-06-05-17-38-15-168854/checkpoints/00300",
        **hyperparams,
        value_network="copy"
    )

    obs = env.reset()
    print("obs", obs.shape)
    iFrame = 0
    iReward = 0
    reward_list = deque(maxlen=100)
    # Roll out forever, printing episode length, return and the running mean return
    while True:
        action = act.step(obs)[0]
        obs, reward, done, _ = env.step(action)
        iReward += reward[0]
        # time.sleep(0.01)
        # print("reward", reward)
        iFrame += 1
        # env.render()
        if done[0]:
            obs = env.reset()
            reward_list.append(iReward)
            print("done.................", iFrame, iReward, sum(reward_list) / len(reward_list))
            iFrame = 0
            iReward = 0
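# Caveat (assumption about baselines' VecNormalize): with use_tf left at its default,
# the wrapper keeps a RunningMeanStd in env.ob_rms and continues updating it during this
# evaluation loop; load_path restores network weights only, not normalization stats.
# One workaround is to pickle env.ob_rms at the end of training and restore it here
# before rolling out, e.g.:
#     with open("ob_rms.pkl", "rb") as f:
#         env.ob_rms = pickle.load(f)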
def Train():
    logdir = "baselineLog/ppo" + datetime.datetime.now().strftime("baseliens-%Y-%m-%d-%H-%M-%S-%f")
    logger.configure(logdir, ["tensorboard", "stdout"])

    def EnvFunc(iIndex):
        def InnerFunc():
            oEnv = Env(iIndex)
            return oEnv
        return InnerFunc

    def linear_schedule(initial_value):
        # ppo2 calls this with the remaining-training fraction ("progress"),
        # which goes from 1.0 down to 0.0 over the course of training
        def func(progress):
            return progress * initial_value
        return func

    learning_rate = linear_schedule(3e-4)
    clip_range = linear_schedule(0.2)
    n_timesteps = int(1e8)
    hyperparams = {'nsteps': 1024, 'noptepochs': 10, 'nminibatches': 32,
                   'lr': learning_rate, 'cliprange': clip_range,
                   'vf_coef': 0.5, 'ent_coef': 0.0}

    num_env = 22
    env = SubprocVecEnv([EnvFunc(i) for i in range(num_env)])
    env = VecMonitor(env)
    env = VecNormalize(env, cliprew=5000., use_tf=True)

    act = ppo2.learn(
        network="mlp",
        env=env,
        total_timesteps=n_timesteps,
        save_interval=100,
        log_interval=4,
        # load_path="/tmp/openai-2019-05-30-11-53-14-660522/checkpoints/16000",
        **hyperparams,
        value_network="copy"
    )
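# In baselines' ppo2, `lr` and `cliprange` accept either a constant or a callable of the
# remaining-training fraction (1.0 at the very start of training, approaching 0.0 at the
# end), which is what `linear_schedule` above relies on. A self-contained sanity check:
def _linear_schedule(initial_value):
    # same shape as the nested linear_schedule above
    return lambda progress: progress * initial_value

lr_fn = _linear_schedule(3e-4)
assert lr_fn(1.0) == 3e-4    # full learning rate at the start
assert lr_fn(0.5) == 1.5e-4  # half the learning rate midway through training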
def __call__(self, n_envs=1, *, train=True):
    env_fn = EnvMaker(self.env_id)
    if (
        "AtariEnv" in gym.spec(self.env_id)._entry_point
        and "-ram-" not in self.env_id
    ):
        if n_envs == 1:
            vec_env = DummyVecEnv([env_fn])
        else:
            vec_env = ShmEnvPool(env_fn, n_envs=n_envs)
        vec_env = VecFrameStack(vec_env, 4)
    else:
        if n_envs == 1:
            vec_env = DummyVecEnv([env_fn])
        else:
            vec_env = EnvPool(env_fn, n_envs=n_envs)
    monitor_dir = os.path.join(
        logger.get_dir(), ("train" if train else "eval") + "_monitor"
    )
    os.makedirs(monitor_dir, exist_ok=True)
    vec_env = VecMonitor(vec_env, filename=monitor_dir)
    return vec_env
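# Hypothetical usage of the factory above (the enclosing class name `VecEnvFactory` and
# its constructor are assumptions; only __call__ appears in the original snippet):
make_vec_env = VecEnvFactory("PongNoFrameskip-v4")
train_env = make_vec_env(n_envs=8)               # ShmEnvPool -> VecFrameStack -> VecMonitor
eval_env = make_vec_env(n_envs=1, train=False)   # DummyVecEnv -> VecFrameStack -> VecMonitor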
from baselines import bench
from baselines import logger
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor
from baselines.ppo2 import ppo2

BeraterEnv.BeraterEnv.showStep = False
BeraterEnv.BeraterEnv.showDone = True

print("--- PPO2 learn ---")
env = BeraterEnv.BeraterEnv(currentGraph)
wrapped_env = DummyVecEnv([lambda: BeraterEnv.BeraterEnv(currentGraph)])
monitored_env = VecMonitor(wrapped_env, log_dir)

# https://github.com/openai/baselines/blob/master/baselines/ppo2/ppo2.py
# https://github.com/openai/baselines/blob/master/baselines/common/models.py#L30
model = ppo2.learn(
    env=monitored_env,
    network='mlp',
    num_hidden=50,
    num_layers=2,
    ent_coef=0.01,
    total_timesteps=5000)

model.save('berater-ppo-v8.pkl')
monitored_env.close()
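# Note: `num_hidden` and `num_layers` are not parameters of ppo2.learn itself; learn
# forwards unrecognized keyword arguments to the network builder, and the 'mlp' builder
# in baselines/common/models.py accepts num_layers and num_hidden (see the links above).
##################################################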
import gym
from gym.wrappers.flatten_observation import FlattenObservation
from baselines.ppo2.ppo2 import learn
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

ENV = "FetchReach-v1"


def run_env():
    env = gym.make(ENV)
    env.reset()
    while True:
        action = env.action_space.sample()
        env.step(action)
        env.render()


def make_env():
    env = gym.make(ENV)
    env = FlattenObservation(env)
    return env


if __name__ == '__main__':
    nenvs = 4
    env_fns = [make_env for _ in range(nenvs)]
    env = VecMonitor(SubprocVecEnv(env_fns))
    learn(network='mlp', env=env, total_timesteps=int(1e5), log_interval=1)
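# Quick shape check (a sketch: assumes this gym version's FlattenObservation can flatten
# FetchReach's Dict observation space into a flat Box, and that mujoco-py is installed):
probe = make_env()
print(probe.observation_space)   # expected: a flat Box rather than a Dict
print(probe.reset().shape)       # observation, achieved_goal and desired_goal concatenated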