def init_model(self, env, domain_shape=None, masterPolicy=policies.CnnPolicy,
               subPolicies=policies.CnnPolicy):
    ob_space = env.observation_space
    # The master selects among the sub-policies, so its action space is a
    # Discrete space with one action per sub.
    ac_space = gym.spaces.Discrete(len(self.subs))
    if masterPolicy is policies.CnnPolicy_withDomain:
        assert domain_shape is not None, 'domain policy but no domain shape supplied'
        self.master = ppo2.Model(policy=masterPolicy, ob_space=ob_space, ac_space=ac_space,
                                 nbatch_act=1, nbatch_train=self.nbatch_train,
                                 nsteps=self.nsteps, ent_coef=self.ent, vf_coef=1,
                                 max_grad_norm=0.5, name='Master', domain_shape=domain_shape)
        self.domain = np.zeros((1,) + domain_shape,
                               dtype=self.master.train_model.G.dtype.name)
    else:
        self.master = ppo2.Model(policy=masterPolicy, ob_space=ob_space, ac_space=ac_space,
                                 nbatch_act=1, nbatch_train=self.nbatch_train,
                                 nsteps=self.nsteps, ent_coef=self.ent, vf_coef=1,
                                 max_grad_norm=0.5, name='Master')
    # Each sub-policy acts in the environment's native action space.
    for sub in self.subs:
        sub.init_model(ob_space=ob_space, ac_space=env.action_space, policy=subPolicies)
    self.obs = np.zeros((1,) + ob_space.shape,
                        dtype=self.master.train_model.X.dtype.name)
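# Usage sketch for the hierarchical setup above. `HierarchicalAgent`, its
# constructor arguments, `make_env`, and the domain_shape value are assumptions
# for illustration, not part of this codebase:
#
#     env = make_env('BreakoutNoFrameskip-v4')
#     agent = HierarchicalAgent(nsubs=4)   # hypothetical; must populate self.subs
#     agent.init_model(env,
#                      domain_shape=(8,),  # required with CnnPolicy_withDomain
#                      masterPolicy=policies.CnnPolicy_withDomain,
#                      subPolicies=policies.CnnPolicy)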
def __init__(self, env, nexp, lr, lr_decay=1, cl_decay=1, nminibatches=4,
             n_tr_epochs=4, cliprange=0.1, gamma=0.99, lam=0.95, nenvs=1,
             policy=policies.CnnPolicy):
    ob_space = env.observation_space
    ac_space = env.action_space
    self.exp = 0
    self.nsteps = nexp
    self.batch = self.nsteps * nenvs
    self.n_mb = nminibatches
    self.nbatch_train = self.batch // self.n_mb
    (self.mb_obs, self.mb_rewards, self.mb_actions,
     self.mb_values, self.mb_dones, self.mb_neglogpacs) = [], [], [], [], [], []
    self.lr = lr
    self.lr_decay = lr_decay
    self.cl_decay = cl_decay
    self.states = None
    self.done = [False for _ in range(nenvs)]
    self.gamma = gamma
    self.lam = lam
    self.nenvs = nenvs
    self.n_train_epoch = n_tr_epochs
    self.cliprange = cliprange
    self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                            nbatch_act=nenvs, nbatch_train=self.nbatch_train,
                            nsteps=self.nsteps, ent_coef=0.01, vf_coef=1,
                            max_grad_norm=0.5)
    self.obs = np.zeros((nenvs,) + ob_space.shape,
                        dtype=self.model.train_model.X.dtype.name)
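# Worked example of the batch bookkeeping in __init__ (values assumed): with
# nexp=128 rollout steps, nenvs=8 parallel envs, and nminibatches=4,
#
#     batch = 128 * 8           # 1024 transitions per rollout
#     nbatch_train = 1024 // 4  # 256 samples per gradient step
#
# so each of the n_tr_epochs training epochs performs 4 updates of 256 samples.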
def init_model(self, env, policy=policies.CnnPolicy):
    ob_space = env.observation_space
    ac_space = env.action_space
    self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                            nbatch_act=1, nbatch_train=self.nbatch_train,
                            nsteps=self.nsteps, ent_coef=self.ent_coef, vf_coef=1,
                            max_grad_norm=0.5, name=self.name)
    self.obs = np.zeros((1,) + ob_space.shape,
                        dtype=self.model.train_model.X.dtype.name)
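# Note on the self.obs allocation above: the buffer dtype is taken from the
# model's observation placeholder (train_model.X) so that feeding it back in
# never triggers an implicit cast. A minimal sketch of the same idiom, with the
# shape and dtype assumed for illustration:
#
#     obs = np.zeros((1, 84, 84, 4), dtype='uint8')  # matches an Atari-style CNN input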
def init_model(self, ob_space, ac_space, policy=policies.CnnPolicy, ent_coef=None):
    # Fall back to the entropy coefficient stored at construction time
    # when no explicit value is passed.
    if ent_coef is None:
        ent_coef = self.ent
    self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                            nbatch_act=1, nbatch_train=self.nbatch_train,
                            nsteps=self.nsteps, ent_coef=ent_coef, vf_coef=1,
                            max_grad_norm=0.5, name=self.name)
    self.obs = np.zeros((1,) + ob_space.shape,
                        dtype=self.model.train_model.X.dtype.name)
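# Usage sketch for the fallback above (assumed: `sub`'s __init__ set self.ent,
# self.nbatch_train, self.nsteps, and self.name):
#
#     sub.init_model(ob_space, ac_space)                  # entropy coef = self.ent
#     sub.init_model(ob_space, ac_space, ent_coef=0.05)   # explicit override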