Example #1
	def init_model(self, env, domain_shape=None, masterPolicy=policies.CnnPolicy, subPolicies=policies.CnnPolicy):
		# self.env = gym.make(env)
		# if self.env.__repr__() != '<TimeLimit<NChainEnv<NChain-v0>>>':
		# 	self.env = ClipRewardEnv(FrameStack(WarpFrame(self.env), 4))
		# else:
		# 	self.env = self.env.unwrapped
		# 	self.env.unwrapped.n = 10000  #if nchain environment set N to 10 000
		# 	self.env = strechedObSpaceWrapper(self.env)
		# 	#TODO Should not be hardcoded
		# 	self.env.unwrapped.slip = 0

		ob_space = env.observation_space
		# The master's discrete actions index into self.subs, i.e. it chooses which sub-policy acts.
		ac_space = gym.spaces.Discrete(len(self.subs))
		if masterPolicy == policies.CnnPolicy_withDomain:  # alternatively: isinstance(masterPolicy, policies.CnnPolicy_withDomain)
			assert domain_shape is not None, 'domain policy but no domain shape supplied'
			self.master = ppo2.Model(policy=masterPolicy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1,
									 nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=self.ent, vf_coef=1,
									 max_grad_norm=0.5, name='Master', domain_shape=domain_shape)
			self.domain = np.zeros((1,) + domain_shape, dtype=self.master.train_model.G.dtype.name)
		else:
			self.master = ppo2.Model(policy=masterPolicy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1,
								nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=self.ent, vf_coef=1,
								max_grad_norm=0.5, name='Master')
		for sub in self.subs:
			sub.init_model(ob_space=ob_space, ac_space=env.action_space, policy=subPolicies)
		# self.subs = [ppo2.Model(policy=masterPolicy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1,
		# 						nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=0.01, vf_coef=1,
		# 						max_grad_norm=0.5, name=f'Sub_{i}') for i in range(self.nsubs)]
		# Observation buffer for a single environment, typed to match the model's input placeholder.
		self.obs = np.zeros((1,) + ob_space.shape, dtype=self.master.train_model.X.dtype.name)
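In Example #1 the master's action space is Discrete(len(self.subs)), so each action sampled from the master is an index selecting which sub-policy acts in the environment. The dispatch itself is not part of the snippet; the following is only a minimal sketch of how it could look, assuming a baselines-style step() on the models, ignoring the optional domain input, and using a hypothetical act() method name:

	def act(self):
		# Hypothetical dispatch sketch, not part of the original source.
		# The master picks a sub-policy index for the current observation;
		# the chosen sub-policy then picks the environment-level action.
		master_action, master_value, _, master_neglogp = self.master.step(self.obs)
		sub = self.subs[int(master_action[0])]
		env_action, sub_value, _, sub_neglogp = sub.model.step(self.obs)
		return env_action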
Example #2
	def __init__(self, env, nexp, lr, lr_decay=1, cl_decay=1, nminibatches=4, n_tr_epochs=4, cliprange=0.1, gamma=0.99, lam=0.95, nenvs=1, policy=policies.CnnPolicy):
		ob_space = env.observation_space
		ac_space = env.action_space

		self.exp = 0
		self.nsteps = nexp
		self.batch = self.nsteps * nenvs  # experiences collected per update
		self.n_mb = nminibatches
		self.nbatch_train = self.batch // self.n_mb  # minibatch size used during training
		self.mb_obs, self.mb_rewards, self.mb_actions, self.mb_values, self.mb_dones, self.mb_neglogpacs = [], [], [], [], [], []
		self.lr = lr
		self.lr_decay = lr_decay
		self.cl_decay = cl_decay
		self.states = None
		self.done = [False for _ in range(nenvs)]
		self.gamma = gamma
		self.lam = lam
		self.nenvs = nenvs


		self.n_train_epoch = n_tr_epochs
		self.cliprange = cliprange
		self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
								nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=0.01, vf_coef=1, max_grad_norm=0.5)

		self.obs = np.zeros((nenvs,) + ob_space.shape, dtype=self.model.train_model.X.dtype.name)
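Example #2 is a full constructor rather than a deferred init_model, so an agent of this kind can be built straight from a wrapped environment. A rough usage sketch follows; the class name PPOAgent, the environment id, and the chosen hyperparameter values are illustrative assumptions, not from the source, while the wrappers are the ones referenced in the commented-out code of the other examples:

# Hypothetical usage; class name and hyperparameter values are assumed for illustration.
import gym
from baselines.common.atari_wrappers import WarpFrame, FrameStack, ClipRewardEnv

env = ClipRewardEnv(FrameStack(WarpFrame(gym.make('BreakoutNoFrameskip-v4')), 4))
agent = PPOAgent(env, nexp=128, lr=2.5e-4, nminibatches=4, n_tr_epochs=4,
                 cliprange=0.1, gamma=0.99, lam=0.95, nenvs=1,
                 policy=policies.CnnPolicy)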
Example #3
	def init_model(self, env, policy=policies.CnnPolicy):
		# self.env = gym.make(env)
		# if self.env.__repr__() != '<TimeLimit<NChainEnv<NChain-v0>>>':
		# 	self.env = ClipRewardEnv(FrameStack(WarpFrame(self.env), 4))
		# else:
		# 	self.env = self.env.unwrapped
		# 	self.env.unwrapped.n = 10000  #if nchain environment set N to 10 000
		# 	self.env = strechedObSpaceWrapper(self.env)
		# 	#TODO Should not be hardcoded
		# 	self.env.unwrapped.slip = 0


		ob_space = env.observation_space
		ac_space = env.action_space
		self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1,
								nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=self.ent_coef, vf_coef=1,
								max_grad_norm=0.5, name=self.name)
		self.obs = np.zeros((1,) + ob_space.shape, dtype=self.model.train_model.X.dtype.name)
Example #4
	def init_model(self, ob_space, ac_space, policy=policies.CnnPolicy, ent_coef=0.01):

		self.model = ppo2.Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=1,
								nbatch_train=self.nbatch_train, nsteps=self.nsteps, ent_coef=ent_coef, vf_coef=1,
								max_grad_norm=0.5, name=self.name)
		self.obs = np.zeros((1,) + ob_space.shape, dtype=self.model.train_model.X.dtype.name)