Example #1
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Networks and optimizer
        self.local_network = MlpPolicy(state_size, action_size,
                                       seed).to(device)
        self.target_network = MlpPolicy(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.local_network.parameters(), lr=LR)

        # Replay memory
        self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          seed)
        self.t_step = 0
        self.ok = 1
Example #2
def policy_fn(name, ob_space, ac_space):
    # Build an MLP policy for the spaces passed in, rather than reaching
    # for a global `env` and leaving the arguments unused.
    return MlpPolicy(name=name,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     hid_size=32,
                     num_hid_layers=2)
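
For context (not part of the original example), a factory like policy_fn is usually handed straight to a PPO learner. The sketch below assumes the OpenAI baselines ppo1 stack (baselines.ppo1.pposgd_simple, baselines.common.tf_util); the environment name and hyperparameter values are illustrative only.

# Illustrative usage of policy_fn with baselines' ppo1 trainer (assumed API).
import gym
from baselines.common import tf_util as U
from baselines.ppo1 import pposgd_simple

U.make_session(num_cpu=1).__enter__()
env = gym.make('Pendulum-v0')
pposgd_simple.learn(env, policy_fn,
                    max_timesteps=100000,
                    timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0,
                    optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
env.close()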
Example #3
# Reward Function
reward_fun = rf_info2d_pos

# Policy
pi_non_linear = F.relu
pi_hid_layers = 1
pi_hid_dim = 20

pi_noise = 0.2
pi_noise_clip = 0.5

policy = MlpPolicy(state_dim,
                   action_dim=action_dim,
                   act_min=act_min,
                   act_max=act_max,
                   non_linearity=pi_non_linear,
                   hidden_layers=pi_hid_layers,
                   hidden_dim=pi_hid_dim,
                   output_non_linearity=None,
                   noise=pi_noise,
                   noise_clip=pi_noise_clip)

# Baseline Function
bl = True

bl_lr = 1e-4

bl_non_linear = F.relu
bl_hid_layers = 1
bl_hid_dim = 20

bl_fun = StateValueFunction(state_dim,
                            non_linearity=bl_non_linear,
                            hidden_layers=bl_hid_layers,
                            hidden_dim=bl_hid_dim)
# The original snippet is truncated here; the remaining arguments are assumed
# to mirror the bl_* settings defined above.
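
The noise and noise_clip settings above follow the TD3-style target-policy-smoothing pattern. As a rough illustration of what a policy typically does with them (the MlpPolicy internals are not shown in this snippet), clipped Gaussian noise is added to the action and the result is clamped back into the action bounds:

# Illustrative only: TD3-style clipped action noise. Assumes torch tensors for
# `action` and scalar action bounds; not taken from the MlpPolicy above.
import torch

def smooth_action(action, act_min, act_max, noise=0.2, noise_clip=0.5):
    eps = (torch.randn_like(action) * noise).clamp(-noise_clip, noise_clip)
    return (action + eps).clamp(act_min, act_max)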
Example #4
	def __init__(self, ob_space, ac_space, c_entropy, c_vf, session, max_grad_norm=0.5):

		sess = session

		agent_model = MlpPolicy('Mlp_agent', ob_space, ac_space, session)
		pi = agent_model.pi
		old_pi = agent_model.oldpi
		v = agent_model.vf
		critic = agent_model.critic

		#r = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="reward")
		a = tf.placeholder(dtype=tf.float32, shape=[None]+list(ac_space.shape), name="a")
		adv = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="advantage")
		target_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="target_v")
		old_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="old_v")
		LR = tf.placeholder(dtype=tf.float32, name="lr")
		CLIP_RANGE = tf.placeholder(dtype=tf.float32, shape=(), name="cliprange")
		TAU_LOCAL = tf.placeholder(dtype=tf.float32, shape=(), name="TAU_LOCAL")
		TAU_GLOBAL = tf.placeholder(dtype=tf.float32, shape=(), name="TAU_GLOBAL")

		with tf.variable_scope('losses'):
			NegLogPac = pi.neglogp(a)
			OldNegLogPac = old_pi.neglogp(a)
			ratio = tf.exp(OldNegLogPac - NegLogPac)
			surr1 = adv * ratio
			surr2 = adv * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE, 1.0 + CLIP_RANGE)
			pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

			entropy = tf.reduce_mean(pi.entropy())
			v_clipped = old_v + tf.clip_by_value(v - old_v, -CLIP_RANGE, CLIP_RANGE)
			vf_losses1 = tf.square(v - target_v)
			vf_losses2 = tf.square(v_clipped - target_v)
			vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
			simple_vf_loss = tf.reduce_mean(vf_losses1)

			approxkl = 0.5 * tf.reduce_mean(tf.square(NegLogPac - OldNegLogPac))
			clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIP_RANGE)))
			
			# critic loss
			q_loss = tf.reduce_mean(tf.square(critic - target_v))

			# actor loss
			m = tf.reduce_mean(critic, keepdims=True)
			devs_squared = tf.square(critic - m)
			reduced_var = tf.reduce_mean(devs_squared)
			reduced_std = tf.sqrt(reduced_var)
			normalized_q = (critic - m) / (reduced_std + 1e-8)  # epsilon avoids div-by-zero when all critic values match
			actor_loss = -tf.reduce_mean(normalized_q) 


			#loss = pg_loss - entropy * c_entropy + vf_loss * c_vf
			loss = pg_loss + simple_vf_loss - entropy * c_entropy + actor_loss*0.5
			
			#tf.summary.scalar('total_loss', loss)
			#tf.summary.scalar('pol_loss', pg_loss)
			#tf.summary.scalar('vf_loss', simple_vf_loss)


		def _grads_placeholder_trainopt(los, para):
			# Returns: flattened (clipped) gradients of `los` w.r.t. `para`, a
			# placeholder to feed modified gradients back in, and the Adam train
			# op that applies the fed gradients to `para`.
			grads = tf.gradients(los, para)
			if max_grad_norm is not None:
				grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
			flatten_grads = tf.concat(axis=0, 
				values=[tf.reshape(gg, shape=[int(np.prod(gg.shape))]) for gg in grads])
			feed_grads = tf.placeholder(dtype=tf.float32, shape=flatten_grads.shape, name='feed_grads')

			with tf.name_scope("Apply_grads"):
				update_list = []
				start = 0
				for p in para:
					end = start + int(np.prod(p.shape))
					update_list.append(tf.reshape(feed_grads[start:end], shape=p.shape))
					start = end
				# create grad-params pair list
				grads_list = list(zip(update_list, para))
				optimizer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
				train = optimizer.apply_gradients(grads_list)
			return flatten_grads, feed_grads, train

		# update old pi
		pi_params = agent_model.get_pol_variables()
		oldpi_params = agent_model.get_oldpol_variables()
		with tf.variable_scope('update_old_pi'):
			_updatepi = [old.assign(old*(1.0-TAU_LOCAL) + new*TAU_LOCAL) for old, new in zip(oldpi_params, pi_params)]

		self.cri_params = agent_model.get_critic_variables()
		self.pol_and_v_params = agent_model.get_ppo_variables()
		self.all_params = agent_model.get_variables()
		#self.train_params = tf.trainable_variables(scope=agent_model.scope)
		
		with tf.name_scope("critic_grads"):
			critic_grads, feed_critic, c_train = _grads_placeholder_trainopt(q_loss, self.cri_params)
		with tf.name_scope("pol_grads"):
			pol_grads, feed_pol, p_train = _grads_placeholder_trainopt(loss, self.pol_and_v_params)

		# get flattened agent parameters
		self.flat_params = tf.concat(axis=0,
			values=[tf.reshape(ap, shape=[int(np.prod(ap.shape))]) for ap in self.all_params])
		
		# placeholder for flatten params
		feed_params = tf.placeholder(dtype=tf.float32, shape=self.flat_params.shape, name='feed_params')

		## opt for params assignment
		with tf.name_scope("Apply_params"):
			p_list = []
			start = 0
			for p in self.all_params:
				end = start + int(np.prod(p.shape))
				p_list.append(tf.reshape(feed_params[start:end], shape=p.shape))
				start = end
			_apply_params = [old.assign(new*TAU_GLOBAL + (1-TAU_GLOBAL)*old) for old, new in zip(self.all_params, p_list)]

		# minibatch train
		def train(lr, cliprange, mb_obs, mb_acs, mb_adv, mb_vs, mb_targv, use_global_grad, apply_noise, scale_by_procs=True):
			mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)  # epsilon guards against zero std

			def _train(grads, grads_placeholder, opt, feeddict):
				local_grad = sess.run(grads, feed_dict=feeddict)
				assert local_grad.ndim == 1
				if apply_noise:
					local_grad += np.random.normal(loc=0, scale=0.05, size=local_grad.shape)
				final_grad = local_grad.copy()
				if use_global_grad:
					MPI.COMM_WORLD.Allreduce(local_grad, final_grad, op=MPI.SUM)
					if scale_by_procs:
						final_grad = final_grad / MPI.COMM_WORLD.Get_size()
				sess.run(opt, feed_dict={LR: lr, grads_placeholder: final_grad})

			c_train_dict = {agent_model.ob: mb_obs,
							agent_model.pi: mb_acs,
							target_v: mb_targv}
			_train(critic_grads, feed_critic, c_train, c_train_dict)

			pol_train_dict = {agent_model.ob: mb_obs,
						a: mb_acs,
						adv: mb_adv,
						target_v: mb_targv,
						old_v: mb_vs,
						CLIP_RANGE: cliprange}

			# get loss
			ploss, vloss = sess.run([pg_loss, simple_vf_loss], feed_dict=pol_train_dict)
			_train(pol_grads, feed_pol, p_train, pol_train_dict)
			
			return ploss, vloss

		# update old pi with pi
		def update_old_pi(tau=1.0):
			sess.run(_updatepi, feed_dict={TAU_LOCAL: tau})

		def sync_params(tau=1.0):
			# get local params
			local_p = sess.run(self.flat_params)
			# prepare global buffer
			global_p = np.zeros_like(local_p)
			# sync
			MPI.COMM_WORLD.Allreduce(local_p, global_p, op=MPI.SUM)
			# scale params with agent_number
			global_p = global_p / MPI.COMM_WORLD.Get_size()
			sess.run(_apply_params, feed_dict={feed_params: global_p, TAU_GLOBAL:tau})
		
		def apply_noise(sd=0.05):
			p = sess.run(self.flat_params)
			p += np.random.normal(loc=0, scale=sd, size=p.shape)
			sess.run(_apply_params, feed_dict={feed_params: p, TAU_GLOBAL: 1.0})		

		def get_params():
			return sess.run(self.flat_params)

		def apply_params(p, tau=1.0):
			sess.run(_apply_params, feed_dict={feed_params: p, TAU_GLOBAL: tau})

		self.train = train
		self.update_old_pi = update_old_pi
		self.sync_params = sync_params
		self.agent_model = agent_model
		self.get_params = get_params
		self.apply_params = apply_params
		self.apply_noise = apply_noise
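
For context (not part of the original), an instance of this class, here called agent, would typically be driven per MPI worker roughly as in the sketch below; iterate_minibatches, rollout, and the constants are hypothetical stand-ins for whatever the surrounding training loop provides.

# Illustrative update round for one MPI worker; `agent` is an instance of the
# class above, `iterate_minibatches`/`rollout` are hypothetical helpers.
for epoch in range(10):                                    # optimization epochs
    for mb_obs, mb_acs, mb_adv, mb_vs, mb_targv in iterate_minibatches(rollout, 64):
        ploss, vloss = agent.train(lr=3e-4, cliprange=0.2,
                                   mb_obs=mb_obs, mb_acs=mb_acs, mb_adv=mb_adv,
                                   mb_vs=mb_vs, mb_targv=mb_targv,
                                   use_global_grad=True,   # Allreduce gradients across workers
                                   apply_noise=False)
agent.update_old_pi(tau=1.0)    # move old_pi onto the current pi
agent.sync_params(tau=1.0)      # average all parameters across MPI workers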
Example #5
class DQN:
    def __init__(self, state_size, action_size, seed):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Networks and optimizer
        self.local_network = MlpPolicy(state_size, action_size,
                                       seed).to(device)
        self.target_network = MlpPolicy(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.local_network.parameters(), lr=LR)

        # Replay memory
        self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          seed)
        self.t_step = 0
        self.ok = 1

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t_step += 1

        if self.t_step % UPDATE_EVERY == 0:
            if len(self.replay_buffer) > BATCH_SIZE:  # if enough samples
                self.learn(self.replay_buffer.sample(), GAMMA)

    def predict(self, state, eps=0.):
        """Returns action from e-greedy policy"""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Forward pass (no gradient)
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Action pick
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experience_batch, gamma):
        states, actions, rewards, next_states, dones = experience_batch

        # Max predicted Q values for next states, from the target network:
        # .max(1)[0] gives shape [batch_size]; .unsqueeze(1) restores [batch_size, 1]
        Q_targets_next = self.target_network(next_states).detach().max(
            1)[0].unsqueeze(1)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get predicted Q values from local model
        Q_predictions = self.local_network(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_predictions, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.soft_update(self.local_network, self.target_network, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Slowly update target model's parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
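
A minimal training-loop sketch for the class above, assuming the classic Gym step API and that the constants referenced in the class (LR, BUFFER_SIZE, BATCH_SIZE, UPDATE_EVERY, GAMMA, TAU) plus MlpPolicy and ReplayBuffer are defined elsewhere; the environment and epsilon schedule are illustrative.

# Illustrative only: driving the DQN agent above in a Gym environment.
import gym

env = gym.make('CartPole-v1')                      # any discrete-action env
agent = DQN(state_size=env.observation_space.shape[0],
            action_size=env.action_space.n,
            seed=0)

eps = 1.0
for episode in range(500):
    state = env.reset()                            # classic Gym API (obs only)
    done = False
    while not done:
        action = agent.predict(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)                   # decay exploration rate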
Example #6
    def __init__(self,
                 ob_space,
                 ac_space,
                 c_entropy,
                 c_vf,
                 session,
                 max_grad_norm=0.5):

        sess = session

        agent_model = MlpPolicy('Mlp_agent', ob_space, ac_space, session)
        pi = agent_model.pi
        old_pi = agent_model.oldpi
        v = agent_model.vf

        a = tf.placeholder(dtype=tf.float32,
                           shape=[None] + list(ac_space.shape),
                           name="a")
        adv = tf.placeholder(dtype=tf.float32,
                             shape=[None, 1],
                             name="advantage")
        target_v = tf.placeholder(dtype=tf.float32,
                                  shape=[None, 1],
                                  name="target_v")
        old_v = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="old_v")
        LR = tf.placeholder(dtype=tf.float32, name="lr")
        CLIP_RANGE = tf.placeholder(dtype=tf.float32,
                                    shape=(),
                                    name="cliprange")
        TAU = tf.placeholder(dtype=tf.float32, shape=(), name="TAU")

        with tf.variable_scope('losses'):
            NegLogPac = pi.neglogp(a)
            OldNegLogPac = old_pi.neglogp(a)
            ratio = tf.exp(OldNegLogPac - NegLogPac)
            surr1 = adv * ratio
            surr2 = adv * tf.clip_by_value(ratio, 1.0 - CLIP_RANGE,
                                           1.0 + CLIP_RANGE)
            pg_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

            entropy = tf.reduce_mean(pi.entropy())
            v_clipped = old_v + tf.clip_by_value(v - old_v, -CLIP_RANGE,
                                                 CLIP_RANGE)
            vf_losses1 = tf.square(v - target_v)
            vf_losses2 = tf.square(v_clipped - target_v)
            vf_loss = 0.5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            simple_vf_loss = tf.reduce_mean(vf_losses1)

            approxkl = 0.5 * tf.reduce_mean(
                tf.square(NegLogPac - OldNegLogPac))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIP_RANGE)))
            #loss = pg_loss - entropy * c_entropy + vf_loss * c_vf
            loss = pg_loss + simple_vf_loss - entropy * c_entropy

        pi_params = agent_model.get_pol_variables()
        oldpi_params = agent_model.get_oldpol_variables()
        with tf.variable_scope('update_old_pi'):
            _updatepi = [
                old.assign(old * (1.0 - TAU) + new * TAU)
                for old, new in zip(oldpi_params, pi_params)
            ]

        params = tf.trainable_variables(scope=agent_model.scope)
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = self.optimizer.apply_gradients(grads)
        """
		def global_train(*, lr, cliprange, bobs, bacs, badv, bvs, btargv, scale_by_procs = True):
			badv = (badv - badv.mean()) / (badv.std() + 1e-8)
			feeddict={pi.ob: bobs, old_pi.ob: bobs, a: bacs, adv: badv, old_v: bvs, target_v: btargv, LR: lr, CLIP_RANGE: cliprange}
			localg = sess.run(tf.gradients(loss, params), feed_dict=feeddict)
			globalg = np.zeros_like(localg)
			MPI.COMM_WORLD.Allreduce(localg, globalg, op=MPI.SUM)
			if scale_by_procs:
				globalg /= MPI.COMM_WORLD.Get_size()
			if max_grad_norm is not None:
				globalg, _grad_norm = tf.clip_by_global_norm(globalg, max_grad_norm)
			grads = list(zip(globalg, params))
			sess.run(optimizer.apply_gradients(grads))
		"""
        def train(lr, cliprange, mb_obs, mb_acs, mb_adv, mb_vs, mb_targv):
            mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)  # epsilon guards against zero std
            feeddict = {
                agent_model.ob: mb_obs,
                a: mb_acs,
                adv: mb_adv,
                target_v: mb_targv,
                old_v: mb_vs,
                LR: lr,
                CLIP_RANGE: cliprange
            }
            sess.run(_train, feed_dict=feeddict)
            return sess.run([pg_loss, simple_vf_loss], feed_dict=feeddict)

        def update_old_pi(tau=0.5):
            sess.run(_updatepi, feed_dict={TAU: tau})

        self.train = train
        #self.global_train = global_train
        self.update_old_pi = update_old_pi
        self.agent_model = agent_model
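
For context (not part of the original), this single-process wrapper is used the same way as the distributed variant, minus the MPI plumbing; `agent` stands for an instance of the class, and the rollout arrays (obs, acs, adv, values, returns) are assumed to come from a collector that is not shown here.

# Illustrative PPO update with the wrapper above.
for _ in range(10):                        # optimization epochs over the batch
    ploss, vloss = agent.train(lr=3e-4, cliprange=0.2,
                               mb_obs=obs, mb_acs=acs, mb_adv=adv,
                               mb_vs=values, mb_targv=returns)
agent.update_old_pi(tau=1.0)               # refresh old_pi from pi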