Example #1
def model(env_name, hidden_sizes, latest, algo, Path):
    """
	env_name: Environment name
	hidden_sizes: Hidden layers/nodes for neural network
	latest: Load latest model if true
	algo: algorithm used for training model
	Path: path to custom model
	"""

    env = gym.make(env_name)

    if latest:
        if algo == 'VPG':
            PATH2 = os.path.join(os.path.dirname(__file__), '..', 'models', 'VPG')
        elif algo == 'PPO':
            PATH2 = os.path.join(os.path.dirname(__file__), '..', 'models', 'PPO')

        # Use the most recently saved model in that directory
        print(PATH2)
        models = glob.glob(f"{PATH2}/*")
        latest_model = max(models, key=os.path.getctime)

    else:
        # Use the custom model path passed in
        latest_model = Path

    if algo == 'VPG':
        # Define Neural Network
        def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
            # Build a feedforward neural network.
            layers = []
            for j in range(len(sizes) - 1):
                act = activation if j < len(sizes) - 2 else output_activation
                layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
            return nn.Sequential(*layers)

        obs_dim = env.observation_space.shape[0]
        n_acts = env.action_space.n

        # Create Neural Network
        logits_net = mlp(sizes=[obs_dim] + hidden_sizes + [n_acts])
        logits_net.load_state_dict(torch.load(latest_model))

        net = logits_net

    elif algo == 'PPO':
        import core

        # Define Neural Network
        ac_kwargs = dict(hidden_sizes=hidden_sizes)
        ac = core.MLPActorCritic(env.observation_space, env.action_space,
                                 **ac_kwargs)
        ac.load_state_dict(torch.load(latest_model))

        net = ac

    return net
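
A minimal usage sketch for the loader above. It assumes the function is importable alongside the gym, torch, torch.nn, os, and glob imports it relies on, that a trained VPG model already exists under models/VPG, and that the environment name and hidden sizes shown here are purely illustrative:

import gym
import torch

# Hedged sketch: 'CartPole-v0' and hidden_sizes=[32] are assumptions, not
# values taken from the project.
net = model('CartPole-v0', hidden_sizes=[32], latest=True, algo='VPG', Path=None)

env = gym.make('CartPole-v0')
obs = env.reset()
with torch.no_grad():
    # The VPG branch returns the logits network, so pick the highest-logit action
    action = torch.argmax(net(torch.as_tensor(obs, dtype=torch.float32))).item()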
Example #2
    def __init__(self,
                 observation_space,
                 action_space,
                 ac_kwargs,
                 gamma=0.99,
                 alpha=0.2,
                 lr=1e-3,
                 polyak=0.995):
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.polyak = polyak

        self.ac = core.MLPActorCritic(observation_space, action_space,
                                      **ac_kwargs)
        self.ac_targ = deepcopy(self.ac)
        self.ac.to(device)
        self.ac_targ.to(device)
        for p in self.ac_targ.parameters():
            p.requires_grad = False
        self.q_params = itertools.chain(self.ac.q1.parameters(),
                                        self.ac.q2.parameters())

        self.dynam = dynam.MLPModel(observation_space.shape[0],
                                    action_space.shape[0])
        self.dynam.to(device)

        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        print('\nInitial parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
              var_counts)

        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.m_optimizer = Adam(self.dynam.parameters(), lr=self.lr * 1.0)
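
The class this constructor belongs to is not shown in the fragment. A hedged instantiation sketch, using 'Agent' as a placeholder class name and assuming core, dynam, device, Adam, deepcopy, and itertools are imported as the snippet implies:

import gym

# Hedged sketch: 'Agent' and the hidden sizes are placeholders/assumptions.
env = gym.make('Pendulum-v0')  # any continuous-action Gym environment

agent = Agent(env.observation_space,
              env.action_space,
              ac_kwargs=dict(hidden_sizes=(256, 256)),
              gamma=0.99,
              alpha=0.2,
              lr=1e-3,
              polyak=0.995)

# The frozen target network (ac_targ) is typically updated elsewhere by
# polyak averaging: p_targ = polyak * p_targ + (1 - polyak) * p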
Example #3
def vpg(env,
        hidden_sizes,
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Vanilla Policy Gradient (with GAE-Lambda for advantage estimation)
    Args:
        env : An environment instance. The environment must satisfy the OpenAI Gym API.
        hidden_sizes : Hidden layer sizes for the MLP actor-critic (``core.MLPActorCritic``), a PyTorch Module
            with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.
            The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch.
        epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform.
        gamma (float): Discount factor. (Always between 0 and 1.)
        pi_lr (float): Learning rate for policy optimizer.
        vf_lr (float): Learning rate for value function optimizer.
        train_v_iters (int): Number of gradient descent steps to take on value function per epoch.
        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.)
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # random seeds
    seed += 1000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Environment dimensions
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create the actor-critic model
    ac = core.MLPActorCritic(env.observation_space, env.action_space,
                             hidden_sizes)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer. With multiple processes, each process's buffer holds local_steps_per_epoch steps
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim,
                    act_dim,
                    size=local_steps_per_epoch,
                    gamma=gamma,
                    lam=lam)

    # optimizer
    pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    # logger.setup_pytorch_saver(ac)

    # interaction
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(
                o, dtype=torch.float32))  # (act_dim,), (), ()
            next_o, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # save
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:  # timeout=True, terminal=True, epoch_ended=True/False
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0  # re-initialize for the next trajectory

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
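
A hedged call sketch for vpg() above, assuming core, VPGBuffer, EpochLogger, the MPI helpers, and update() are importable as in the snippet; the environment name, network sizes, and logger output directory are illustrative only:

import gym

# Hedged sketch: environment, hidden sizes, and output path are assumptions.
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    vpg(env,
        hidden_sizes=(64, 64),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        logger_kwargs=dict(output_dir='data/vpg_cartpole', exp_name='vpg_cartpole'))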
Example #4
def main(Test='NoRTA', TrainingCases=['NoRTA'], RANGE=1000, ac_kwargs=dict(hidden_sizes=[64,64])):

	"""
	Test: Test case
		'NoRTA', 'SVL', 'SBSF', or 'ASIF'
	TrainingCases: List of training cases to evaluate
		'NoRTA', 'NoRTAHP', 'SVL', 'SBSF', or 'ASIF'
	RANGE: Test range (m)
		1000 or 10000
	ac_kwargs: Neural network parameters
		dictionary with hidden layer sizes, ex: dict(hidden_sizes=[64,64])
	NN MODELS (below): Saved trained models
	"""

	##### NN MODELS #####
	NoRTA_model = "NoRTA2.dat"
	NoRTAHP_model = "NoRTAHP2.dat"
	SVL_model = "Velocity2.dat"
	SBSF_model = "ISimplex2.dat"
	ASIF_model = "IASIF2.dat"
	#####################

	env = gym.make('spacecraft-docking-continuous-v0')

	# Defines test points
	if RANGE == 10000:
		Distance = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
	elif RANGE == 1000:
		Distance = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

	Angle = [1.57, 5.5, 4.71, 2.36, 3.93, 0.79, 1.18, 3.14, 4.32, 0]
	Vx = [-0.1, -0.25, 0.25, 0.1, -0.5, 0.5, -0.75, 0.75, -1, 1]
	Vy = [0.1, -0.1, -0.25, 0.25, 0.5, -0.5, -0.75, 1, 0.75, -1]

	# Import ASIF
	if Test == 'SVL':
		from Simple_velocity_limit import RTA
	elif Test == 'SBSF':
		from ISimplex import RTA
	elif Test == 'ASIF':
		from IASIF import RTA

	if Test == 'SVL' or Test == 'SBSF' or Test == 'ASIF':
		# Call ASIF class
		rta = RTA(env)

		# Define action
		def RTA_act(obs, act):
			# Clip action to be within accepted range
			act = np.clip(act, -env.force_magnitude, env.force_magnitude)
			# Rearrange observation state vector
			x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0]
			# Rearrange action vector
			u_des = np.array([[act[0]], [act[1]], [0]])
			# Call asif function
			u = rta.main(x0, u_des)
			# Extract relevant data
			new_act = [u[0,0], u[1,0]]
			# Determine if RTA adjusted action
			if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001:
				# Set flag for tracking/reward function
				env.RTA_on = False
			else:
				env.RTA_on = True
			# Return new action
			return new_act

	for Train in TrainingCases:
		# Load neural network
		ac = core.MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs)
		# Load appropriate model
		if Train == 'NoRTA':
			ac.load_state_dict(torch.load(f"{PATH}/{NoRTA_model}"))
		elif Train == 'NoRTAHP':
			ac.load_state_dict(torch.load(f"{PATH}/{NoRTAHP_model}"))
		elif Train == 'SVL':
			ac.load_state_dict(torch.load(f"{PATH}/{SVL_model}"))
		elif Train == 'SBSF':
			ac.load_state_dict(torch.load(f"{PATH}/{SBSF_model}"))
		elif Train == 'ASIF':
			ac.load_state_dict(torch.load(f"{PATH}/{ASIF_model}"))

		# Use best action (mean of policy's probability distribution)
		def get_best_action(obs):
			with torch.no_grad():
				act = ac.pi.mu_net(torch.as_tensor(obs, dtype=torch.float32)).numpy()
			return act

		# Set variables
		env.termination_condition = True # Prints cause of termination
		RTA_percent = 0 # Tracks percentage of time RTA is on
		steps = 0 # Tracks number of steps

		# for 10 test points
		for i2 in range(len(Distance)):
			# Reset variables
			done = False
			env.reset()
			# Used to track trajectories for plots
			rH = []
			vH = []
			x = []
			y = []

			# Reset environment conditions for each test case
			theta = Angle[i2]
			env.position_deputy = Distance[i2]
			env.x_deputy = env.position_deputy*math.cos(theta)
			env.y_deputy = env.position_deputy*math.sin(theta)
			x_dot = Vx[i2]
			y_dot = Vy[i2]
			env.rH = env.position_deputy
			env.state = np.array([env.x_deputy, env.y_deputy, x_dot, y_dot])
			obs = env.state
			env.x_threshold = 1.5 * env.position_deputy
			env.y_threshold = 1.5 * env.position_deputy
			if RANGE == 1000 and Test != 'SBSF':
				env.max_control = 750

			# Run episode
			while not done:
				# Get best action
				act = get_best_action(obs)
				# Pass through RTA
				if Test == 'SVL' or Test == 'ASIF' or Test == 'SBSF':
					act = RTA_act(obs,act)
				# Take step in environment
				obs, _, done, _ = env.step(act)
				# Track if velocity violated constraint (No RTA)
				if Test == 'NoRTA':
					over_max_vel, _, _ = env.check_velocity(act[0], act[1])
					if over_max_vel:
						RTA_percent += 1
				# Track if RTA is on
				elif Test == 'SVL' or Test == 'ASIF' or Test == 'SBSF':
					if env.RTA_on:
						RTA_percent += 1
				steps += 1

				# Track for plotting
				rH.append(env.rH)
				vH.append(env.vH)
				x.append(obs[0])
				y.append(obs[1])

			# Plot trajectories
			plt.figure(1)
			if Train == 'NoRTAHP':
				dash = 'r'
			elif Train == 'NoRTA':
				dash = 'darkorange'
			elif Train == 'SVL':
				dash = 'b'
			elif Train == 'SBSF':
				dash = 'lime'
			elif Train == 'ASIF':
				dash = 'm'
			plt.plot(rH,vH,dash)
			plt.figure(2)
			plt.plot(x,y,dash)

		# Print RTA on percentage
		print(f"{Train} Average RTA % On: {RTA_percent/steps*100:.1f} %")

	# Plot setup
	plt.figure(1)
	plt.plot([0, 10000],[0.2, 20.74], '--', color='black',label='Max Velocity Limit')
	plt.plot([0, 10000],[-0.2, 4.935], '--', color='coral',label='Min Velocity Limit')
	# plt.title('Velocity vs. Position')
	if RANGE == 1000:
		plt.ylim([0, 2.5])
		plt.xlim([0, 1200])
	elif RANGE == 10000:
		plt.xlim([0, 10000])
		plt.ylim([0, 20])
	plt.xlabel('Distance from Chief (m)')
	plt.ylabel('Relative Velocity (m/s)')
	# plt.legend()
	plt.grid(True)

	plt.figure(2)
	# plt.title('Trajectories')
	if RANGE == 1000:
		plt.xlim([-1200, 1200])
		plt.ylim([-1200, 1200])
	elif RANGE == 10000:
		plt.xlim([-11000, 11000])
		plt.ylim([-11000, 11000])
	plt.plot(0,0,'k*', ms=10)
	plt.grid(True)
	plt.xlabel('X position (m)')
	plt.ylabel('Y position (m)')

	plt.figure(3)
	plt.plot(0,0,color='r', linewidth=2)
	plt.plot(0,0,color='darkorange', linewidth=2)
	plt.plot(0,0,color='b', linewidth=2)
	plt.plot(0,0,color='lime', linewidth=2)
	plt.plot(0,0,color='m', linewidth=2)
	plt.plot(0,0,'--',color='black')
	plt.plot(0,0,'--',color='coral')
	plt.axis('off')
	plt.legend(['Training with No RTA - HP','Training with No RTA','Training with SVL','Training with SBSF','Training with ASIF','Max Velocity Limit','Min Velocity Limit'], loc='upper center')

	plt.show()
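
A hedged invocation sketch for main() above. It assumes the module-level imports used in the function body (gym, numpy, math, torch, matplotlib, core) plus a PATH variable pointing at the directory that holds the .dat model files listed in the docstring; the chosen cases are illustrative:

# Hedged sketch: evaluate policies trained without RTA and with the SVL filter,
# running under the ASIF run time assurance filter over the 1000 m test range.
if __name__ == '__main__':
	main(Test='ASIF',
	     TrainingCases=['NoRTA', 'SVL'],
	     RANGE=1000,
	     ac_kwargs=dict(hidden_sizes=[64, 64]))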
Example #5
    # Make simulation environment
    env_fn = lambda: gym.make(args.env)
    env, test_env = env_fn(), env_fn()

    # Limit test environment steps in case agent is stuck
    test_env._max_episode_steps = 800

    # Save videos each testing iteration
    test_env = gym.wrappers.Monitor(test_env, "test-recordings", force=True)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Instantiate Actor Critic Neural Net and Target Network
    net = core.MLPActorCritic(env.observation_space, env.action_space)
    targ_net = deepcopy(net)

    # Freeze target network
    for p in targ_net.parameters():
        p.requires_grad = False

    # Experience / Memory Buffer
    replay_buffer = core.ReplayBuffer(obs_dim=obs_dim,
                                      act_dim=act_dim,
                                      size=int(1e6))

    # Count number of parameters in network
    param_counts = tuple(core.count_vars(module) for module in [net.pi, net.q])
    logger.log('\nNumber of Parameters: \t pi: %d, \t q: %d\n' % param_counts)
    # Set up optimization functions for policy and q-function
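
The fragment stops before the optimizers are created. A minimal continuation sketch, assuming a DDPG-style setup with Adam over net.pi and net.q and learning rates taken from the script's args (the attribute names and defaults here are assumptions, not the project's exact code):

    # Hedged continuation sketch: learning-rate arguments are illustrative.
    from torch.optim import Adam

    pi_lr = getattr(args, 'pi_lr', 1e-3)
    q_lr = getattr(args, 'q_lr', 1e-3)

    pi_optimizer = Adam(net.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(net.q.parameters(), lr=q_lr)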