Example #1
def model(env_name, hidden_sizes, latest, algo, Path):
    """
	env_name: Environment name
	hidden_sizes: Hidden layers/nodes for neural network
	latest: Load latest model if true
	algo: algorithm used for training model
	Path: path to custom model
	"""

    env = gym.make(env_name)

    if latest:
        if algo == 'VPG':
            PATH2 = os.path.join(os.path.dirname(__file__), '..', 'models', 'VPG')
        elif algo == 'PPO':
            PATH2 = os.path.join(os.path.dirname(__file__), '..', 'models', 'PPO')

        # Use the most recently saved model in that directory
        print(PATH2)
        models = glob.glob(f"{PATH2}/*")
        latest_model = max(models, key=os.path.getctime)

    else:
        # Use the custom model path passed in
        latest_model = Path

    if algo == 'VPG':
        # Define Neural Network
        def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
            # Build a feedforward neural network.
            layers = []
            for j in range(len(sizes) - 1):
                act = activation if j < len(sizes) - 2 else output_activation
                layers += [nn.Linear(sizes[j], sizes[j + 1]), act()]
            return nn.Sequential(*layers)

        obs_dim = env.observation_space.shape[0]
        n_acts = env.action_space.n

        # Create Neural Network
        logits_net = mlp(sizes=[obs_dim] + hidden_sizes + [n_acts])
        logits_net.load_state_dict(torch.load(latest_model))

        net = logits_net

    elif algo == 'PPO':
        import core

        # Define Neural Network
        ac_kwargs = dict(hidden_sizes=hidden_sizes)
        ac = core.MLPActorCritic(env.observation_space, env.action_space,
                                 **ac_kwargs)
        ac.load_state_dict(torch.load(latest_model))

        net = ac

    return net
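
A minimal usage sketch for the loader above. It assumes the function is importable alongside the gym, torch, torch.nn, os, and glob imports it relies on, that a trained VPG model already exists under models/VPG, and that the environment name and hidden sizes shown here are purely illustrative:

import gym
import torch

# Hedged sketch: 'CartPole-v0' and hidden_sizes=[32] are assumptions, not
# values taken from the project.
net = model('CartPole-v0', hidden_sizes=[32], latest=True, algo='VPG', Path=None)

env = gym.make('CartPole-v0')
obs = env.reset()
with torch.no_grad():
    # The VPG branch returns the logits network, so pick the highest-logit action
    action = torch.argmax(net(torch.as_tensor(obs, dtype=torch.float32))).item()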
Example #2
    def __init__(self,
                 observation_space,
                 action_space,
                 ac_kwargs,
                 gamma=0.99,
                 alpha=0.2,
                 lr=1e-3,
                 polyak=0.995):
        self.gamma = gamma
        self.alpha = alpha
        self.lr = lr
        self.polyak = polyak

        self.ac = core.MLPActorCritic(observation_space, action_space,
                                      **ac_kwargs)
        self.ac_targ = deepcopy(self.ac)
        self.ac.to(device)
        self.ac_targ.to(device)
        for p in self.ac_targ.parameters():
            p.requires_grad = False
        self.q_params = itertools.chain(self.ac.q1.parameters(),
                                        self.ac.q2.parameters())

        self.dynam = dynam.MLPModel(observation_space.shape[0],
                                    action_space.shape[0])
        self.dynam.to(device)

        var_counts = tuple(
            core.count_vars(module)
            for module in [self.ac.pi, self.ac.q1, self.ac.q2])
        print('\nInitial parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' %
              var_counts)

        self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.lr)
        self.q_optimizer = Adam(self.q_params, lr=self.lr)
        self.m_optimizer = Adam(self.dynam.parameters(), lr=self.lr * 1.0)
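
The class this constructor belongs to is not shown in the fragment. A hedged instantiation sketch, using 'Agent' as a placeholder class name and assuming core, dynam, device, Adam, deepcopy, and itertools are imported as the snippet implies:

import gym

# Hedged sketch: 'Agent' and the hidden sizes are placeholders/assumptions.
env = gym.make('Pendulum-v0')  # any continuous-action Gym environment

agent = Agent(env.observation_space,
              env.action_space,
              ac_kwargs=dict(hidden_sizes=(256, 256)),
              gamma=0.99,
              alpha=0.2,
              lr=1e-3,
              polyak=0.995)

# The frozen target network (ac_targ) is typically updated elsewhere by
# polyak averaging: p_targ = polyak * p_targ + (1 - polyak) * p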
Example #3
def vpg(env,
        hidden_sizes,
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        gamma=0.99,
        pi_lr=3e-4,
        vf_lr=1e-3,
        train_v_iters=80,
        lam=0.97,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=10):
    """
    Vanilla Policy Gradient (with GAE-Lambda for advantage estimation)
    Args:
        env : An environment instance. The environment must satisfy the OpenAI Gym API.
        hidden_sizes : Hidden layer sizes for the MLP actor-critic (``core.MLPActorCritic``), a PyTorch Module
            with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v``
            module. The ``step`` method should accept a batch of observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``a``        (batch, act_dim)  | Numpy array of actions for each 
                                           | observation.
            ``v``        (batch,)          | Numpy array of value estimates
                                           | for the provided observations.
            ``logp_a``   (batch,)          | Numpy array of log probs for the
                                           | actions in ``a``.
            ===========  ================  ======================================

            The ``act`` method behaves the same as ``step`` but only returns ``a``.
            The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``pi``       N/A               | Torch Distribution object, containing
                                           | a batch of distributions describing
                                           | the policy for the provided observations.
            ``logp_a``   (batch,)          | Optional (only returned if batch of
                                           | actions is given). Tensor containing 
                                           | the log probability, according to 
                                           | the policy, of the provided actions.
                                           | If actions not given, will contain
                                           | ``None``.
            ===========  ================  ======================================

            The ``v`` module's forward call should accept a batch of observations and return:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``v``        (batch,)          | Tensor containing the value estimates
                                           | for the provided observations. (Critical: 
                                           | make sure to flatten this!)
            ===========  ================  ======================================

        seed (int): Seed for random number generators.
        steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch.
        epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform.
        gamma (float): Discount factor. (Always between 0 and 1.)
        pi_lr (float): Learning rate for policy optimizer.
        vf_lr (float): Learning rate for value function optimizer.
        train_v_iters (int): Number of gradient descent steps to take on value function per epoch.
        lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.)
        max_ep_len (int): Maximum length of trajectory / episode / rollout.
        logger_kwargs (dict): Keyword args for EpochLogger.
        save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function.
    """

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    # logger
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    # random seeds
    seed += 1000 * proc_id()
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Environment dimensions
    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    # Create the actor-critic model
    ac = core.MLPActorCritic(env.observation_space, env.action_space,
                             hidden_sizes)

    # Sync params across processes
    sync_params(ac)

    # Count variables
    var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v])
    logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts)

    # Set up experience buffer. With multiple processes, each process's buffer holds local_steps_per_epoch steps
    local_steps_per_epoch = int(steps_per_epoch / num_procs())
    buf = VPGBuffer(obs_dim,
                    act_dim,
                    size=local_steps_per_epoch,
                    gamma=gamma,
                    lam=lam)

    # optimizer
    pi_optimizer = torch.optim.Adam(ac.pi.parameters(), lr=pi_lr)
    vf_optimizer = torch.optim.Adam(ac.v.parameters(), lr=vf_lr)

    # Set up model saving
    # logger.setup_pytorch_saver(ac)

    # interaction
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    for epoch in range(epochs):
        for t in range(local_steps_per_epoch):
            a, v, logp = ac.step(torch.as_tensor(
                o, dtype=torch.float32))  # (act_dim,), (), ()
            next_o, r, d, _ = env.step(a)

            ep_ret += r
            ep_len += 1

            # save
            buf.store(o, a, r, v, logp)
            logger.store(VVals=v)

            # update obs
            o = next_o

            timeout = ep_len == max_ep_len
            terminal = d or timeout
            epoch_ended = t == local_steps_per_epoch - 1
            if terminal or epoch_ended:  # timeout=True, terminal=True, epoch_ended=True/False
                if epoch_ended and not (terminal):
                    print('Warning: trajectory cut off by epoch at %d steps.' %
                          ep_len,
                          flush=True)
                # if trajectory didn't reach terminal state, bootstrap value target
                if timeout or epoch_ended:
                    _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32))
                else:
                    v = 0
                buf.finish_path(v)
                if terminal:
                    logger.store(EpRet=ep_ret, EpLen=ep_len)
                o, ep_ret, ep_len = env.reset(), 0, 0  # re-initialize for the next trajectory

        # Save model
        if (epoch % save_freq == 0) or (epoch == epochs - 1):
            logger.save_state({'env': env}, None)

        # Perform VPG update!
        update(buf, ac, train_v_iters, pi_optimizer, vf_optimizer, logger)

        # Log info about epoch
        logger.log_tabular('Epoch', epoch)
        logger.log_tabular('EpRet', with_min_and_max=True)
        logger.log_tabular('EpLen', average_only=True)
        logger.log_tabular('VVals', with_min_and_max=True)
        logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch)
        logger.log_tabular('LossPi', average_only=True)
        logger.log_tabular('LossV', average_only=True)
        logger.log_tabular('DeltaLossPi', average_only=True)
        logger.log_tabular('DeltaLossV', average_only=True)
        logger.log_tabular('Entropy', average_only=True)
        logger.log_tabular('KL', average_only=True)
        logger.log_tabular('Time', time.time() - start_time)
        logger.dump_tabular()
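
A hedged call sketch for vpg() above, assuming core, VPGBuffer, EpochLogger, the MPI helpers, and update() are importable as in the snippet; the environment name, network sizes, and logger output directory are illustrative only:

import gym

# Hedged sketch: environment, hidden sizes, and output path are assumptions.
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    vpg(env,
        hidden_sizes=(64, 64),
        seed=0,
        steps_per_epoch=4000,
        epochs=50,
        logger_kwargs=dict(output_dir='data/vpg_cartpole', exp_name='vpg_cartpole'))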
Example #4
def main(Test='NoRTA', TrainingCases=['NoRTA'], RANGE=1000, ac_kwargs=dict(hidden_sizes=[64,64])):

	"""
	Test: Test case
		'NoRTA', 'SVL', 'SBSF', or 'ASIF'
	TrainingCases: List of training cases to evaluate
		'NoRTA', 'NoRTAHP', 'SVL', 'SBSF', or 'ASIF'
	RANGE: Test range (m)
		1000 or 10000
	ac_kwargs: Neural network parameters
		dictionary with hidden layer sizes, ex: dict(hidden_sizes=[64,64])
	NN MODELS (below): Saved trained models
	"""

	##### NN MODELS #####
	NoRTA_model = "NoRTA2.dat"
	NoRTAHP_model = "NoRTAHP2.dat"
	SVL_model = "Velocity2.dat"
	SBSF_model = "ISimplex2.dat"
	ASIF_model = "IASIF2.dat"
	#####################

	env = gym.make('spacecraft-docking-continuous-v0')

	# Defines test points
	if RANGE == 10000:
		Distance = [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
	elif RANGE == 1000:
		Distance = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

	Angle = [1.57, 5.5, 4.71, 2.36, 3.93, 0.79, 1.18, 3.14, 4.32, 0]
	Vx = [-0.1, -0.25, 0.25, 0.1, -0.5, 0.5, -0.75, 0.75, -1, 1]
	Vy = [0.1, -0.1, -0.25, 0.25, 0.5, -0.5, -0.75, 1, 0.75, -1]

	# Import ASIF
	if Test == 'SVL':
		from Simple_velocity_limit import RTA
	elif Test == 'SBSF':
		from ISimplex import RTA
	elif Test == 'ASIF':
		from IASIF import RTA

	if Test == 'SVL' or Test == 'SBSF' or Test == 'ASIF':
		# Call ASIF class
		rta = RTA(env)

		# Define action
		def RTA_act(obs, act):
			# Clip action to be within accepted range
			act = np.clip(act, -env.force_magnitude, env.force_magnitude)
			# Rearrange observation state vector
			x0 = [obs[0], obs[1], 0, obs[2], obs[3], 0]
			# Rearrange action vector
			u_des = np.array([[act[0]], [act[1]], [0]])
			# Call asif function
			u = rta.main(x0, u_des)
			# Extract relevant data
			new_act = [u[0,0], u[1,0]]
			# Determine if RTA adjusted action
			if np.sqrt((act[0] - new_act[0])**2 + (act[1] - new_act[1])**2) < 0.0001:
				# Set flag for tracking/reward function
				env.RTA_on = False
			else:
				env.RTA_on = True
			# Return new action
			return new_act

	for Train in TrainingCases:
		# Load neural network
		ac = core.MLPActorCritic(env.observation_space, env.action_space, **ac_kwargs)
		# Load appropriate model
		if Train == 'NoRTA':
			ac.load_state_dict(torch.load(f"{PATH}/{NoRTA_model}"))
		elif Train == 'NoRTAHP':
			ac.load_state_dict(torch.load(f"{PATH}/{NoRTAHP_model}"))
		elif Train == 'SVL':
			ac.load_state_dict(torch.load(f"{PATH}/{SVL_model}"))
		elif Train == 'SBSF':
			ac.load_state_dict(torch.load(f"{PATH}/{SBSF_model}"))
		elif Train == 'ASIF':
			ac.load_state_dict(torch.load(f"{PATH}/{ASIF_model}"))

		# Use best action (mean of policy's probability distribution)
		def get_best_action(obs):
			with torch.no_grad():
				act = ac.pi.mu_net(torch.as_tensor(obs, dtype=torch.float32)).numpy()
			return act

		# Set variables
		env.termination_condition = True # Prints cause of termination
		RTA_percent = 0 # Tracks percentage of time RTA is on
		steps = 0 # Tracks number of steps

		# for 10 test points
		for i2 in range(len(Distance)):
			# Reset variables
			done = False
			env.reset()
			# Used to track trajectories for plots
			rH = []
			vH = []
			x = []
			y = []

			# Reset environment conditions for each test case
			theta = Angle[i2]
			env.position_deputy = Distance[i2]
			env.x_deputy = env.position_deputy*math.cos(theta)
			env.y_deputy = env.position_deputy*math.sin(theta)
			x_dot = Vx[i2]
			y_dot = Vy[i2]
			env.rH = env.position_deputy
			env.state = np.array([env.x_deputy, env.y_deputy, x_dot, y_dot])
			obs = env.state
			env.x_threshold = 1.5 * env.position_deputy
			env.y_threshold = 1.5 * env.position_deputy
			if RANGE == 1000 and Test != 'SBSF':
				env.max_control = 750

			# Run episode
			while not done:
				# Get best action
				act = get_best_action(obs)
				# Pass through RTA
				if Test == 'SVL' or Test == 'ASIF' or Test == 'SBSF':
					act = RTA_act(obs,act)
				# Take step in environment
				obs, _, done, _ = env.step(act)
				# Track if velocity violated constraint (No RTA)
				if Test == 'NoRTA':
					over_max_vel, _, _ = env.check_velocity(act[0], act[1])
					if over_max_vel:
						RTA_percent += 1
				# Track if RTA is on
				elif Test == 'SVL' or Test == 'ASIF' or Test == 'SBSF':
					if env.RTA_on:
						RTA_percent += 1
				steps += 1

				# Track for plotting
				rH.append(env.rH)
				vH.append(env.vH)
				x.append(obs[0])
				y.append(obs[1])

			# Plot trajectories
			plt.figure(1)
			if Train == 'NoRTAHP':
				dash = 'r'
			elif Train == 'NoRTA':
				dash = 'darkorange'
			elif Train == 'SVL':
				dash = 'b'
			elif Train == 'SBSF':
				dash = 'lime'
			elif Train == 'ASIF':
				dash = 'm'
			plt.plot(rH,vH,dash)
			plt.figure(2)
			plt.plot(x,y,dash)

		# Print RTA on percentage
		print(f"{Train} Average RTA % On: {RTA_percent/steps*100:.1f} %")

	# Plot setup
	plt.figure(1)
	plt.plot([0, 10000],[0.2, 20.74], '--', color='black',label='Max Velocity Limit')
	plt.plot([0, 10000],[-0.2, 4.935], '--', color='coral',label='Min Velocity Limit')
	# plt.title('Velocity vs. Position')
	if RANGE == 1000:
		plt.ylim([0, 2.5])
		plt.xlim([0, 1200])
	elif RANGE == 10000:
		plt.xlim([0, 10000])
		plt.ylim([0, 20])
	plt.xlabel('Distance from Chief (m)')
	plt.ylabel('Relative Velocity (m/s)')
	# plt.legend()
	plt.grid(True)

	plt.figure(2)
	# plt.title('Trajectories')
	if RANGE == 1000:
		plt.xlim([-1200, 1200])
		plt.ylim([-1200, 1200])
	elif RANGE == 10000:
		plt.xlim([-11000, 11000])
		plt.ylim([-11000, 11000])
	plt.plot(0,0,'k*', ms=10)
	plt.grid(True)
	plt.xlabel('X position (m)')
	plt.ylabel('Y position (m)')

	plt.figure(3)
	plt.plot(0,0,color='r', linewidth=2)
	plt.plot(0,0,color='darkorange', linewidth=2)
	plt.plot(0,0,color='b', linewidth=2)
	plt.plot(0,0,color='lime', linewidth=2)
	plt.plot(0,0,color='m', linewidth=2)
	plt.plot(0,0,'--',color='black')
	plt.plot(0,0,'--',color='coral')
	plt.axis('off')
	plt.legend(['Training with No RTA - HP','Training with No RTA','Training with SVL','Training with SBSF','Training with ASIF','Max Velocity Limit','Min Velocity Limit'], loc='upper center')

	plt.show()
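
A hedged invocation sketch for main() above. It assumes the module-level imports used in the function body (gym, numpy, math, torch, matplotlib, core) plus a PATH variable pointing at the directory that holds the .dat model files listed in the docstring; the chosen cases are illustrative:

# Hedged sketch: evaluate policies trained without RTA and with the SVL filter,
# running under the ASIF run time assurance filter over the 1000 m test range.
if __name__ == '__main__':
	main(Test='ASIF',
	     TrainingCases=['NoRTA', 'SVL'],
	     RANGE=1000,
	     ac_kwargs=dict(hidden_sizes=[64, 64]))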
Example #5
    # Make simulation environment
    env_fn = lambda: gym.make(args.env)
    env, test_env = env_fn(), env_fn()

    # Limit test environment steps in case agent is stuck
    test_env._max_episode_steps = 800

    # Save videos each testing iteration
    test_env = gym.wrappers.Monitor(test_env, "test-recordings", force=True)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Instantiate Actor Critic Neural Net and Target Network
    net = core.MLPActorCritic(env.observation_space, env.action_space)
    targ_net = deepcopy(net)

    # Freeze target network
    for p in targ_net.parameters():
        p.requires_grad = False

    # Experience / Memory Buffer
    replay_buffer = core.ReplayBuffer(obs_dim=obs_dim,
                                      act_dim=act_dim,
                                      size=int(1e6))

    # Count number of parameters in network
    param_counts = tuple(core.count_vars(module) for module in [net.pi, net.q])
    logger.log('\nNumber of Parameters: \t pi: %d, \t q: %d\n' % param_counts)
    # Set up optimization functions for policy and q-function
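
The fragment stops before the optimizers are created. A minimal continuation sketch, assuming a DDPG-style setup with Adam over net.pi and net.q and learning rates taken from the script's args (the attribute names and defaults here are assumptions, not the project's exact code):

    # Hedged continuation sketch: learning-rate arguments are illustrative.
    from torch.optim import Adam

    pi_lr = getattr(args, 'pi_lr', 1e-3)
    q_lr = getattr(args, 'q_lr', 1e-3)

    pi_optimizer = Adam(net.pi.parameters(), lr=pi_lr)
    q_optimizer = Adam(net.q.parameters(), lr=q_lr)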