Example #1
class ARSAgent():
    def __init__(self,
                 normalizer,
                 policy,
                 env,
                 smach=None,
                 TGP=None,
                 spot=None,
                 gui=False):
        self.normalizer = normalizer
        self.policy = policy
        self.state_dim = self.policy.state_dim
        self.action_dim = self.policy.action_dim
        self.env = env
        self.max_action = float(self.env.action_space.high[0])
        self.successes = 0
        self.phase = 0
        self.desired_velocity = 0.5
        self.desired_rate = 0.0
        self.flip = 0
        self.increment = 0
        self.scaledown = True
        self.type = "Stop"
        self.smach = smach
        if smach is not None:
            self.BaseClearanceHeight = self.smach.ClearanceHeight
            self.BasePenetrationDepth = self.smach.PenetrationDepth
        self.TGP = TGP
        self.spot = spot
        if gui:
            self.g_u_i = GUI(self.env.spot.quadruped)
        else:
            self.g_u_i = None

        self.action_history = []
        self.true_action_history = []
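
This constructor is exercised in Example #4 below. The following is a minimal construction sketch mirroring that example; the class names come from the examples on this page, fixed values are assumed where Example #4 derives them from command-line flags, and imports are omitted because the module paths depend on the repository layout.

# Minimal sketch, mirroring the setup in Example #4 below.
env = spotBezierEnv(render=False, on_rack=False,
                    height_field=False, draw_foot_path=False,
                    contacts=True)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

normalizer = Normalizer(state_dim)
policy = Policy(state_dim, action_dim)
bz_step = BezierStepper(dt=env._time_step)   # fills the smach argument
bzg = BezierGait(dt=env._time_step)          # fills the TGP argument
spot = SpotModel()

agent = ARSAgent(normalizer, policy, env, bz_step, bzg, spot)
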
Example #2
def main():
    """ The main() function. """

    print("STARTING SPOT TEST ENV")
    seed = 0
    max_timesteps = 4e6
    file_name = "spot_ars_"

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    models_path = os.path.join(my_path, "../models")

    if not os.path.exists(results_path):
        os.makedirs(results_path)

    if not os.path.exists(models_path):
        os.makedirs(models_path)

    env = spotGymEnv(render=True, on_rack=False)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    env.reset()

    g_u_i = GUI()

    spot = SpotModel()
    T_bf = spot.WorldToFoot

    print("STARTED SPOT TEST ENV")
    t = 0
    while t < (int(max_timesteps)):

        # GUI sliders: x, y, z | roll, pitch, yaw
        pos, orn, _, _, _, _ = g_u_i.UserInput()
        # Solve IK for the commanded body pose and flatten to one joint-angle vector
        joint_angles = spot.IK(orn, pos, T_bf)
        action = joint_angles.reshape(-1)
        # action = env.action_space.sample()  # uncomment to send random actions instead
        next_state, reward, done, _ = env.step(action)

        # time.sleep(1.0)

        t += 1
    env.close()
    print(joint_angles)
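
Like the other examples on this page, this script presumably runs main() directly; a minimal (assumed) entry-point guard would be:

if __name__ == "__main__":
    main()
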
Example #3
def main():
    """ The main() function. """

    print("STARTING SPOT TEST ENV")
    seed = 0
    max_timesteps = 4e6

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    models_path = os.path.join(my_path, "../models")

    if not os.path.exists(results_path):
        os.makedirs(results_path)

    if not os.path.exists(models_path):
        os.makedirs(models_path)

    # Command-line toggles (see the argparse sketch after this example)
    on_rack = bool(ARGS.DebugRack)
    draw_foot_path = bool(ARGS.DebugPath)
    height_field = bool(ARGS.HeightField)

    env = spotBezierEnv(render=True,
                        on_rack=on_rack,
                        height_field=height_field,
                        draw_foot_path=draw_foot_path)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    state = env.reset()

    g_u_i = GUI(env.spot.quadruped)

    spot = SpotModel()
    T_bf0 = spot.WorldToFoot
    T_bf = copy.deepcopy(T_bf0)

    bzg = BezierGait(dt=env._time_step)

    bz_step = BezierStepper(dt=env._time_step, mode=0)

    action = env.action_space.sample()

    FL_phases = []
    FR_phases = []
    BL_phases = []
    BR_phases = []
    yaw = 0.0

    print("STARTED SPOT TEST ENV")
    t = 0
    while t < (int(max_timesteps)):

        bz_step.ramp_up()

        # Query the open-loop state machine, then let the GUI sliders
        # override its command outputs
        pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = bz_step.StateMachine(
        )

        pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = g_u_i.UserInput(
        )

        yaw = env.return_yaw()

        P_yaw = 5.0

        if ARGS.AutoYaw:
            YawRate += -yaw * P_yaw

        # print("YAW RATE: {}".format(YawRate))

        # TEMP
        bz_step.StepLength = StepLength
        bz_step.LateralFraction = LateralFraction
        bz_step.YawRate = YawRate
        bz_step.StepVelocity = StepVelocity

        contacts = state[-4:]

        FL_phases.append(env.spot.LegPhases[0])
        FR_phases.append(env.spot.LegPhases[1])
        BL_phases.append(env.spot.LegPhases[2])
        BR_phases.append(env.spot.LegPhases[3])

        # Get Desired Foot Poses
        T_bf = bzg.GenerateTrajectory(StepLength, LateralFraction, YawRate,
                                      StepVelocity, T_bf0, T_bf,
                                      ClearanceHeight, PenetrationDepth,
                                      contacts)
        joint_angles = spot.IK(orn, pos, T_bf)
        env.pass_joint_angles(joint_angles.reshape(-1))
        # Get External Observations
        env.spot.GetExternalObservations(bzg, bz_step)
        # Step
        state, reward, done, _ = env.step(action)
        if done:
            print("DONE")
            if ARGS.AutoReset:
                env.reset()
            # plt.plot()
            # plt.plot(FL_phases, label="FL")
            # plt.plot(FR_phases, label="FR")
            # plt.plot(BL_phases, label="BL")
            # plt.plot(BR_phases, label="BR")
            # plt.xlabel("dt")
            # plt.ylabel("value")
            # plt.title("Leg Phases")
            # plt.legend()
            # plt.show()

        # time.sleep(1.0)

        t += 1
    env.close()
    print(joint_angles)
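
Examples #3 and #4 read their switches from a module-level ARGS object. A plausible argparse setup covering the flags they reference is sketched below; the flag names are taken from the code, while the parser itself and the help strings are assumptions.

import argparse

# Hypothetical parser for the ARGS attributes used in Examples #3 and #4.
parser = argparse.ArgumentParser(description="Spot test/training options")
parser.add_argument("--DebugRack", action="store_true",
                    help="fix the robot on a rack for debugging")
parser.add_argument("--DebugPath", action="store_true",
                    help="draw the foot path in the simulator")
parser.add_argument("--HeightField", action="store_true",
                    help="enable uneven terrain")
parser.add_argument("--AutoYaw", action="store_true",
                    help="apply proportional yaw correction")
parser.add_argument("--AutoReset", action="store_true",
                    help="reset the environment when an episode ends")
parser.add_argument("--NoContactSensing", action="store_true",
                    help="disable foot contact sensing")
ARGS = parser.parse_args()
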
Example #4
def main():
    """ The main() function. """
    # Needed for multiprocessing support in frozen (e.g. Windows) executables
    mp.freeze_support()

    print("STARTING SPOT TRAINING ENV")
    seed = 0
    max_timesteps = 4e6
    eval_freq = 1e1
    save_model = True
    file_name = "spot_ars_"

    height_field = bool(ARGS.HeightField)
    # Contact sensing is on unless explicitly disabled
    contacts = not ARGS.NoContactSensing

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    if contacts:
        models_path = os.path.join(my_path, "../models/contact")
    else:
        models_path = os.path.join(my_path, "../models/no_contact")

    if not os.path.exists(results_path):
        os.makedirs(results_path)

    if not os.path.exists(models_path):
        os.makedirs(models_path)

    env = spotBezierEnv(render=False,
                        on_rack=False,
                        height_field=height_field,
                        draw_foot_path=False,
                        contacts=contacts)

    # Set seeds
    env.seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    env.reset()

    g_u_i = GUI(env.spot.quadruped)

    spot = SpotModel()
    T_bf = spot.WorldToFoot

    bz_step = BezierStepper(dt=env._time_step)
    bzg = BezierGait(dt=env._time_step)

    # Initialize Normalizer
    normalizer = Normalizer(state_dim)

    # Initialize Policy
    policy = Policy(state_dim, action_dim)

    # Initialize Agent with normalizer, policy and gym env
    agent = ARSAgent(normalizer, policy, env, bz_step, bzg, spot)
    agent_num = 0
    if os.path.exists(models_path + "/" + file_name + str(agent_num) +
                      "_policy"):
        print("Loading Existing agent")
        agent.load(models_path + "/" + file_name + str(agent_num))

    env.reset(agent.desired_velocity, agent.desired_rate)

    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    # Create mp pipes
    num_processes = policy.num_deltas
    processes = []
    childPipes = []
    parentPipes = []

    # Store mp pipes
    for pr in range(num_processes):
        parentPipe, childPipe = Pipe()
        parentPipes.append(parentPipe)
        childPipes.append(childPipe)

    # Start multiprocessing
    for proc_num in range(num_processes):
        p = mp.Process(target=ParallelWorker,
                       args=(childPipes[proc_num], env, state_dim))
        p.start()
        processes.append(p)

    print("STARTED SPOT TRAINING ENV")
    t = 0
    while t < (int(max_timesteps)):

        # Maximum timesteps per rollout

        episode_reward, episode_timesteps = agent.train_parallel(parentPipes)
        t += episode_timesteps
        # episode_reward = agent.train()
        # +1 to account for 0 indexing.
        # +0 on ep_timesteps since it will increment +1 even if done=True
        print(
            "Total T: {} Episode Num: {} Episode T: {} Reward: {:.2f} REWARD PER STEP: {:.2f}"
            .format(t + 1, episode_num, episode_timesteps, episode_reward,
                    episode_reward / float(episode_timesteps)))

        # Evaluate episode
        if (episode_num + 1) % eval_freq == 0:
            if save_model:
                agent.save(models_path + "/" + str(file_name) +
                           str(episode_num))
                # replay_buffer.save(t)

        episode_num += 1

    # Close pipes and hence envs
    for parentPipe in parentPipes:
        parentPipe.send([_CLOSE, "pay2"])

    for p in processes:
        p.join()
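
The training script above spawns ParallelWorker processes and, through ARSAgent.train_parallel() (Example #5), talks to them over pipes using the _EXPLORE/_EXPLORE_TG/_CLOSE message tags defined in the ARS module. The worker below is a minimal sketch consistent with those messages, not the repository's implementation: it rebuilds an agent from the received payload, runs one rollout, and replies with the reward as the first element, which is what train_parallel() reads via recv()[0].

# Sketch only: a worker loop consistent with the messages sent in Example #5.
def ParallelWorker(childPipe, env, state_dim):
    while True:
        message, payload = childPipe.recv()
        if message == _CLOSE:
            break
        (normalizer, policy, direction, delta, desired_velocity,
         desired_rate, TGP, smach, spot) = payload
        agent = ARSAgent(normalizer, policy, env, smach, TGP, spot)
        agent.desired_velocity = desired_velocity
        agent.desired_rate = desired_rate
        if message == _EXPLORE_TG:
            reward, _ = agent.deployTG(direction, delta)
        else:  # _EXPLORE: vanilla (non-trajectory-generator) rollout
            reward = agent.deploy(direction, delta)
        # train_parallel() reads recv()[0], so the reward goes first
        childPipe.send([reward])
    childPipe.close()
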
Example #5
class ARSAgent():
    def __init__(self,
                 normalizer,
                 policy,
                 env,
                 smach=None,
                 TGP=None,
                 spot=None,
                 gui=False):
        self.normalizer = normalizer
        self.policy = policy
        self.state_dim = self.policy.state_dim
        self.action_dim = self.policy.action_dim
        self.env = env
        self.max_action = float(self.env.action_space.high[0])
        self.successes = 0
        self.phase = 0
        self.desired_velocity = 0.5
        self.desired_rate = 0.0
        self.flip = 0
        self.increment = 0
        self.scaledown = True
        self.type = "Stop"
        self.smach = smach
        if smach is not None:
            self.BaseClearanceHeight = self.smach.ClearanceHeight
            self.BasePenetrationDepth = self.smach.PenetrationDepth
        self.TGP = TGP
        self.spot = spot
        if gui:
            self.g_u_i = GUI(self.env.spot.quadruped)
        else:
            self.g_u_i = None

        self.action_history = []
        self.true_action_history = []

    # Deploy Policy in one direction over one whole episode
    # DO THIS ONCE PER ROLLOUT OR DURING DEPLOYMENT
    def deploy(self, direction=None, delta=None):
        state = self.env.reset(self.desired_velocity, self.desired_rate)
        sum_rewards = 0.0
        timesteps = 0
        done = False
        while not done and timesteps < self.policy.episode_steps:
            # print("STATE: ", state)
            # print("dt: {}".format(timesteps))
            self.normalizer.observe(state)
            # Normalize State
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)
            # Clip action to [-max_action, max_action] for execution in env
            for a in range(len(action)):
                action[a] = np.clip(action[a], -self.max_action,
                                    self.max_action)
            # print("ACTION: ", action)
            state, reward, done, _ = self.env.step(action)
            # print("STATE: ", state)
            # Clip reward to [-max_action, max_action] to prevent outliers
            # from distorting the weights
            reward = np.clip(reward, -self.max_action, self.max_action)
            sum_rewards += reward
            timesteps += 1
            # Divide rewards by timesteps for reward-per-step + exp survive rwd
        return (sum_rewards + timesteps**cum_dt_exp) / timesteps

    # Deploy Policy in one direction over one whole episode
    # DO THIS ONCE PER ROLLOUT OR DURING DEPLOYMENT
    def deployTG(self, direction=None, delta=None):
        state = self.env.reset()
        sum_rewards = 0.0
        timesteps = 0
        done = False
        # alpha = []
        # h = []
        # f = []
        T_bf = copy.deepcopy(self.spot.WorldToFoot)
        T_b0 = copy.deepcopy(self.spot.WorldToFoot)
        self.action_history = []
        self.true_action_history = []
        action = self.env.action_space.sample()
        action[:] = 0.0
        old_act = action[:actions_to_filter]
        # For auto yaw correction
        yaw = 0.0
        while not done and timesteps < self.policy.episode_steps:
            # self.smach.ramp_up()
            pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = self.smach.StateMachine(
            )

            if self.g_u_i:
                pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth, SwingPeriod = self.g_u_i.UserInput(
                )
                self.TGP.Tswing = SwingPeriod

            self.env.spot.GetExternalObservations(self.TGP, self.smach)

            # Read UPDATED state based on controls and phase
            state = self.env.return_state()
            self.normalizer.observe(state)
            # Don't normalize contacts
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)

            # Action History
            self.action_history.append(np.tanh(action))

            # Action History
            true_action = copy.deepcopy(np.tanh(action))
            true_action[0] *= CD_SCALE
            true_action[1] = abs(true_action[1]) * Z_SCALE
            true_action[2:] *= RESIDUALS_SCALE
            self.true_action_history.append(true_action)

            action = np.tanh(action)

            # EXP FILTER
            action[:actions_to_filter] = alpha * old_act + (
                1.0 - alpha) * action[:actions_to_filter]
            old_act = action[:actions_to_filter]

            ClearanceHeight += action[0] * CD_SCALE

            # CLIP EVERYTHING
            StepLength = np.clip(StepLength, self.smach.StepLength_LIMITS[0],
                                 self.smach.StepLength_LIMITS[1])
            StepVelocity = np.clip(StepVelocity,
                                   self.smach.StepVelocity_LIMITS[0],
                                   self.smach.StepVelocity_LIMITS[1])
            LateralFraction = np.clip(LateralFraction,
                                      self.smach.LateralFraction_LIMITS[0],
                                      self.smach.LateralFraction_LIMITS[1])
            YawRate = np.clip(YawRate, self.smach.YawRate_LIMITS[0],
                              self.smach.YawRate_LIMITS[1])
            ClearanceHeight = np.clip(ClearanceHeight,
                                      self.smach.ClearanceHeight_LIMITS[0],
                                      self.smach.ClearanceHeight_LIMITS[1])
            PenetrationDepth = np.clip(PenetrationDepth,
                                       self.smach.PenetrationDepth_LIMITS[0],
                                       self.smach.PenetrationDepth_LIMITS[1])

            contacts = copy.deepcopy(state[-4:])
            # contacts = [0, 0, 0, 0]

            # print("CONTACTS: {}".format(contacts))

            yaw = self.env.return_yaw()
            if not self.g_u_i:
                YawRate += -yaw * P_yaw

            # Get Desired Foot Poses
            if timesteps > 20:
                T_bf = self.TGP.GenerateTrajectory(StepLength, LateralFraction,
                                                   YawRate, StepVelocity, T_b0,
                                                   T_bf, ClearanceHeight,
                                                   PenetrationDepth, contacts)
            else:
                T_bf = self.TGP.GenerateTrajectory(0.0, 0.0, 0.0, 0.1, T_b0,
                                                   T_bf, ClearanceHeight,
                                                   PenetrationDepth, contacts)
                action[:] = 0.0

            action[2:] *= RESIDUALS_SCALE

            # Add DELTA to XYZ Foot Poses
            T_bf_copy = copy.deepcopy(T_bf)
            T_bf_copy["FL"][:3, 3] += action[2:5]
            T_bf_copy["FR"][:3, 3] += action[5:8]
            T_bf_copy["BL"][:3, 3] += action[8:11]
            T_bf_copy["BR"][:3, 3] += action[11:14]

            # Adjust Height!
            pos[2] += abs(action[1]) * Z_SCALE

            joint_angles = self.spot.IK(orn, pos, T_bf_copy)
            # Pass Joint Angles
            self.env.pass_joint_angles(joint_angles.reshape(-1))

            # Perform action
            next_state, reward, done, _ = self.env.step(action)
            sum_rewards += reward
            timesteps += 1

        self.TGP.reset()
        self.smach.reshuffle()
        self.smach.PenetrationDepth = self.BasePenetrationDepth
        self.smach.ClearanceHeight = self.BaseClearanceHeight
        return sum_rewards, timesteps

    def returnPose(self):
        return self.env.spot.GetBasePosition()

    def train(self):
        # Sample random expl_noise deltas
        print("-------------------------------")
        # print("Sampling Deltas")
        deltas = self.policy.sample_deltas()
        # Initialize +- reward list of size num_deltas
        positive_rewards = [0] * self.policy.num_deltas
        negative_rewards = [0] * self.policy.num_deltas

        # Execute 2*num_deltas rollouts and store +- rewards
        print("Deploying Rollouts")
        for i in range(self.policy.num_deltas):
            print("Rollout #{}".format(i + 1))
            positive_rewards[i] = self.deploy(direction="+", delta=deltas[i])
            negative_rewards[i] = self.deploy(direction="-", delta=deltas[i])

        # Calculate std dev
        std_dev_rewards = np.array(positive_rewards + negative_rewards).std()

        # Order rollouts in decreasing list using cum reward as criterion
        unsorted_rollouts = [(positive_rewards[i], negative_rewards[i],
                              deltas[i])
                             for i in range(self.policy.num_deltas)]
        # When sorting, use the max of the +/- disturbance rewards as the key
        sorted_rollouts = sorted(unsorted_rollouts,
                                 key=lambda x: max(x[0], x[1]),
                                 reverse=True)

        # Only take first best_num_deltas rollouts
        rollouts = sorted_rollouts[:self.policy.num_best_deltas]

        # Update Policy
        self.policy.update(rollouts, std_dev_rewards)

        # Execute Current Policy
        eval_reward = self.deploy()
        return eval_reward

    def train_parallel(self, parentPipes):
        """ Execute rollouts in parallel using multiprocessing library
            based on: # https://github.com/bulletphysics/bullet3/blob/master/examples/pybullet/gym/pybullet_envs/ARS/ars.py
        """
        # USE VANILLA OR TG POLICY
        if self.TGP is None:
            exploration = _EXPLORE
        else:
            exploration = _EXPLORE_TG

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = self.policy.sample_deltas()
        # Initialize +- reward list of size num_deltas
        positive_rewards = [0] * self.policy.num_deltas
        negative_rewards = [0] * self.policy.num_deltas

        smach = copy.deepcopy(self.smach)
        smach.ClearanceHeight = self.BaseClearanceHeight
        smach.PenetrationDepth = self.BasePenetrationDepth
        smach.reshuffle()

        if parentPipes:
            for i in range(self.policy.num_deltas):
                # Execute each rollout on a separate thread
                parentPipe = parentPipes[i]
                # NOTE: target for parentPipe specified in main_ars.py
                # (target is ParallelWorker fcn defined up top)
                parentPipe.send([
                    exploration,
                    [
                        self.normalizer, self.policy, "+", deltas[i],
                        self.desired_velocity, self.desired_rate, self.TGP,
                        smach, self.spot
                    ]
                ])
            for i in range(self.policy.num_deltas):
                # Receive cumulative reward from each rollout
                positive_rewards[i] = parentPipes[i].recv()[0]

            for i in range(self.policy.num_deltas):
                # Execute each rollout on a separate thread
                parentPipe = parentPipes[i]
                parentPipe.send([
                    exploration,
                    [
                        self.normalizer, self.policy, "-", deltas[i],
                        self.desired_velocity, self.desired_rate, self.TGP,
                        smach, self.spot
                    ]
                ])
            for i in range(self.policy.num_deltas):
                # Receive cumulative reward from each rollout
                negative_rewards[i] = parentPipes[i].recv()[0]

        else:
            raise ValueError(
                "Select 'train' method if you are not using multiprocessing!")

        # Calculate std dev
        std_dev_rewards = np.array(positive_rewards + negative_rewards).std()

        # Order rollouts in decreasing list using cum reward as criterion
        # take max between reward for +- disturbance as that rollout's reward
        # Store max between positive and negative reward as key for sort
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        indices = sorted(scores.keys(), key=lambda x: scores[x],
                         reverse=True)[:self.policy.num_deltas]
        # print("INDICES: ", indices)
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in indices]

        # Update Policy
        self.policy.update(rollouts, std_dev_rewards)

        # Execute Current Policy USING VANILLA OR TG
        if self.TGP is None:
            return self.deploy()
        else:
            return self.deployTG()

    def save(self, filename):
        """ Save the Policy

        :param filename: the name of the file where the policy is saved
        """
        with open(filename + '_policy', 'wb') as filehandle:
            pickle.dump(self.policy.theta, filehandle)

    def load(self, filename):
        """ Load the Policy

        :param filename: the name of the file where the policy is saved
        """
        with open(filename + '_policy', 'rb') as filehandle:
            self.policy.theta = pickle.load(filehandle)
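
train() and train_parallel() finally hand the selected rollouts and the reward standard deviation to self.policy.update(). The Policy class itself is not shown on this page, but the update it is expected to perform is the standard Augmented Random Search rule (Mania et al., 2018): each surviving rollout contributes (r_plus - r_minus) * delta, scaled by the learning rate and the reward standard deviation. The sketch below assumes that rule; the function name ars_update, the learning_rate argument and the array layout of theta are illustrative, not taken from the source.

import numpy as np

def ars_update(theta, rollouts, std_dev_rewards, learning_rate, num_best_deltas):
    """One ARS update step. rollouts is a list of
    (reward_plus, reward_minus, delta) tuples as built in train()/train_parallel()."""
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    return theta + learning_rate / (num_best_deltas * std_dev_rewards) * step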