def main():
    """ The main() function. """

    print("STARTING SPOT TEST ENV")
    seed = 0
    max_timesteps = 4e6
    file_name = "spot_ars_"

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    models_path = os.path.join(my_path, "../models")
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    env = spotGymEnv(render=True, on_rack=False)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    env.reset()

    g_u_i = GUI()

    spot = SpotModel()
    T_bf = spot.WorldToFoot

    print("STARTED SPOT TEST ENV")
    t = 0
    while t < (int(max_timesteps)):
        # GUI: x, y, z | r, p, y
        pos, orn, _, _, _, _ = g_u_i.UserInput()
        # Get Roll, Pitch, Yaw
        joint_angles = spot.IK(orn, pos, T_bf)
        action = joint_angles.reshape(-1)
        action = env.action_space.sample()

        next_state, reward, done, _ = env.step(action)
        # time.sleep(1.0)
        t += 1

    env.close()
    print(joint_angles)
def main():
    """ The main() function. """

    print("STARTING SPOT TEST ENV")
    seed = 0
    max_timesteps = 4e6

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    models_path = os.path.join(my_path, "../models")
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    on_rack = ARGS.DebugRack
    draw_foot_path = ARGS.DebugPath
    height_field = ARGS.HeightField

    env = spotBezierEnv(render=True,
                        on_rack=on_rack,
                        height_field=height_field,
                        draw_foot_path=draw_foot_path)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    state = env.reset()

    g_u_i = GUI(env.spot.quadruped)

    spot = SpotModel()
    T_bf0 = spot.WorldToFoot
    T_bf = copy.deepcopy(T_bf0)

    bzg = BezierGait(dt=env._time_step)
    bz_step = BezierStepper(dt=env._time_step, mode=0)

    action = env.action_space.sample()

    FL_phases = []
    FR_phases = []
    BL_phases = []
    BR_phases = []

    yaw = 0.0

    print("STARTED SPOT TEST ENV")
    t = 0
    while t < (int(max_timesteps)):
        bz_step.ramp_up()

        pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = bz_step.StateMachine()

        pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = g_u_i.UserInput()

        yaw = env.return_yaw()
        P_yaw = 5.0
        if ARGS.AutoYaw:
            YawRate += -yaw * P_yaw
        # print("YAW RATE: {}".format(YawRate))

        # TEMP
        bz_step.StepLength = StepLength
        bz_step.LateralFraction = LateralFraction
        bz_step.YawRate = YawRate
        bz_step.StepVelocity = StepVelocity

        contacts = state[-4:]

        FL_phases.append(env.spot.LegPhases[0])
        FR_phases.append(env.spot.LegPhases[1])
        BL_phases.append(env.spot.LegPhases[2])
        BR_phases.append(env.spot.LegPhases[3])

        # Get Desired Foot Poses
        T_bf = bzg.GenerateTrajectory(StepLength, LateralFraction, YawRate,
                                      StepVelocity, T_bf0, T_bf,
                                      ClearanceHeight, PenetrationDepth,
                                      contacts)
        joint_angles = spot.IK(orn, pos, T_bf)
        env.pass_joint_angles(joint_angles.reshape(-1))

        # Get External Observations
        env.spot.GetExternalObservations(bzg, bz_step)

        # Step
        state, reward, done, _ = env.step(action)
        if done:
            print("DONE")
            if ARGS.AutoReset:
                env.reset()
                # plt.plot()
                # plt.plot(FL_phases, label="FL")
                # plt.plot(FR_phases, label="FR")
                # plt.plot(BL_phases, label="BL")
                # plt.plot(BR_phases, label="BR")
                # plt.xlabel("dt")
                # plt.ylabel("value")
                # plt.title("Leg Phases")
                # plt.legend()
                # plt.show()

        # time.sleep(1.0)
        t += 1

    env.close()
    print(joint_angles)
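# Reference sketch: the ARGS object used above is assumed to come from an
# argparse setup along these lines. The attribute names (DebugRack, DebugPath,
# HeightField, AutoYaw, AutoReset) match the usage above; the exact flag
# spellings and help strings are illustrative assumptions, not the repo's.
import argparse

parser = argparse.ArgumentParser(description="Spot Bezier environment tester")
parser.add_argument("--DebugRack", action="store_true",
                    help="fix the robot on a rack for debugging")
parser.add_argument("--DebugPath", action="store_true",
                    help="draw the swept foot path in the sim")
parser.add_argument("--HeightField", action="store_true",
                    help="spawn uneven terrain")
parser.add_argument("--AutoYaw", action="store_true",
                    help="apply P-control yaw correction to the commanded YawRate")
parser.add_argument("--AutoReset", action="store_true",
                    help="reset the environment automatically when done")
ARGS = parser.parse_args()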
def main():
    """ The main() function. """

    # Hold mp pipes
    mp.freeze_support()

    print("STARTING SPOT TRAINING ENV")
    seed = 0
    max_timesteps = 4e6
    eval_freq = 1e1
    save_model = True
    file_name = "spot_ars_"

    height_field = ARGS.HeightField
    contacts = not ARGS.NoContactSensing

    # Find abs path to this file
    my_path = os.path.abspath(os.path.dirname(__file__))
    results_path = os.path.join(my_path, "../results")
    if contacts:
        models_path = os.path.join(my_path, "../models/contact")
    else:
        models_path = os.path.join(my_path, "../models/no_contact")

    if not os.path.exists(results_path):
        os.makedirs(results_path)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    env = spotBezierEnv(render=False,
                        on_rack=False,
                        height_field=height_field,
                        draw_foot_path=False,
                        contacts=contacts)

    # Set seeds
    env.seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    print("STATE DIM: {}".format(state_dim))
    action_dim = env.action_space.shape[0]
    print("ACTION DIM: {}".format(action_dim))
    max_action = float(env.action_space.high[0])

    env.reset()

    g_u_i = GUI(env.spot.quadruped)

    spot = SpotModel()
    T_bf = spot.WorldToFoot

    bz_step = BezierStepper(dt=env._time_step)
    bzg = BezierGait(dt=env._time_step)

    # Initialize Normalizer
    normalizer = Normalizer(state_dim)
    # Initialize Policy
    policy = Policy(state_dim, action_dim)
    # Initialize Agent with normalizer, policy and gym env
    agent = ARSAgent(normalizer, policy, env, bz_step, bzg, spot)
    agent_num = 0
    if os.path.exists(models_path + "/" + file_name + str(agent_num) +
                      "_policy"):
        print("Loading Existing agent")
        agent.load(models_path + "/" + file_name + str(agent_num))

    env.reset(agent.desired_velocity, agent.desired_rate)
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    # Create mp pipes
    num_processes = policy.num_deltas
    processes = []
    childPipes = []
    parentPipes = []

    # Store mp pipes
    for pr in range(num_processes):
        parentPipe, childPipe = Pipe()
        parentPipes.append(parentPipe)
        childPipes.append(childPipe)

    # Start multiprocessing
    for proc_num in range(num_processes):
        p = mp.Process(target=ParallelWorker,
                       args=(childPipes[proc_num], env, state_dim))
        p.start()
        processes.append(p)

    print("STARTED SPOT TRAINING ENV")
    t = 0
    while t < (int(max_timesteps)):

        # Maximum timesteps per rollout
        episode_reward, episode_timesteps = agent.train_parallel(parentPipes)
        t += episode_timesteps
        # episode_reward = agent.train()

        # +1 to account for 0 indexing.
        # +0 on ep_timesteps since it will increment +1 even if done=True
        print(
            "Total T: {} Episode Num: {} Episode T: {} Reward: {:.2f} REWARD PER STEP: {:.2f}"
            .format(t + 1, episode_num, episode_timesteps, episode_reward,
                    episode_reward / float(episode_timesteps)))

        # Evaluate episode
        if (episode_num + 1) % eval_freq == 0:
            if save_model:
                agent.save(models_path + "/" + str(file_name) +
                           str(episode_num))
                # replay_buffer.save(t)

        episode_num += 1

    # Close pipes and hence envs
    for parentPipe in parentPipes:
        parentPipe.send([_CLOSE, "pay2"])

    for p in processes:
        p.join()
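# Reference sketch of the worker-side Pipe protocol implied by the training
# loop above and by ARSAgent.train_parallel() below: each child process loops,
# dispatching on the message tag it receives and replying with a cumulative
# rollout reward. The repo's actual ParallelWorker runs a full ARS rollout per
# message; the tag values and the stubbed rollout here are illustrative
# assumptions only.
import multiprocessing as mp

_EXPLORE = 1  # assumed tag values; the repo defines its own constants
_CLOSE = 2


def worker_sketch(child_pipe):
    while True:
        tag, payload = child_pipe.recv()
        if tag == _CLOSE:
            child_pipe.send(["CLOSED"])
            break
        if tag == _EXPLORE:
            # Real worker: unpack (normalizer, policy, direction, delta, ...)
            # from payload, run one rollout, and return its cumulative reward.
            reward = 0.0  # stub
            child_pipe.send([reward])


if __name__ == "__main__":
    parent, child = mp.Pipe()
    p = mp.Process(target=worker_sketch, args=(child,))
    p.start()
    parent.send([_EXPLORE, []])
    print(parent.recv()[0])  # 0.0 from the stubbed rollout
    parent.send([_CLOSE, "pay2"])
    p.join()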
class ARSAgent():
    def __init__(self,
                 normalizer,
                 policy,
                 env,
                 smach=None,
                 TGP=None,
                 spot=None,
                 gui=False):
        self.normalizer = normalizer
        self.policy = policy
        self.state_dim = self.policy.state_dim
        self.action_dim = self.policy.action_dim
        self.env = env
        self.max_action = float(self.env.action_space.high[0])
        self.successes = 0
        self.phase = 0
        self.desired_velocity = 0.5
        self.desired_rate = 0.0
        self.flip = 0
        self.increment = 0
        self.scaledown = True
        self.type = "Stop"
        self.smach = smach
        if smach is not None:
            self.BaseClearanceHeight = self.smach.ClearanceHeight
            self.BasePenetrationDepth = self.smach.PenetrationDepth
        self.TGP = TGP
        self.spot = spot
        if gui:
            self.g_u_i = GUI(self.env.spot.quadruped)
        else:
            self.g_u_i = None
        self.action_history = []
        self.true_action_history = []

    # Deploy Policy in one direction over one whole episode
    # DO THIS ONCE PER ROLLOUT OR DURING DEPLOYMENT
    def deploy(self, direction=None, delta=None):
        state = self.env.reset(self.desired_velocity, self.desired_rate)
        sum_rewards = 0.0
        timesteps = 0
        done = False
        while not done and timesteps < self.policy.episode_steps:
            # print("STATE: ", state)
            # print("dt: {}".format(timesteps))
            self.normalizer.observe(state)
            # Normalize State
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)

            # Clip action between +-1 for execution in env
            for a in range(len(action)):
                action[a] = np.clip(action[a], -self.max_action,
                                    self.max_action)
            # print("ACTION: ", action)
            state, reward, done, _ = self.env.step(action)
            # print("STATE: ", state)
            # Clip reward between -1 and 1 to prevent outliers from
            # distorting weights
            reward = np.clip(reward, -self.max_action, self.max_action)
            sum_rewards += reward
            timesteps += 1
        # Divide rewards by timesteps for reward-per-step + exp survive rwd
        return (sum_rewards + timesteps**cum_dt_exp) / timesteps

    # Deploy Policy in one direction over one whole episode
    # DO THIS ONCE PER ROLLOUT OR DURING DEPLOYMENT
    def deployTG(self, direction=None, delta=None):
        state = self.env.reset()
        sum_rewards = 0.0
        timesteps = 0
        done = False
        # alpha = []
        # h = []
        # f = []
        T_bf = copy.deepcopy(self.spot.WorldToFoot)
        T_b0 = copy.deepcopy(self.spot.WorldToFoot)
        self.action_history = []
        self.true_action_history = []
        action = self.env.action_space.sample()
        action[:] = 0.0
        old_act = action[:actions_to_filter]
        # For auto yaw correction
        yaw = 0.0
        while not done and timesteps < self.policy.episode_steps:
            # self.smach.ramp_up()
            pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth = self.smach.StateMachine()

            if self.g_u_i:
                pos, orn, StepLength, LateralFraction, YawRate, StepVelocity, ClearanceHeight, PenetrationDepth, SwingPeriod = self.g_u_i.UserInput()
                self.TGP.Tswing = SwingPeriod

            self.env.spot.GetExternalObservations(self.TGP, self.smach)

            # Read UPDATED state based on controls and phase
            state = self.env.return_state()
            self.normalizer.observe(state)
            # Don't normalize contacts
            state = self.normalizer.normalize(state)
            action = self.policy.evaluate(state, delta, direction)

            # Action History (raw tanh output in [-1, 1])
            self.action_history.append(np.tanh(action))

            # True Action History (scaled to physical units)
            true_action = copy.deepcopy(np.tanh(action))
            true_action[0] *= CD_SCALE
            true_action[1] = abs(true_action[1]) * Z_SCALE
            true_action[2:] *= RESIDUALS_SCALE
            self.true_action_history.append(true_action)

            action = np.tanh(action)

            # EXP FILTER
            action[:actions_to_filter] = alpha * old_act + (
                1.0 - alpha) * action[:actions_to_filter]
            old_act = action[:actions_to_filter]

            ClearanceHeight += action[0] * CD_SCALE

            # CLIP EVERYTHING
            StepLength = np.clip(StepLength,
                                 self.smach.StepLength_LIMITS[0],
                                 self.smach.StepLength_LIMITS[1])
            StepVelocity = np.clip(StepVelocity,
                                   self.smach.StepVelocity_LIMITS[0],
                                   self.smach.StepVelocity_LIMITS[1])
            LateralFraction = np.clip(LateralFraction,
                                      self.smach.LateralFraction_LIMITS[0],
                                      self.smach.LateralFraction_LIMITS[1])
            YawRate = np.clip(YawRate, self.smach.YawRate_LIMITS[0],
                              self.smach.YawRate_LIMITS[1])
            ClearanceHeight = np.clip(ClearanceHeight,
                                      self.smach.ClearanceHeight_LIMITS[0],
                                      self.smach.ClearanceHeight_LIMITS[1])
            PenetrationDepth = np.clip(PenetrationDepth,
                                       self.smach.PenetrationDepth_LIMITS[0],
                                       self.smach.PenetrationDepth_LIMITS[1])

            contacts = copy.deepcopy(state[-4:])
            # contacts = [0, 0, 0, 0]
            # print("CONTACTS: {}".format(contacts))

            yaw = self.env.return_yaw()
            if not self.g_u_i:
                YawRate += -yaw * P_yaw

            # Get Desired Foot Poses
            if timesteps > 20:
                T_bf = self.TGP.GenerateTrajectory(StepLength,
                                                   LateralFraction, YawRate,
                                                   StepVelocity, T_b0, T_bf,
                                                   ClearanceHeight,
                                                   PenetrationDepth, contacts)
            else:
                T_bf = self.TGP.GenerateTrajectory(0.0, 0.0, 0.0, 0.1, T_b0,
                                                   T_bf, ClearanceHeight,
                                                   PenetrationDepth, contacts)
                action[:] = 0.0

            action[2:] *= RESIDUALS_SCALE

            # Add DELTA to XYZ Foot Poses
            T_bf_copy = copy.deepcopy(T_bf)
            T_bf_copy["FL"][:3, 3] += action[2:5]
            T_bf_copy["FR"][:3, 3] += action[5:8]
            T_bf_copy["BL"][:3, 3] += action[8:11]
            T_bf_copy["BR"][:3, 3] += action[11:14]

            # Adjust Height!
            pos[2] += abs(action[1]) * Z_SCALE

            joint_angles = self.spot.IK(orn, pos, T_bf_copy)
            # Pass Joint Angles
            self.env.pass_joint_angles(joint_angles.reshape(-1))

            # Perform action
            next_state, reward, done, _ = self.env.step(action)
            sum_rewards += reward
            timesteps += 1

        self.TGP.reset()
        self.smach.reshuffle()
        self.smach.PenetrationDepth = self.BasePenetrationDepth
        self.smach.ClearanceHeight = self.BaseClearanceHeight
        return sum_rewards, timesteps

    def returnPose(self):
        return self.env.spot.GetBasePosition()

    def train(self):
        # Sample random expl_noise deltas
        print("-------------------------------")
        # print("Sampling Deltas")
        deltas = self.policy.sample_deltas()
        # Initialize +- reward list of size num_deltas
        positive_rewards = [0] * self.policy.num_deltas
        negative_rewards = [0] * self.policy.num_deltas

        # Execute 2*num_deltas rollouts and store +- rewards
        print("Deploying Rollouts")
        for i in range(self.policy.num_deltas):
            print("Rollout #{}".format(i + 1))
            positive_rewards[i] = self.deploy(direction="+", delta=deltas[i])
            negative_rewards[i] = self.deploy(direction="-", delta=deltas[i])

        # Calculate std dev
        std_dev_rewards = np.array(positive_rewards + negative_rewards).std()

        # Order rollouts in decreasing list using cum reward as criterion
        unsorted_rollouts = [(positive_rewards[i], negative_rewards[i],
                              deltas[i])
                             for i in range(self.policy.num_deltas)]
        # When sorting, take the max between the reward for +- disturbance
        sorted_rollouts = sorted(unsorted_rollouts,
                                 key=lambda x: max(x[0], x[1]),
                                 reverse=True)

        # Only take first best_num_deltas rollouts
        rollouts = sorted_rollouts[:self.policy.num_best_deltas]

        # Update Policy
        self.policy.update(rollouts, std_dev_rewards)

        # Execute Current Policy
        eval_reward = self.deploy()
        return eval_reward

    def train_parallel(self, parentPipes):
        """ Execute rollouts in parallel using the multiprocessing library,
            based on:
            https://github.com/bulletphysics/bullet3/blob/master/examples/pybullet/gym/pybullet_envs/ARS/ars.py
        """
        # USE VANILLA OR TG POLICY
        if self.TGP is None:
            exploration = _EXPLORE
        else:
            exploration = _EXPLORE_TG

        # Initialize the perturbation deltas and the positive/negative rewards
        deltas = self.policy.sample_deltas()
        # Initialize +- reward list of size num_deltas
        positive_rewards = [0] * self.policy.num_deltas
        negative_rewards = [0] * self.policy.num_deltas

        smach = copy.deepcopy(self.smach)
        smach.ClearanceHeight = self.BaseClearanceHeight
        smach.PenetrationDepth = self.BasePenetrationDepth
        smach.reshuffle()

        if parentPipes:
            for i in range(self.policy.num_deltas):
                # Execute each rollout on a separate process
                parentPipe = parentPipes[i]
                # NOTE: target for parentPipe specified in main_ars.py
                # (target is ParallelWorker fcn defined up top)
                parentPipe.send([
                    exploration,
                    [
                        self.normalizer, self.policy, "+", deltas[i],
                        self.desired_velocity, self.desired_rate, self.TGP,
                        smach, self.spot
                    ]
                ])
            for i in range(self.policy.num_deltas):
                # Receive cumulative reward from each rollout
                positive_rewards[i] = parentPipes[i].recv()[0]

            for i in range(self.policy.num_deltas):
                # Execute each rollout on a separate process
                parentPipe = parentPipes[i]
                parentPipe.send([
                    exploration,
                    [
                        self.normalizer, self.policy, "-", deltas[i],
                        self.desired_velocity, self.desired_rate, self.TGP,
                        smach, self.spot
                    ]
                ])
            for i in range(self.policy.num_deltas):
                # Receive cumulative reward from each rollout
                negative_rewards[i] = parentPipes[i].recv()[0]
        else:
            raise ValueError(
                "Select 'train' method if you are not using multiprocessing!")

        # Calculate std dev
        std_dev_rewards = np.array(positive_rewards + negative_rewards).std()

        # Order rollouts in decreasing list using cum reward as criterion;
        # take max between reward for +- disturbance as that rollout's reward.
        # Store max between positive and negative reward as key for sort
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(
                zip(positive_rewards, negative_rewards))
        }
        indices = sorted(scores.keys(),
                         key=lambda x: scores[x],
                         reverse=True)[:self.policy.num_deltas]
        # print("INDICES: ", indices)
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in indices]

        # Update Policy
        self.policy.update(rollouts, std_dev_rewards)

        # Execute Current Policy USING VANILLA OR TG
        if self.TGP is None:
            return self.deploy()
        else:
            return self.deployTG()

    def save(self, filename):
        """ Save the Policy

        :param filename: the name of the file where the policy is saved
        """
        with open(filename + '_policy', 'wb') as filehandle:
            pickle.dump(self.policy.theta, filehandle)

    def load(self, filename):
        """ Load the Policy

        :param filename: the name of the file where the policy is saved
        """
        with open(filename + '_policy', 'rb') as filehandle:
            self.policy.theta = pickle.load(filehandle)
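# Reference sketch of the update that Policy.update(rollouts, std_dev_rewards)
# is expected to perform, i.e. the standard ARS step: move theta along the
# reward-weighted sum of the selected deltas, scaled by the learning rate and
# the reward standard deviation. This illustrates the algorithm, not the
# repo's actual Policy implementation; learning_rate is an assumed attribute,
# while num_best_deltas appears in train() above.
import numpy as np


def ars_update_sketch(theta, rollouts, std_dev_rewards, learning_rate,
                      num_best_deltas):
    """rollouts is a list of (r_pos, r_neg, delta) tuples, as assembled in
    train() / train_parallel() above."""
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    return theta + learning_rate / (num_best_deltas * std_dev_rewards) * step

# Sorting rollouts by max(r_pos, r_neg) before the update, as train() and
# train_parallel() do, keeps only the most promising perturbation directions.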