def make_train_plots_ars(log=None, log_path=None, keys=None, save_loc=None):
    """
    Plots and saves images of all logged data

    :param log      : A dictionary containing lists of the data we want
    :param log_path : Path to a csv file that contains all the logged data
    :param keys     : The keys of the dictionary for the data you want to plot
    :param save_loc : Location where you want to save the images
    :returns        : Nothing, saves the figures of the plot
    """
    if log is None and log_path is None:
        print("Need to provide either the log or the path to a log file")
        return
    if log is None:
        logger = DataLog()
        logger.read_log(log_path)
        log = logger.log

    plt.figure(figsize=(10, 6))
    plt.plot(log[keys[0]], log[keys[1]])
    plt.title(keys[1])
    plt.savefig(save_loc + '/' + keys[1] + '.png', dpi=100)
    plt.close()
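# Usage sketch (illustrative, not part of the original module): plotting an in-memory
# log dict directly instead of reading a saved csv. The numeric values below are
# placeholders; the 'steps'/'return' keys mirror the ones logged by train() further down.
if __name__ == '__main__':
    demo_log = {'steps': [0, 10, 20], 'return': [-5.0, 12.3, 27.8]}  # placeholder data
    make_train_plots_ars(log=demo_log, keys=['steps', 'return'], save_loc='.')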
def __init__(self,
             render=False,
             on_rack=False,
             gait='trot',
             phase=[0, no_of_points, no_of_points, 0],  # [FR, FL, BR, BL]
             action_dim=20,
             end_steps=1000,
             stairs=False,
             downhill=False,
             seed_value=100,
             wedge=False,
             IMU_Noise=False,
             deg=5):  # deg = 5

    self._is_stairs = stairs
    self._is_wedge = wedge
    self._is_render = render
    self._on_rack = on_rack
    self.rh_along_normal = 0.24

    self.seed_value = seed_value
    random.seed(self.seed_value)

    if self._is_render:
        self._pybullet_client = bullet_client.BulletClient(
            connection_mode=pybullet.GUI)
    else:
        self._pybullet_client = bullet_client.BulletClient()

    self._theta = 0

    self._frequency = 2.5  # originally 2.5, changing for stability
    self.termination_steps = end_steps
    self.downhill = downhill

    # PD gains
    self._kp = 400
    self._kd = 10

    self.dt = 0.01
    self._frame_skip = 50
    self._n_steps = 0
    self._action_dim = action_dim

    self._obs_dim = 11

    self.action = np.zeros(self._action_dim)

    self._last_base_position = [0, 0, 0]
    self.last_yaw = 0
    self._distance_limit = float("inf")

    self.current_com_height = 0.25  # 0.243

    # wedge parameters
    self.wedge_start = 0.5
    self.wedge_halflength = 2

    if gait == 'trot':
        phase = [0, no_of_points, no_of_points, 0]
    elif gait == 'walk':
        phase = [0, no_of_points, 3 * no_of_points / 2, no_of_points / 2]
    self._walkcon = walking_controller.WalkingController(gait_type=gait,
                                                         phase=phase)
    self.inverse = False
    self._cam_dist = 1.0
    self._cam_yaw = 0.0
    self._cam_pitch = 0.0

    self.avg_vel_per_step = 0
    self.avg_omega_per_step = 0

    self.linearV = 0
    self.angV = 0
    self.prev_vel = [0, 0, 0]
    self.x_f = 0
    self.y_f = 0

    self.clips = 7

    self.friction = 0.6
    self.ori_history_length = 3
    self.ori_history_queue = deque(
        [0] * 3 * self.ori_history_length,
        maxlen=3 * self.ori_history_length)  # observation queue

    self.step_disp = deque([0] * 100, maxlen=100)
    self.stride = 5

    self.incline_deg = deg
    self.incline_ori = 0
    self.prev_incline_vec = (0, 0, 1)
    self.terrain_pitch = []
    self.add_IMU_noise = IMU_Noise

    self.support_plane_estimated_pitch = 0
    self.support_plane_estimated_roll = 0

    self.pertub_steps = 0
    self.x_f = 0
    self.y_f = 0

    self.state_pos = StatePositionIndex()
    self.reset_pos = InitPosition()
    self.state_vel = StateVelocityIndex()
    self.actuaor = ActuatorIndex()
    self.limits = JoinLimit()

    mujoco_env.MujocoEnv.__init__(self, 'stoch23.xml', 1)
    utils.EzPickle.__init__(self)

    ## Gym env related mandatory variables
    self._obs_dim = 3 * self.ori_history_length + 2  # [r,p,y] x previous time steps, support plane roll and pitch
    observation_high = np.array([np.pi / 2] * self._obs_dim)
    observation_low = -observation_high
    self.observation_space = spaces.Box(observation_low, observation_high)

    action_high = np.array([1] * self._action_dim)
    self.action_space = spaces.Box(-action_high, action_high)

    self.commands = np.array(
        [0, 0, 0]
    )  # Joystick commands consisting of cmd_x_velocity, cmd_y_velocity, cmd_ang_velocity
    self.max_linear_xvel = 0.35  # calculation is < 0.2 m steplength times the frequency 2.5 Hz
    self.max_linear_yvel = 0  # 0.25, made zero for only x direction; calculation is < 0.14 m times the frequency 2.5 Hz
    self.max_ang_vel = 0  # 6, made zero for only x direction; less than one complete rotation in one second
    self.max_steplength = 0.2  # by the kinematic limits of the robot
    self.max_x_shift = 0.1  # plus minus 0.1 m
    self.max_y_shift = 0.14  # max 30 degree abduction
    self.max_z_shift = 0.1  # plus minus 0.1 m
    self.max_incline = 15  # in deg

    self.hard_reset()

    self.Set_Randomization(default=True, idx1=2, idx2=2)

    self.logger = DataLog()

    if self._is_stairs:
        boxHalfLength = 0.1
        boxHalfWidth = 1
        boxHalfHeight = 0.015
        sh_colBox = self._pybullet_client.createCollisionShape(
            self._pybullet_client.GEOM_BOX,
            halfExtents=[boxHalfLength, boxHalfWidth, boxHalfHeight])
        boxOrigin = 0.3
        n_steps = 15
        self.stairs = []
        for i in range(n_steps):
            step = self._pybullet_client.createMultiBody(
                baseMass=0,
                baseCollisionShapeIndex=sh_colBox,
                basePosition=[
                    boxOrigin + i * 2 * boxHalfLength, 0,
                    boxHalfHeight + i * 2 * boxHalfHeight
                ],
                baseOrientation=[0.0, 0.0, 0.0, 1])
            self.stairs.append(step)
            self._pybullet_client.changeDynamics(step, -1, lateralFriction=0.8)
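# Observation sketch (assumption, not from the original file): one plausible way the
# ori_history_queue above gets packed into the (3*ori_history_length + 2)-dim
# observation. 'env' stands for an instance of the class whose __init__ is shown above;
# roll/pitch/yaw are the current IMU reading. The real method lives elsewhere in the repo.
def get_observation_sketch(env, roll, pitch, yaw):
    # push the latest [roll, pitch, yaw]; the deque drops the oldest entries automatically
    env.ori_history_queue.extend([roll, pitch, yaw])
    # history of orientations followed by the estimated support-plane roll and pitch
    return np.concatenate([np.array(env.ori_history_queue),
                           [env.support_plane_estimated_roll,
                            env.support_plane_estimated_pitch]])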
def train(env, policy, hp, parentPipes, args):
    args.logdir = "experiments/" + args.logdir
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    working_dir = os.getcwd()
    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False:
        os.mkdir('logs')
    hp.to_text('hyperparameters')
    log_dir = os.getcwd()
    os.chdir(working_dir)

    for step in range(hp.nb_steps):
        if hp.domain_Rand:
            env.Set_Randomization(default=False)
        else:
            env.randomize_only_inclines()

        # Curriculum learning
        if step > hp.curilearn:
            avail_deg = [7, 9, 11, 11]
            env.incline_deg = avail_deg[random.randint(0, 3)]
        else:
            avail_deg = [5, 7, 9]
            env.incline_deg = avail_deg[random.randint(0, 2)]

        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # Number of processes required to complete the search

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                p = p + process_count
                print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive", deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative", deltas[k], hp)

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering the positive/negative rewards of the best directions to compute their standard deviation
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Start evaluating only after the second curriculum stage
        if step >= hp.curilearn:
            # Policy evaluation after the specified number of iterations
            if step % hp.evalstep == 0:
                reward_evaluation = policyevaluation(env, policy, hp)
                logger.log_kv('steps', step)
                logger.log_kv('return', reward_evaluation)
                if reward_evaluation > best_return:
                    best_policy = policy.theta
                    best_return = reward_evaluation
                    np.save(log_dir + "/iterations/best_policy.npy", best_policy)
                print('Step:', step, 'Reward:', reward_evaluation)
                policy_path = log_dir + "/iterations/" + "policy_" + str(step)
                np.save(policy_path, policy.theta)
                logger.save_log(log_dir + "/logs/")
                make_train_plots_ars(log=logger.log,
                                     keys=['steps', 'return'],
                                     save_loc=log_dir + "/logs/")
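# For reference, a minimal sketch of the ARS update that policy.update(rollouts, sigma_r, args)
# is expected to perform. The actual Policy class lives elsewhere in the repo; the parameter
# names below (learning_rate, nb_best_directions) are assumptions based on how train() builds
# 'rollouts' and 'sigma_r' above.
def ars_update_sketch(theta, rollouts, sigma_r, learning_rate, nb_best_directions):
    # rollouts: list of (r_pos, r_neg, delta) tuples for the best directions
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    # scale the aggregated step by the learning rate and the reward standard deviation
    return theta + learning_rate / (nb_best_directions * sigma_r) * step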
def __init__(self,
             render=False,
             on_rack=False,
             gait='trot',
             phase=[0, PI, PI, 0],  # [FR, FL, BR, BL]
             action_dim=20,
             end_steps=1000,
             stairs=False,
             downhill=False,
             seed_value=100,
             wedge=False,
             IMU_Noise=False,
             deg=5):  # deg = 5

    self.robot = Simulator.StochliteLoader(render=render, on_rack=on_rack)
    print(self.robot)

    self._is_stairs = stairs
    self._is_wedge = wedge
    self._is_render = render
    self._on_rack = on_rack
    self.rh_along_normal = 0.24

    self.seed_value = seed_value
    random.seed(self.seed_value)

    # if self._is_render:
    #     self._pybullet_client = bullet_client.BulletClient(connection_mode=pybullet.GUI)
    # else:
    #     self._pybullet_client = bullet_client.BulletClient()

    # self._theta = 0

    self._frequency = 2.5  # originally 2.5, changing for stability
    self.termination_steps = end_steps
    self.downhill = downhill

    # PD gains
    self._kp = 400
    self._kd = 10

    self.dt = 0.01
    self._frame_skip = 50
    self._n_steps = 0
    self._action_dim = action_dim

    self._obs_dim = 11

    self.action = np.zeros(self._action_dim)

    self._last_base_position = [0, 0, 0]
    self.last_rpy = [0, 0, 0]
    self._distance_limit = float("inf")

    self.current_com_height = 0.25  # 0.243

    # wedge parameters
    self.wedge_start = 0.5
    self.wedge_halflength = 2

    if gait == 'trot':
        phase = [0, PI, PI, 0]
    elif gait == 'walk':
        phase = [0, PI, 3 * PI / 2, PI / 2]
    self._trajgen = trajectory_generator.TrajectoryGenerator(gait_type=gait,
                                                             phase=phase)
    self.inverse = False
    self._cam_dist = 1.0
    self._cam_yaw = 0.0
    self._cam_pitch = 0.0

    self.avg_vel_per_step = 0
    self.avg_omega_per_step = 0

    self.linearV = 0
    self.angV = 0
    self.prev_vel = [0, 0, 0]
    self.prev_ang_vels = [0, 0, 0]  # roll_vel, pitch_vel, yaw_vel of prev step
    self.total_power = 0
    self.x_f = 0
    self.y_f = 0

    self.clips = 7

    self.friction = 0.6
    # self.ori_history_length = 3
    # self.ori_history_queue = deque([0]*3*self.ori_history_length,
    #                                maxlen=3*self.ori_history_length)  # observation queue

    self.step_disp = deque([0] * 100, maxlen=100)
    self.stride = 5

    self.incline_deg = deg
    self.incline_ori = 0
    self.prev_incline_vec = (0, 0, 1)
    self.terrain_pitch = []
    self.add_IMU_noise = IMU_Noise
    self.INIT_POSITION = [0, 0, 0.3]  # [0,0,0.3], spawning stochlite higher to remove initial drift
    self.INIT_ORIENTATION = [0, 0, 0, 1]

    self.support_plane_estimated_pitch = 0
    self.support_plane_estimated_roll = 0

    self.pertub_steps = 0
    self.x_f = 0
    self.y_f = 0

    ## Gym env related mandatory variables
    self._obs_dim = 10  # [roll, pitch, roll_vel, pitch_vel, yaw_vel, SP roll, SP pitch, cmd_xvel, cmd_yvel, cmd_avel]
    observation_high = np.array([np.pi / 2] * self._obs_dim)
    observation_low = -observation_high
    self.observation_space = spaces.Box(observation_low, observation_high)

    action_high = np.array([1] * self._action_dim)
    self.action_space = spaces.Box(-action_high, action_high)

    self.commands = np.array(
        [0, 0, 0]
    )  # Joystick commands consisting of cmd_x_velocity, cmd_y_velocity, cmd_ang_velocity
    self.max_linear_xvel = 0.5  # 0.4, made zero for only ang vel; calculation is < 0.2 m steplength times the frequency 2.5 Hz
    self.max_linear_yvel = 0.25  # 0.25, made zero for only ang vel; calculation is < 0.14 m times the frequency 2.5 Hz
    self.max_ang_vel = 3.5  # considering less than pi/2 steer angle; less than one complete rotation in one second
    self.max_steplength = 0.2  # by the kinematic limits of the robot
    self.max_steer_angle = PI / 2  # plus minus PI/2 rads
    self.max_x_shift = 0.1  # plus minus 0.1 m
    self.max_y_shift = 0.14  # max 30 degree abduction
    self.max_z_shift = 0.1  # plus minus 0.1 m
    self.max_incline = 15  # in deg
    self.robot_length = 0.334  # measured from stochlite
    self.robot_width = 0.192  # measured from stochlite

    # self.robot.hard_reset()
    # print('render', render, on_rack)

    self.Set_Randomization(default=True, idx1=2, idx2=2)

    self.logger = DataLog()

    if self._is_stairs:
        boxHalfLength = 0.1
        boxHalfWidth = 1
        boxHalfHeight = 0.015
        sh_colBox = self.robot.create_collision_shape(
            self.robot.geom_box(),
            halfExtents=[boxHalfLength, boxHalfWidth, boxHalfHeight])
        boxOrigin = 0.3
        n_steps = 15
        self.stairs = []
        for i in range(n_steps):
            step = self.robot.create_multi_body(
                baseMass=0,
                baseCollisionShapeIndex=sh_colBox,
                basePosition=[
                    boxOrigin + i * 2 * boxHalfLength, 0,
                    boxHalfHeight + i * 2 * boxHalfHeight
                ],
                baseOrientation=[0.0, 0.0, 0.0, 1])
            self.stairs.append(step)
            self.robot.change_dynamics(step, -1, lateralFriction=0.8)
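# Command-clipping sketch (assumption, not from the original file): how the joystick
# commands above could be bounded by max_linear_xvel / max_linear_yvel / max_ang_vel
# before being passed to the trajectory generator. 'env' stands for an instance of the
# class whose __init__ is shown above; the real handling lives elsewhere in the repo.
def clip_commands_sketch(env, cmd_xvel, cmd_yvel, cmd_avel):
    # clip each command to the symmetric limits defined in __init__
    return np.array([
        np.clip(cmd_xvel, -env.max_linear_xvel, env.max_linear_xvel),
        np.clip(cmd_yvel, -env.max_linear_yvel, env.max_linear_yvel),
        np.clip(cmd_avel, -env.max_ang_vel, env.max_ang_vel),
    ])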
def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False:
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbation deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                temp_p = p
                n_left = hp.nb_directions - p  # Number of processes required to complete the search

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1

                p = p + process_count
                # print('mp step has worked, ', p)
                print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive", deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative", deltas[k], hp)

        # Sorting the rollouts by max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering the positive/negative rewards of the best directions to compute their standard deviation
        all_rewards = np.array([x[0] for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Printing the final reward of the policy after the update
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', step)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
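# Sketch of the running-state normalizer this variant of train() passes to the workers.
# The real class is defined elsewhere in the repo; what follows is the standard online
# mean/variance normalizer used in common ARS implementations, shown here for reference
# (class and attribute names are assumptions).
class NormalizerSketch:
    def __init__(self, nb_inputs):
        self.n = np.zeros(nb_inputs)
        self.mean = np.zeros(nb_inputs)
        self.mean_diff = np.zeros(nb_inputs)
        self.var = np.zeros(nb_inputs)

    def observe(self, x):
        # update the running mean and variance with a new observation
        self.n += 1.0
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=1e-2)

    def normalize(self, inputs):
        # zero-mean, unit-variance scaling of the observation
        return (inputs - self.mean) / np.sqrt(self.var)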