Code example #1
import matplotlib.pyplot as plt  # plotting backend used below
# DataLog (the repository's CSV logger, providing read_log/save_log/log_kv) is assumed importable from the project's utilities.
def make_train_plots_ars(log=None, log_path=None, keys=None, save_loc=None):
    """
    Plots and saves images of all logged data
    :param log : A dictionary containing lists of data we want
    :param log_path : Path to a csv file that contains all the logged data
    :param keys : The keys of the dictionary, for the data you want to plot
    :param save_loc : Location where you want to save the images
    :returns : Nothing, saves the figures of the plot
    """
    if log is None and log_path is None:
        print("Need to provide either the log or path to a log file")
    if log is None:
        logger = DataLog()
        logger.read_log(log_path)
        log = logger.log

    plt.figure(figsize=(10, 6))
    plt.plot(log[keys[0]], log[keys[1]])
    plt.title(keys[1])
    plt.savefig(save_loc + '/' + keys[1] + '.png', dpi=100)
    plt.close()
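
A minimal usage sketch of the function above. The paths are hypothetical, and the CSV is assumed to contain 'steps' and 'return' columns, as written by DataLog.save_log in the training examples below.

# Usage sketch (hypothetical paths).
import os

save_dir = "experiments/run0/logs"      # hypothetical output directory
os.makedirs(save_dir, exist_ok=True)

# Plots 'return' against 'steps' and writes return.png into save_dir.
make_train_plots_ars(log_path=os.path.join(save_dir, "log.csv"),
                     keys=["steps", "return"],
                     save_loc=save_dir)
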
Code example #2
    def __init__(
            self,
            render=False,
            on_rack=False,
            gait='trot',
            phase=[0, no_of_points, no_of_points, 0],  #[FR, FL, BR, BL] 
            action_dim=20,
            end_steps=1000,
            stairs=False,
            downhill=False,
            seed_value=100,
            wedge=False,
            IMU_Noise=False,
            deg=5):  # deg = 5

        self._is_stairs = stairs
        self._is_wedge = wedge
        self._is_render = render
        self._on_rack = on_rack
        self.rh_along_normal = 0.24

        self.seed_value = seed_value
        random.seed(self.seed_value)

        if self._is_render:
            self._pybullet_client = bullet_client.BulletClient(
                connection_mode=pybullet.GUI)
        else:
            self._pybullet_client = bullet_client.BulletClient()

        self._theta = 0

        self._frequency = 2.5  # originally 2.5, changing for stability
        self.termination_steps = end_steps
        self.downhill = downhill

        #PD gains
        self._kp = 400
        self._kd = 10

        self.dt = 0.01
        self._frame_skip = 50
        self._n_steps = 0
        self._action_dim = action_dim

        self._obs_dim = 11

        self.action = np.zeros(self._action_dim)

        self._last_base_position = [0, 0, 0]
        self.last_yaw = 0
        self._distance_limit = float("inf")

        self.current_com_height = 0.25  # 0.243

        #wedge_parameters
        self.wedge_start = 0.5
        self.wedge_halflength = 2

        if gait == 'trot':
            phase = [0, no_of_points, no_of_points, 0]
        elif gait == 'walk':
            phase = [0, no_of_points, 3 * no_of_points / 2, no_of_points / 2]
        self._walkcon = walking_controller.WalkingController(gait_type=gait,
                                                             phase=phase)
        self.inverse = False
        self._cam_dist = 1.0
        self._cam_yaw = 0.0
        self._cam_pitch = 0.0

        self.avg_vel_per_step = 0
        self.avg_omega_per_step = 0

        self.linearV = 0
        self.angV = 0
        self.prev_vel = [0, 0, 0]

        self.x_f = 0
        self.y_f = 0

        self.clips = 7

        self.friction = 0.6
        self.ori_history_length = 3
        self.ori_history_queue = deque(
            [0] * 3 * self.ori_history_length,
            maxlen=3 * self.ori_history_length)  #observation queue

        self.step_disp = deque([0] * 100, maxlen=100)
        self.stride = 5

        self.incline_deg = deg
        self.incline_ori = 0

        self.prev_incline_vec = (0, 0, 1)

        self.terrain_pitch = []
        self.add_IMU_noise = IMU_Noise

        self.support_plane_estimated_pitch = 0
        self.support_plane_estimated_roll = 0

        self.pertub_steps = 0
        self.x_f = 0
        self.y_f = 0

        self.state_pos = StatePositionIndex()
        self.reset_pos = InitPosition()
        self.state_vel = StateVelocityIndex()
        self.actuaor = ActuatorIndex()
        self.limits = JoinLimit()

        mujoco_env.MujocoEnv.__init__(self, 'stoch23.xml', 1)
        utils.EzPickle.__init__(self)

        ## Gym env related mandatory variables
        self._obs_dim = 3 * self.ori_history_length + 2  #[r,p,y] x previous time steps, support plane roll and pitch
        observation_high = np.array([np.pi / 2] * self._obs_dim)
        observation_low = -observation_high
        self.observation_space = spaces.Box(observation_low, observation_high)

        action_high = np.array([1] * self._action_dim)
        self.action_space = spaces.Box(-action_high, action_high)

        self.commands = np.array(
            [0, 0, 0]
        )  #Joystick commands consisting of cmd_x_velocity, cmd_y_velocity, cmd_ang_velocity
        self.max_linear_xvel = 0.35  # calculation is < 0.2 m steplength times the frequency 2.5 Hz
        self.max_linear_yvel = 0  #0.25, made zero for only x direction # calculation is < 0.14 m times the frequency 2.5 Hz
        self.max_ang_vel = 0  #6, made zero for only x direction # less than one complete rotation in one second
        self.max_steplength = 0.2  # by the kinematic limits of the robot
        self.max_x_shift = 0.1  #plus minus 0.1 m
        self.max_y_shift = 0.14  # max 30 degree abduction
        self.max_z_shift = 0.1  # plus minus 0.1 m
        self.max_incline = 15  # in deg

        self.hard_reset()

        self.Set_Randomization(default=True, idx1=2, idx2=2)

        self.logger = DataLog()

        if (self._is_stairs):
            boxHalfLength = 0.1
            boxHalfWidth = 1
            boxHalfHeight = 0.015
            sh_colBox = self._pybullet_client.createCollisionShape(
                self._pybullet_client.GEOM_BOX,
                halfExtents=[boxHalfLength, boxHalfWidth, boxHalfHeight])
            boxOrigin = 0.3
            n_steps = 15
            self.stairs = []
            for i in range(n_steps):
                step = self._pybullet_client.createMultiBody(
                    baseMass=0,
                    baseCollisionShapeIndex=sh_colBox,
                    basePosition=[
                        boxOrigin + i * 2 * boxHalfLength, 0,
                        boxHalfHeight + i * 2 * boxHalfHeight
                    ],
                    baseOrientation=[0.0, 0.0, 0.0, 1])
                self.stairs.append(step)
                self._pybullet_client.changeDynamics(step,
                                                     -1,
                                                     lateralFriction=0.8)
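
The orientation history in this constructor is a fixed-length deque, so pushing the newest roll/pitch/yaw automatically discards the oldest triple, and the flattened queue plus the two support-plane angles gives the _obs_dim = 3 * ori_history_length + 2 computed above. A small self-contained sketch of that pattern (the helper names are illustrative, not the environment's actual methods):

from collections import deque

import numpy as np

ori_history_length = 3
# Rolling window of the last 3 [roll, pitch, yaw] readings, oldest first.
ori_history_queue = deque([0.0] * 3 * ori_history_length,
                          maxlen=3 * ori_history_length)

def push_orientation(rpy):
    """Appends the newest roll/pitch/yaw; maxlen drops the oldest triple."""
    ori_history_queue.extend(rpy)

def build_observation(support_roll, support_pitch):
    """Illustrative observation: flattened history + support-plane estimate."""
    return np.concatenate([np.array(ori_history_queue),
                           [support_roll, support_pitch]])

push_orientation([0.01, -0.02, 0.0])
obs = build_observation(0.0, 0.0)
assert obs.shape == (3 * ori_history_length + 2,)  # 11, matching _obs_dim above
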
Code example #3
File: train_policy.py  Project: ppalcx/Stoch3_foot
def train(env, policy, hp, parentPipes, args):
    args.logdir = "experiments/" + args.logdir
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    working_dir = os.getcwd()

    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)

    previous_dir = os.getcwd()

    os.chdir(args.logdir)
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs'): os.mkdir('logs')
    hp.to_text('hyperparameters')

    log_dir = os.getcwd()
    os.chdir(working_dir)

    for step in range(hp.nb_steps):
        if hp.domain_Rand:
            env.Set_Randomization(default=False)
        else:
            env.randomize_only_inclines()
        # Curriculum learning
        if (step > hp.curilearn):
            avail_deg = [7, 9, 11, 11]
            env.incline_deg = avail_deg[random.randint(0, 3)]
        else:
            avail_deg = [5, 7, 9]
            env.incline_deg = avail_deg[random.randint(0, 2)]

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions
        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while (p < hp.nb_directions):
                temp_p = p
                n_left = hp.nb_directions - p  # Number of directions still left to evaluate
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count
                print('total steps till now: ', total_steps,
                      'Processes done: ', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0]
                                for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # Standard deviation of the rewards from the selected best directions
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Start evaluating only after the curriculum stage (step >= hp.curilearn)
        if step >= hp.curilearn:
            # policy evaluation after specified iterations
            if step % hp.evalstep == 0:
                reward_evaluation = policyevaluation(env, policy, hp)
                logger.log_kv('steps', step)
                logger.log_kv('return', reward_evaluation)
                if (reward_evaluation > best_return):
                    best_policy = policy.theta
                    best_return = reward_evaluation
                    np.save(log_dir + "/iterations/best_policy.npy",
                            best_policy)
                print('Step:', step, 'Reward:', reward_evaluation)
                policy_path = log_dir + "/iterations/" + "policy_" + str(step)
                np.save(policy_path, policy.theta)

                logger.save_log(log_dir + "/logs/")
                make_train_plots_ars(log=logger.log,
                                     keys=['steps', 'return'],
                                     save_loc=log_dir + "/logs/")
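
policy.update(rollouts, sigma_r, args) is not shown in this example. Assuming it follows the standard Augmented Random Search rule, the update amounts to the sketch below; learning_rate stands in for whatever step size the policy object actually stores.

import numpy as np

def ars_update(theta, rollouts, sigma_r, learning_rate):
    """Sketch of the standard ARS update over the selected best directions:
    theta += lr / (b * sigma_r) * sum_k (r_pos_k - r_neg_k) * delta_k,
    where b = len(rollouts) is the number of best directions kept."""
    step = np.zeros_like(theta)
    for r_pos, r_neg, delta in rollouts:
        step += (r_pos - r_neg) * delta
    return theta + learning_rate / (len(rollouts) * sigma_r) * step
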
Code example #4
    def __init__(
            self,
            render=False,
            on_rack=False,
            gait='trot',
            phase=[0, PI, PI, 0],  #[FR, FL, BR, BL] 
            action_dim=20,
            end_steps=1000,
            stairs=False,
            downhill=False,
            seed_value=100,
            wedge=False,
            IMU_Noise=False,
            deg=5):  # deg = 5

        self.robot = Simulator.StochliteLoader(render=render, on_rack=on_rack)
        print(self.robot)
        self._is_stairs = stairs
        self._is_wedge = wedge
        self._is_render = render
        self._on_rack = on_rack
        self.rh_along_normal = 0.24

        self.seed_value = seed_value
        random.seed(self.seed_value)

        # if self._is_render:
        # 	self._pybullet_client = bullet_client.BulletClient(connection_mode=pybullet.GUI)
        # else:
        # 	self._pybullet_client = bullet_client.BulletClient()

        # self._theta = 0

        self._frequency = 2.5  # originally 2.5, changing for stability
        self.termination_steps = end_steps
        self.downhill = downhill

        #PD gains
        self._kp = 400
        self._kd = 10

        self.dt = 0.01
        self._frame_skip = 50
        self._n_steps = 0
        self._action_dim = action_dim

        self._obs_dim = 11

        self.action = np.zeros(self._action_dim)

        self._last_base_position = [0, 0, 0]
        self.last_rpy = [0, 0, 0]
        self._distance_limit = float("inf")

        self.current_com_height = 0.25  # 0.243

        #wedge_parameters
        self.wedge_start = 0.5
        self.wedge_halflength = 2

        if gait == 'trot':
            phase = [0, PI, PI, 0]
        elif gait == 'walk':
            phase = [0, PI, 3 * PI / 2, PI / 2]
        self._trajgen = trajectory_generator.TrajectoryGenerator(
            gait_type=gait, phase=phase)
        self.inverse = False
        self._cam_dist = 1.0
        self._cam_yaw = 0.0
        self._cam_pitch = 0.0

        self.avg_vel_per_step = 0
        self.avg_omega_per_step = 0

        self.linearV = 0
        self.angV = 0
        self.prev_vel = [0, 0, 0]
        self.prev_ang_vels = [0, 0,
                              0]  # roll_vel, pitch_vel, yaw_vel of prev step
        self.total_power = 0

        self.x_f = 0
        self.y_f = 0

        self.clips = 7

        self.friction = 0.6
        # self.ori_history_length = 3
        # self.ori_history_queue = deque([0]*3*self.ori_history_length,
        #                             maxlen=3*self.ori_history_length)#observation queue

        self.step_disp = deque([0] * 100, maxlen=100)
        self.stride = 5

        self.incline_deg = deg
        self.incline_ori = 0

        self.prev_incline_vec = (0, 0, 1)

        self.terrain_pitch = []
        self.add_IMU_noise = IMU_Noise

        self.INIT_POSITION = [
            0, 0, 0.3
        ]  # [0,0,0.3], Spawning stochlite higher to remove initial drift
        self.INIT_ORIENTATION = [0, 0, 0, 1]

        self.support_plane_estimated_pitch = 0
        self.support_plane_estimated_roll = 0

        self.pertub_steps = 0
        self.x_f = 0
        self.y_f = 0

        ## Gym env related mandatory variables
        self._obs_dim = 10  #[roll, pitch, roll_vel, pitch_vel, yaw_vel, SP roll, SP pitch, cmd_xvel, cmd_yvel, cmd_avel]
        observation_high = np.array([np.pi / 2] * self._obs_dim)
        observation_low = -observation_high
        self.observation_space = spaces.Box(observation_low, observation_high)

        action_high = np.array([1] * self._action_dim)
        self.action_space = spaces.Box(-action_high, action_high)

        self.commands = np.array(
            [0, 0, 0]
        )  #Joystick commands consisting of cmd_x_velocity, cmd_y_velocity, cmd_ang_velocity
        self.max_linear_xvel = 0.5  #0.4, made zero for only ang vel # calculation is < 0.2 m steplength times the frequency 2.5 Hz
        self.max_linear_yvel = 0.25  #0.25, made zero for only ang vel # calculation is < 0.14 m times the frequency 2.5 Hz
        self.max_ang_vel = 3.5  #considering less than pi/2 steer angle # less than one complete rotation in one second
        self.max_steplength = 0.2  # by the kinematic limits of the robot
        self.max_steer_angle = PI / 2  #plus minus PI/2 rads
        self.max_x_shift = 0.1  #plus minus 0.1 m
        self.max_y_shift = 0.14  # max 30 degree abduction
        self.max_z_shift = 0.1  # plus minus 0.1 m
        self.max_incline = 15  # in deg
        self.robot_length = 0.334  # measured from stochlite
        self.robot_width = 0.192  # measured from stochlite

        # self.robot.hard_reset()
        #print('render',render, on_rack)

        self.Set_Randomization(default=True, idx1=2, idx2=2)

        self.logger = DataLog()

        if (self._is_stairs):
            boxHalfLength = 0.1
            boxHalfWidth = 1
            boxHalfHeight = 0.015
            sh_colBox = self.robot.create_collision_shape(
                self.robot.geom_box(),
                halfExtents=[boxHalfLength, boxHalfWidth, boxHalfHeight])
            boxOrigin = 0.3
            n_steps = 15
            self.stairs = []
            for i in range(n_steps):
                step = self.robot.create_multi_body(
                    baseMass=0,
                    baseCollisionShapeIndex=sh_colBox,
                    basePosition=[
                        boxOrigin + i * 2 * boxHalfLength, 0,
                        boxHalfHeight + i * 2 * boxHalfHeight
                    ],
                    baseOrientation=[0.0, 0.0, 0.0, 1])
                self.stairs.append(step)
                self.robot.change_dynamics(step, -1, lateralFriction=0.8)
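
For orientation, a sketch of how this constructor might be invoked; StochliteEnv and its module are placeholder names, since the example does not show the enclosing class or file.

# Hypothetical construction call; the class and module names are placeholders.
from stochlite_env import StochliteEnv  # hypothetical import

env = StochliteEnv(render=False,   # headless simulation
                   gait='trot',    # phase becomes [0, PI, PI, 0]
                   stairs=True,    # builds the 15-step staircase above
                   wedge=False,
                   deg=5)          # initial incline in degrees
print(env.observation_space.shape)  # (10,): see the _obs_dim comment above
print(env.action_space.shape)       # (20,) with the default action_dim
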
Code example #5
def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if not os.path.isdir('iterations'): os.mkdir('iterations')
    if not os.path.isdir('logs'): os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions
        if parentPipes:
            process_count = len(parentPipes)
            p = 0
            while (p < hp.nb_directions):
                temp_p = p
                n_left = hp.nb_directions - p  # Number of directions still left to evaluate
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count

                # print('mp step has worked, ', p)
                print('total steps till now: ', total_steps,
                      'Processes done: ', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0]
                                for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std()  # Standard deviation of the rewards from the selected best directions
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Printing the final reward of the policy after the update
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', step)
        logger.log_kv('return', reward_evaluation)
        if (reward_evaluation > best_return):
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')
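
Both training loops drive worker processes through multiprocessing pipes: they send an _EXPLORE message with the policy, hyperparameters, a direction label, and a perturbation delta (example #5 also includes a normalizer) and expect (reward, step_count) back. A minimal sketch of the worker end of that protocol; the message-tag values, the _CLOSE shutdown message, and the rollout_fn helper are assumptions based only on the send/recv calls above.

import multiprocessing as mp

# Message tags (assumed values); the real constants live in the training script.
_EXPLORE = 1
_CLOSE = 2

def exploration_worker(childPipe, env, rollout_fn):
    """Hypothetical worker loop mirroring the parentPipe.send/recv calls above.
    rollout_fn(env, policy, hp, direction, delta) must return (reward, step_count),
    because that is the pair the parent unpacks from recv()."""
    while True:
        message, payload = childPipe.recv()
        if message == _EXPLORE:
            # Example #5 sends [normalizer, policy, hp, direction, delta];
            # example #3 omits the normalizer.
            normalizer, policy, hp, direction, delta = payload
            childPipe.send(rollout_fn(env, policy, hp, direction, delta))
        elif message == _CLOSE:
            childPipe.close()
            return

# Parent side: create one Pipe per worker, hand the child ends to the worker
# processes, and pass the list of parent ends to train() as parentPipes, e.g.:
#   parent_conn, child_conn = mp.Pipe()
#   mp.Process(target=exploration_worker, args=(child_conn, env, my_rollout)).start()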