Code example #1
 def __init__(self, config, replay_buffer, predict_reward_and_status_func):
     self.config = config
     self.replay_buffer = replay_buffer
     self.target_potential_point = PotentialPoint.from_config(config)[-1]
     self.predict_reward_and_status_func = predict_reward_and_status_func
     # the following buffer saves the transition we are about to add
     self.augmented_buffer = []
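All of the examples on this page share the same setup: a YAML configuration is loaded, PotentialPoint.from_config(config) turns it into a list of potential points, and that list is handed to OpenraveManager, with the last entry treated as the target point (as in example #1 above). The following is a minimal sketch of that common pattern, assuming the config/config.yml location used in examples #14, #15 and #19; it is an illustration, not code taken from any of the projects below.

import os
import yaml

from openrave_manager import OpenraveManager
from potential_point import PotentialPoint

# load the YAML configuration (same path as in examples #14, #15 and #19)
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

# the configuration defines the potential points; the last one is used as the target
potential_points = PotentialPoint.from_config(config)
target_potential_point = potential_points[-1]

# the manager takes a segment validity step size and the potential points
openrave_manager = OpenraveManager(
    config['openrave_rl']['segment_validity_step'], potential_points)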
Code example #2
def print_model_stats(pre_trained_reward_network, test_batch_size, sess):
    # read the data
    test = load_data_from(os.path.join("supervised_data", "test"),
                          max_read=10 * test_batch_size)
    print(len(test))

    # shuffle the test data
    random.shuffle(test)

    openrave_manager = OpenraveManager(
        0.001, PotentialPoint.from_config(pre_trained_reward_network.config))

    sess.run(tf.global_variables_initializer())

    # run test for one (random) batch
    random.shuffle(test)
    test_batch = oversample_batch(test, 0, test_batch_size)
    test_batch, test_rewards, test_status = get_batch_and_labels(
        test_batch, openrave_manager)
    reward_prediction, status_prediction = pre_trained_reward_network.make_prediction(
        *([sess] + test_batch))
    # see what happens for different reward classes:
    (
        goal_rewards_stats,
        collision_rewards_stats,
        other_rewards_stats,
    ) = compute_stats_per_class(test_status, test_rewards, status_prediction,
                                reward_prediction)
    print("before loading weights")
    print("goal mean_error {} max_error {} accuracy {}".format(
        *goal_rewards_stats))
    print("collision mean_error {} max_error {} accuracy {}".format(
        *collision_rewards_stats))
    print("other mean_error {} max_error {} accuracy {}".format(
        *other_rewards_stats))

    # load weights
    pre_trained_reward_network.load_weights(sess)
    # run test for one (random) batch
    random.shuffle(test)

    test_batch = oversample_batch(test, 0, test_batch_size)
    test_batch, test_rewards, test_status = get_batch_and_labels(
        test_batch, openrave_manager)
    reward_prediction, status_prediction = pre_trained_reward_network.make_prediction(
        *([sess] + test_batch))
    # see what happens for different reward classes:
    (
        goal_rewards_stats,
        collision_rewards_stats,
        other_rewards_stats,
    ) = compute_stats_per_class(test_status, test_rewards, status_prediction,
                                reward_prediction)
    print("after loading weights")
    print("goal mean_error {} max_error {} accuracy {}".format(
        *goal_rewards_stats))
    print("collision mean_error {} max_error {} accuracy {}".format(
        *collision_rewards_stats))
    print("other mean_error {} max_error {} accuracy {}".format(
        *other_rewards_stats))
Code example #3
 def _generate_single_workspace(self, workspace_id):
     while True:
         a = datetime.datetime.now()
         workspace_params = self.generator.generate_workspace()
         self.openrave_manager = OpenraveManager(self.config['openrave_rl']['segment_validity_step'],
                                                 PotentialPoint.from_config(self.config))
         self.openrave_manager.loaded_params_path = None
         self.openrave_manager.load_params(workspace_params, '')
         successful_trajectories_count = 0
         i = 0
         for i in range(self.test_trajectories):
             # see if there is hope
             trajectories_left = self.test_trajectories - i
             if trajectories_left + successful_trajectories_count < self.trajectories_required_to_pass:
                 print 'no hope to get the required ratio'
                 break
             # try a trajectory
             successful_trajectories_count += self._try_plan(workspace_params, self.openrave_manager) is not None
             # if successful update the status
             if successful_trajectories_count >= self.trajectories_required_to_pass:
                 print 'workspace found'
                 save_path = os.path.join(output_dir, '{}_workspace.pkl'.format(workspace_id))
                 workspace_params.save(save_path)
                 return
         b = datetime.datetime.now()
         print 'trajectories tried {}'.format(i)
         print 'success count {}'.format(successful_trajectories_count)
         print 'time since start {}'.format(b - a)
         print ''
Code example #4
 def _generate_single_workspace(self, workspace_id):
     while True:
         a = datetime.datetime.now()
         workspace_params = self.generator.generate_workspace()
         self.openrave_manager = OpenraveManager(
             self.config["openrave_rl"]["segment_validity_step"],
             PotentialPoint.from_config(self.config),
         )
         self.openrave_manager.loaded_params_path = None
         self.openrave_manager.load_params(workspace_params, "")
         successful_trajectories_count = 0
         i = 0
         for i in range(self.test_trajectories):
             # see if there is hope
             trajectories_left = self.test_trajectories - i
             if (trajectories_left + successful_trajectories_count <
                     self.trajectories_required_to_pass):
                 print("no hope to get the required ratio")
                 break
             # try a trajectory
             successful_trajectories_count += (self._try_plan(
                 workspace_params, self.openrave_manager) is not None)
             # if successful update the status
             if successful_trajectories_count >= self.trajectories_required_to_pass:
                 print("workspace found")
                 save_path = os.path.join(
                     output_dir, "{}_workspace.pkl".format(workspace_id))
                 workspace_params.save(save_path)
                 return
         b = datetime.datetime.now()
         print("trajectories tried {}".format(i))
         print("success count {}".format(successful_trajectories_count))
         print("time since start {}".format(b - a))
         print("")
Code example #5
 def __init__(self, config, rollout_manager, results_directory):
     self.config = config
     self.rollout_manager = rollout_manager
     self.results_directory = results_directory
     self._make_dir(self.results_directory)
     potential_points_path = os.path.join(self.results_directory, 'potential_points.p')
     pickle.dump(PotentialPoint.from_config(config), open(potential_points_path, 'w'))
     self._is_vision = config['model']['consider_image']
Code example #6
 def get_manager_for_workspace(workspace_id, config):
     directory = os.path.abspath(
         os.path.expanduser(config['data']['directory']))
     workspace_dir = os.path.join(directory, workspace_id)
     potential_points = PotentialPoint.from_config(config)
     openrave_manager = OpenraveManager(
         config['data']['joint_segment_validity_step'], potential_points)
     workspace_params = WorkspaceParams.load_from_file(
         data_filepaths.get_workspace_params_path(workspace_dir))
     openrave_manager.load_params(workspace_params)
     return openrave_manager, workspace_dir
Code example #7
    def __init__(self, config):
        self.action_step_size = config['openrave_rl']['action_step_size']
        self.goal_sensitivity = config['openrave_rl']['goal_sensitivity']
        self.keep_alive_penalty = config['openrave_rl']['keep_alive_penalty']
        self.truncate_penalty = config['openrave_rl']['truncate_penalty']

        self.openrave_manager = OpenraveManager(
            config['openrave_rl']['segment_validity_step'], PotentialPoint.from_config(config))

        self.current_joints = None
        self.goal_joints = None
        self.start_joints = None
        self.traj = None
Code example #8
def produce_transitions(data_dir, cache_dir):
    print "producing transition data from original trajectories at {}".format(
        data_dir)
    assert os.path.exists(data_dir)

    if os.path.exists(cache_dir):
        print "found cache dir at {}, assuming all transitions are present there (if not delete the directory)".format(
            cache_dir)
        return

    print "cache not found, creating cache at: {}".format(cache_dir)
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    files = [
        file for file in os.listdir(data_dir) if file.endswith(".path_pkl")
    ]
    assert len(files) > 0
    target_point = PotentialPoint.from_config(config)[-1]
    for file in files:
        print "loading file {}".format(file)
        with bz2.BZ2File(os.path.join(data_dir, file), "r") as compressed_file:
            paths = pickle.load(compressed_file)

        print "asserting step sizes match"
        step_size = config["openrave_rl"]["action_step_size"] + 0.00001
        for (traj, _) in paths:
            for i in range(len(traj) - 1):
                assert (
                    np.linalg.norm(np.array(traj[i]) - np.array(traj[i + 1])) <
                    step_size)

        print "creating transitions"
        transitions = []
        for (traj, poses_trajectory) in paths:
            goal_joints = traj[-1]
            goal_pose = poses_trajectory[-1][target_point.tuple]
            for i in range(len(traj) - 1):
                joints = traj[i]
                next_joints = traj[i + 1]
                transition = (joints[1:], next_joints[1:], goal_joints[1:],
                              goal_pose)
                transitions.append(transition)

        transition_file = os.path.join(cache_dir, file + ".transitions_cache")
        print "writing transitions file {}".format(transition_file)
        with open(transition_file, "w") as pickle_file:
            pickle.dump(transitions, pickle_file)
        # with bz2.BZ2File(transition_file, 'w') as compressed_file:
        #     pickle.dump(transitions, compressed_file)

    print "cache created at {}".format(cache_dir)
Code example #9
    def __init__(self, config):
        self.action_step_size = config["openrave_rl"]["action_step_size"]
        self.goal_sensitivity = config["openrave_rl"]["goal_sensitivity"]
        self.keep_alive_penalty = config["openrave_rl"]["keep_alive_penalty"]
        self.truncate_penalty = config["openrave_rl"]["truncate_penalty"]

        self.openrave_manager = OpenraveManager(
            config["openrave_rl"]["segment_validity_step"],
            PotentialPoint.from_config(config),
        )

        self.current_joints = None
        self.goal_joints = None
        self.start_joints = None
        self.traj = None
Code example #10
    def __init__(self, config):
        self.action_step_size = config['openrave_rl']['action_step_size']
        self.goal_sensitivity = config['openrave_rl']['goal_sensitivity']
        self.challenging_trajectories_only = config['openrave_planner'][
            'challenging_trajectories_only']
        self.planner_iterations_start = config['openrave_planner'][
            'planner_iterations_start']
        self.planner_iterations_increase = config['openrave_planner'][
            'planner_iterations_increase']
        self.planner_iterations_decrease = config['openrave_planner'][
            'planner_iterations_decrease']
        self.max_planner_iterations = self.planner_iterations_start

        self.openrave_manager = OpenraveManager(
            config['openrave_rl']['segment_validity_step'],
            PotentialPoint.from_config(config))
Code example #11
    def __init__(self, config):
        self.action_step_size = config["openrave_rl"]["action_step_size"]
        self.goal_sensitivity = config["openrave_rl"]["goal_sensitivity"]
        self.challenging_trajectories_only = config["openrave_planner"][
            "challenging_trajectories_only"]
        self.planner_iterations_start = config["openrave_planner"][
            "planner_iterations_start"]
        self.planner_iterations_increase = config["openrave_planner"][
            "planner_iterations_increase"]
        self.planner_iterations_decrease = config["openrave_planner"][
            "planner_iterations_decrease"]
        self.max_planner_iterations = self.planner_iterations_start

        self.openrave_manager = OpenraveManager(
            config["openrave_rl"]["segment_validity_step"],
            PotentialPoint.from_config(config),
        )
Code example #12
def run_motion_planner():
    result = None
    openrave_manager = OpenraveManager(
        config['openrave_rl']['segment_validity_step'],
        PotentialPoint.from_config(config))
    for start_joints, goal_joints, workspace_id, _ in queries:
        params_file_path = image_cache.items[workspace_id].full_filename
        openrave_manager.set_params(params_file_path)
        for i in range(repeat):
            start_time = datetime.datetime.now()
            traj = openrave_manager.plan(start_joints, goal_joints, None)
            # assert traj is not None
            end_time = datetime.datetime.now()
            time_diff = end_time - start_time
            if result is None:
                result = time_diff
            else:
                result += time_diff
    return result
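run_motion_planner in example #12 accumulates the wall-clock planning time over every query and repetition. A hypothetical follow-up that turns the accumulated timedelta into an average per planning attempt; queries and repeat are the module-level values the function already uses.

total_time = run_motion_planner()
# datetime.timedelta supports division by an integer count
average_time = total_time / (len(queries) * repeat)
print('average planning time per attempt {}'.format(average_time))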
Code example #13
                    np.linalg.norm(np.array(traj[i]) - np.array(traj[i + 1]))
                    < step_size
                )

        paths_file = os.path.join(cache_dir, file + ".paths_cache")
        print("writing paths file {}".format(paths_file))
        with open(paths_file, "w") as pickle_file:
            pickle.dump(paths, pickle_file)

    print("cache created at {}".format(cache_dir))


train_original_dir = os.path.join("imitation_data", scenario, "train")
train_transitions_dir = os.path.join("imitation_data_transitions", scenario, "train")
train_transitions_dir = os.path.join(
    train_transitions_dir, PotentialPoint.from_config(config)[-1].str
)
produce_transitions(train_original_dir, train_transitions_dir)
train_paths_dir = os.path.join("imitation_data_paths", scenario, "train")
produce_paths(train_original_dir, train_paths_dir)

test_original_dir = os.path.join("imitation_data", scenario, "test")
test_transitions_dir = os.path.join("imitation_data_transitions", scenario, "test")
test_transitions_dir = os.path.join(
    test_transitions_dir, PotentialPoint.from_config(config)[-1].str
)
produce_transitions(test_original_dir, test_transitions_dir)
test_paths_dir = os.path.join("imitation_data_paths", scenario, "test")
produce_paths(test_original_dir, test_paths_dir)

Code example #14
scenario = 'hard'
model_name = '2019_01_25_10_09_04'
number_of_imitation_files = 3
sphere_limitation = 1000

imitation_data_path = os.path.abspath(os.path.expanduser(os.path.join('~/ModelBasedDDPG/imitation_data', scenario)))
rl_trajectories_data_path = os.path.abspath(os.path.expanduser(
    os.path.join('~/ModelBasedDDPG/', scenario, 'trajectories', model_name)))

# load configuration
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

# load the workspace
openrave_manager = OpenraveManager(config['openrave_rl']['segment_validity_step'], PotentialPoint.from_config(config))


def process_poses(target_poses, x_coordinate_range=(0.0, 0.13), z_coordinate_range=(0.3, 0.45)):
    return [p for p in target_poses if x_coordinate_range[0] <= p[0] <= x_coordinate_range[1] and z_coordinate_range[0] <= p[1] <= z_coordinate_range[1]]


def process_rl_files(data_dir, trajectory_limitation):
    steps_offset = 40
    steps_increase = 2000
    trajectories_seen = 0
    result = []
    while trajectories_seen < trajectory_limitation:
        global_step_dir = os.path.join(data_dir, '{}'.format(steps_offset))
        steps_offset += steps_increase
        for dirpath, dirnames, filenames in os.walk(global_step_dir):
Code example #15
collision_samples = 10000
# show_close_to_goal = True
show_close_to_goal = False
close_to_goal_samples = 10000
show_pose_action_direction_arrow = True
show_goal_end_effector_pose = True

# load configuration
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)

# load the workspace
openrave_manager = OpenraveManager(
    config['openrave_rl']['segment_validity_step'],
    PotentialPoint.from_config(config))
params_file = os.path.abspath(
    os.path.expanduser(
        os.path.join('~/ModelBasedDDPG/scenario_params', scenario,
                     'params.pkl')))
openrave_manager.load_params(WorkspaceParams.load_from_file(params_file))
openrave_manager.robot.SetDOFValues([0.0] + goal_joints, [0, 1, 2, 3, 4])

openrave_manager.get_initialized_viewer()
red_color = np.array([1.0, 0.0, 0.0])
yellow_color = np.array([1.0, 1.0, 0.0])
green_color = np.array([0.0, 1.0, 0.0])


def create_sphere(id, radius, openrave_manager):
    body = RaveCreateKinBody(openrave_manager.env, '')
Code example #16
                    np.array(traj[i]) - np.array(traj[i + 1])) < step_size

        paths_file = os.path.join(cache_dir, file + '.paths_cache')
        print 'writing paths file {}'.format(paths_file)
        with open(paths_file, 'w') as pickle_file:
            pickle.dump(paths, pickle_file)

    print 'cache created at {}'.format(cache_dir)


train_original_dir = os.path.join('imitation_data', scenario, 'train')
train_transitions_dir = os.path.join('imitation_data_transitions', scenario,
                                     'train')
train_transitions_dir = os.path.join(
    train_transitions_dir,
    PotentialPoint.from_config(config)[-1].str)
produce_transitions(train_original_dir, train_transitions_dir)
train_paths_dir = os.path.join('imitation_data_paths', scenario, 'train')
produce_paths(train_original_dir, train_paths_dir)

test_original_dir = os.path.join('imitation_data', scenario, 'test')
test_transitions_dir = os.path.join('imitation_data_transitions', scenario,
                                    'test')
test_transitions_dir = os.path.join(test_transitions_dir,
                                    PotentialPoint.from_config(config)[-1].str)
produce_transitions(test_original_dir, test_transitions_dir)
test_paths_dir = os.path.join('imitation_data_paths', scenario, 'test')
produce_paths(test_original_dir, test_paths_dir)


def get_files(paths_dir, transitions_dir, max_files=None):
Code example #17
File: network.py Project: scleronomic/ModelBasedDDPG
    def __init__(self,
                 config,
                 is_rollout_agent,
                 image_shape=(55, 111),
                 number_of_joints=4,
                 pose_dimensions=2,
                 pre_trained_reward=None,
                 name_prefix=None):
        self.name_prefix = os.getpid() if name_prefix is None else name_prefix
        self.config = config
        self.potential_points = PotentialPoint.from_config(config)

        # input related data
        self.image_shape = image_shape
        self.number_of_joints = number_of_joints
        self.pose_dimensions = pose_dimensions

        # generate inputs
        all_inputs = self._create_inputs()
        self.joints_inputs = all_inputs[0]
        self.workspace_image_inputs = all_inputs[1]
        self.goal_joints_inputs = all_inputs[2]
        self.goal_pose_inputs = all_inputs[3]

        # images for vision
        self.images_3d = None
        if self.workspace_image_inputs is not None:
            self.images_3d = tf.expand_dims(self.workspace_image_inputs,
                                            axis=-1)

        # since we take partial derivatives w.r.t subsets of the parameters, we always need to remember which parameters
        # are currently being added. note that this also causes the model to be non thread safe, therefore the creation
        # must happen sequentially

        # online actor network
        variable_count = len(tf.trainable_variables())
        actor_results = self._create_actor_network(self.joints_inputs,
                                                   is_online=True,
                                                   reuse_flag=False)
        self.online_action = actor_results[0]
        online_actor_tanh = actor_results[1]
        self.online_actor_params = tf.trainable_variables()[variable_count:]

        # create placeholders and assign ops to set these weights manually (used by rollout agents)
        self.online_actor_parameter_weights_placeholders = {
            var.name: tf.placeholder(tf.float32, var.get_shape())
            for var in self.online_actor_params
        }
        self.online_actor_parameters_assign_ops = [
            tf.assign(
                var,
                self.online_actor_parameter_weights_placeholders[var.name])
            for var in self.online_actor_params
        ]

        # target actor network
        variable_count = len(tf.trainable_variables())
        actor_results = self._create_actor_network(self.joints_inputs,
                                                   is_online=False,
                                                   reuse_flag=False)
        self.target_action = actor_results[0]
        self.target_actor_params = tf.trainable_variables()[variable_count:]

        # create placeholders and assign ops to set these weights manually (used by rollout agents)
        self.target_actor_parameter_weights_placeholders = {
            var.name: tf.placeholder(tf.float32, var.get_shape())
            for var in self.target_actor_params
        }
        self.target_actor_parameters_assign_ops = [
            tf.assign(
                var,
                self.target_actor_parameter_weights_placeholders[var.name])
            for var in self.target_actor_params
        ]

        # this is as much as a rollout agent needs
        if is_rollout_agent:
            return

        tau = self.config['model']['tau']
        gamma = self.config['model']['gamma']
        use_reward_model = self.config['model']['use_reward_model']
        self.forward_model_next_state, self.forward_model_action, forward_model_tanh = None, None, None
        if use_reward_model:
            # deterministic value of the next state (from current state, executing the online action)
            self.forward_model_next_state = self._next_state_model(
            ) if use_reward_model else None

            # online actor network for the result of the forward model
            variable_count = len(tf.trainable_variables())
            actor_results = self._create_actor_network(
                self.forward_model_next_state, is_online=True, reuse_flag=True)
            self.forward_model_action = actor_results[0]
            forward_model_tanh = actor_results[1]
            assert variable_count == len(tf.trainable_variables(
            ))  # make sure no new parameters were added

        # periodically update target actor with online actor weights
        self.update_actor_target_params = \
            [self.target_actor_params[i].assign(
                tf.multiply(self.online_actor_params[i], tau) + tf.multiply(self.target_actor_params[i], 1. - tau)
            ) for i in range(len(self.target_actor_params))]

        # create inputs for the critic and reward network when using a constant action
        self.action_inputs = tf.placeholder(tf.float32,
                                            (None, self.number_of_joints),
                                            name='action_inputs')

        # online critic for predicting the q value for a specific joints+action pair
        variable_count = len(tf.trainable_variables())
        self.online_q_value_fixed_action = self._create_critic_network(
            self.joints_inputs,
            self.action_inputs,
            is_online=True,
            reuse_flag=False,
            add_regularization_loss=True)
        online_critic_params = tf.trainable_variables()[variable_count:]

        # online critic for predicting the q value for actor update.
        # if using a reward model, the joint inputs are given by the forward model and so are the actions.
        # if in regular ddpg, the joints inputs are given by the current state inputs, the actions are the policy on
        # these joints.
        variable_count = len(tf.trainable_variables())
        self.online_q_value_under_policy = self._create_critic_network(
            joints_input=self.forward_model_next_state
            if use_reward_model else self.joints_inputs,
            action_input=self.forward_model_action
            if use_reward_model else self.online_action,
            is_online=True,
            reuse_flag=True,
            add_regularization_loss=False)
        assert variable_count == len(
            tf.trainable_variables())  # make sure no new parameters were added

        # target critic network, predicting the q value current state under the target policy
        variable_count = len(tf.trainable_variables())
        self.target_q_value_under_policy = self._create_critic_network(
            self.joints_inputs,
            self.target_action,
            is_online=False,
            reuse_flag=False,
            add_regularization_loss=False)
        target_critic_params = tf.trainable_variables()[variable_count:]

        # periodically update target critic with online critic weights
        self.update_critic_target_params = \
            [target_critic_params[i].assign(
                tf.multiply(online_critic_params[i], tau) + tf.multiply(target_critic_params[i], 1. - tau)
            ) for i in range(len(target_critic_params))]

        self.fixed_action_reward, self.fixed_action_termination, self.online_action_reward, self.online_action_termination = None, None, None, None
        if use_reward_model:
            assert pre_trained_reward is not None
            variable_count = len(tf.trainable_variables())
            # reward network to predict the immediate reward of a given action
            self.fixed_action_reward, fixed_action_status = pre_trained_reward.create_reward_network(
                self.joints_inputs, self.action_inputs,
                self.goal_joints_inputs, self.goal_pose_inputs, self.images_3d)
            self.fixed_action_termination = self._compute_termination_from_status(
                fixed_action_status)
            # reward network to predict the immediate reward of the online policy action
            self.online_action_reward, online_action_status = pre_trained_reward.create_reward_network(
                self.joints_inputs, self.online_action,
                self.goal_joints_inputs, self.goal_pose_inputs, self.images_3d)
            self.online_action_termination = self._compute_termination_from_status(
                online_action_status)
            assert variable_count == len(tf.trainable_variables())

        # the label to use to train the online critic network
        self.scalar_label = tf.placeholder(tf.float32, [None, 1])

        batch_size = tf.cast(tf.shape(self.joints_inputs)[0], tf.float32)

        # critic optimization
        critic_prediction_loss = tf.losses.mean_squared_error(
            self.scalar_label, self.online_q_value_fixed_action)
        critic_regularization = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        critic_regularization_loss = tf.add_n(
            critic_regularization) if len(critic_regularization) > 0 else 0.0
        self.critic_total_loss = critic_prediction_loss + critic_regularization_loss

        self.critic_initial_gradients_norm, self.critic_clipped_gradients_norm, self.optimize_critic = \
            self._optimize_by_loss(
                self.critic_total_loss, online_critic_params, self.config['critic']['learning_rate'],
                self.config['critic']['gradient_limit']
            )

        # summaries for the critic optimization
        self.critic_optimization_summaries = tf.summary.merge([
            tf.summary.scalar('critic_prediction_loss',
                              critic_prediction_loss),
            tf.summary.scalar('critic_regularization_loss',
                              critic_regularization_loss),
            tf.summary.scalar('critic_total_loss', self.critic_total_loss),
            tf.summary.scalar('critic_gradients_norm_initial',
                              self.critic_initial_gradients_norm),
            tf.summary.scalar('critic_gradients_norm_clipped',
                              self.critic_clipped_gradients_norm),
            tf.summary.scalar('critic_mean_prediction',
                              tf.reduce_mean(
                                  self.online_q_value_fixed_action)),
            tf.summary.histogram('critic_prediction_distribution',
                                 self.online_q_value_fixed_action),
        ])

        # when training the actor we derive the advantage w.r.t mu's network params (mu is the online policy)
        if use_reward_model:
            # advantage is r(s, mu(s)) + \gamma * q(f(s, mu(s)), mu(f(s, mu(s))))
            include_next_state = (1.0 - self.online_action_termination)
            # include_next_state = 1.0
            self.actor_loss = -(
                self.online_action_reward +
                gamma * self.online_q_value_under_policy * include_next_state
                # this is actually the policy on the forward model output
            )
        else:
            # advantage is q(s, mu(s))
            self.actor_loss = -self.online_q_value_under_policy
        self.actor_loss = tf.reduce_sum(self.actor_loss)
        # if we have extra losses for the actor:
        tanh_loss_summary = None
        if self.config['action_predictor'][
                'tanh_preactivation_loss_coefficient'] > 0.0:
            tanh_preactivation_loss = tf.losses.mean_squared_error(
                tf.zeros_like(online_actor_tanh), online_actor_tanh)
            if use_reward_model:
                forward_model_tanh_preactivation_loss = tf.losses.mean_squared_error(
                    tf.zeros_like(forward_model_tanh), forward_model_tanh)
                tanh_preactivation_loss += forward_model_tanh_preactivation_loss
            tanh_preactivation_loss *= self.config['action_predictor'][
                'tanh_preactivation_loss_coefficient']
            self.actor_loss += tanh_preactivation_loss
            tanh_loss_summary = tf.summary.scalar('tanh_preactivation_loss',
                                                  tanh_preactivation_loss)

        # divide by the batch size
        self.actor_loss = tf.div(self.actor_loss, batch_size)

        self.actor_initial_gradients_norm, self.actor_clipped_gradients_norm, self.optimize_actor = \
            self._optimize_by_loss(
                self.actor_loss, self.online_actor_params, self.config['actor']['learning_rate'],
                self.config['actor']['gradient_limit']
            )

        # summaries for the optimization
        merge_list = [
            tf.summary.scalar('actor_gradients_norm_initial',
                              self.actor_initial_gradients_norm),
            tf.summary.scalar('actor_gradients_norm_clipped',
                              self.actor_clipped_gradients_norm),
            tf.summary.scalar('actor_total_loss', self.actor_loss),
        ]
        if tanh_loss_summary is not None:
            merge_list.append(tanh_loss_summary)
        self.actor_optimization_summaries = tf.summary.merge(merge_list)
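Example #17 builds, for both the online and the target actor, a per-variable placeholder plus a tf.assign op so that rollout agents can have their actor weights overwritten from outside the graph. The following is a hedged sketch of how a caller might push a set of weights through those ops; network, sess and weights_by_name are assumed to exist in the caller and are not part of the example.

# weights_by_name maps variable names to numpy arrays of matching shapes (assumption)
feed = {
    network.online_actor_parameter_weights_placeholders[name]: value
    for name, value in weights_by_name.items()
}
sess.run(network.online_actor_parameters_assign_ops, feed_dict=feed)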
Code example #18
number_of_unzippers = config['general']['number_of_unzippers']

train = Oversampler(train_data_dir,
                    batch_size,
                    oversample_goal,
                    oversample_collision,
                    number_of_unzippers=number_of_unzippers)
test = Oversampler(test_data_dir,
                   batch_size,
                   oversample_goal,
                   oversample_collision,
                   number_of_unzippers=number_of_unzippers)

# get openrave manager
openrave_manager = OpenraveManager(0.001, PotentialPoint.from_config(config))

# set summaries and saver dir
summaries_dir = os.path.join('reward', 'tensorboard')
train_summary_writer = tf.summary.FileWriter(
    os.path.join(summaries_dir, 'train_' + model_name))
test_summary_writer = tf.summary.FileWriter(
    os.path.join(summaries_dir, 'test_' + model_name))
saver_dir = os.path.join('reward', 'model', model_name)
if not os.path.exists(saver_dir):
    os.makedirs(saver_dir)

# save the config
config_copy_path = os.path.join(saver_dir, 'config.yml')
yaml.dump(config, open(config_copy_path, 'w'))
Code example #19
import tensorflow as tf
import os
import yaml

from openrave_manager import OpenraveManager
from potential_point import PotentialPoint

is_gpu = tf.test.is_gpu_available()
config_path = os.path.join(os.getcwd(), 'config/config.yml')
with open(config_path, 'r') as yml_file:
    config = yaml.load(yml_file)
potential_points = PotentialPoint.from_config(config)
openrave_manager = OpenraveManager(0.01, potential_points)
random_joints = openrave_manager.get_random_joints()

print 'has gpu result {}'.format(is_gpu)
print 'random joints result {}'.format(random_joints)