def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    replay_pool = self.replay_pool = (
        get_replay_pool_from_variant(variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant, training_environment)
    policy = self.policy = get_policy_from_variant(
        variant, training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True

def _build(self): """ called by tune to build algorithm """ variant = copy.deepcopy(self._variant) environment_params = variant['environment_params'] training_environment = self.training_environment = ( get_environment_from_params(environment_params['training'])) mjc_model_environment = self.mjc_model_environment = ( get_environment_from_params(environment_params['training'])) evaluation_environment = self.evaluation_environment = ( get_environment_from_params(environment_params['evaluation']) if 'evaluation' in environment_params else training_environment) replay_pool = self.replay_pool = (get_replay_pool_from_variant( variant, training_environment)) sampler = self.sampler = get_sampler_from_variant(variant) Qs = self.Qs = get_Q_function_from_variant(variant, training_environment) policy = self.policy = get_policy_from_variant(variant, training_environment, Qs, self._session) initial_exploration_policy = self.initial_exploration_policy = ( get_policy('UniformPolicy', training_environment)) #### get termination function domain = environment_params['training']['domain'] static_fns = mbpo.static[domain.lower()] #### #### build algorithm self.algorithm = get_algorithm_from_variant( variant=self._variant, training_environment=training_environment, evaluation_environment=evaluation_environment, mjc_model_environment=mjc_model_environment, policy=policy, initial_exploration_policy=initial_exploration_policy, Qs=Qs, pool=replay_pool, static_fns=static_fns, sampler=sampler, session=self._session) initialize_tf_variables(self._session, only_uninitialized=True) # add graph since ray doesn't seem to automatically add that graph_writer = tf.summary.FileWriter(self.logdir, self._session.graph) graph_writer.flush() graph_writer.close() #### finalize graph # tf.get_default_graph().finalize() ### good for debugging, but interferes with Qs on SAC self._built = True
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])
    self.sampler.seed = variant['run_params']['seed']

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True

def load_policy(path):
    with open(path, "rb") as f:
        checkpoint = pickle.load(f)

    variant = checkpoint["variant"]
    env_params = variant["environment_params"]["training"]
    alice_params = variant["alice"]
    bob_params = variant["bob"]
    num_skills = alice_params["algorithm_params"]["discriminator_params"][
        "num_skills"]

    # bob policy
    env = get_environment_from_params(env_params)
    bob_policy = get_policy_from_variant(bob_params, env)
    bob_policy.set_weights(checkpoint["policy_weights"]["bob"])
    bob_policy._deterministic = True

    # alice policy
    env._observation_space.spaces["diayn"] = gym.spaces.Box(
        low=np.repeat(0, num_skills),
        high=np.repeat(1, num_skills),
    )
    env.observation_keys += ("diayn", )
    alice_policy = get_policy_from_variant(alice_params, env)
    alice_policy.set_weights(checkpoint["policy_weights"]["alice"])
    alice_policy._deterministic = True

    return env, alice_policy, bob_policy, num_skills

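# Usage sketch for load_policy above. The checkpoint path is a placeholder,
# and the skill-conditioning convention (a one-hot 'diayn' vector added to
# the observation) is inferred from the Box space created in load_policy.
env, alice_policy, bob_policy, num_skills = load_policy(
    '/path/to/checkpoint.pkl')  # hypothetical path

# Condition alice on skill 0 by building a one-hot 'diayn' entry.
skill = np.eye(num_skills)[0]
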
def init_policy():
    session = tf.keras.backend.get_session()
    checkpoint_path = CHECKPOINT_PATH.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    environment_params['n_parallel_envs'] = 1
    evaluation_environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    Qs = get_Q_function_from_variant(variant, evaluation_environment)
    for i, Q in enumerate(Qs):
        Qs[i].load_weights(os.path.join(checkpoint_path, 'Qs_{}'.format(i)))

    return policy, Qs

def load_environment(variant):
    # Prefer the evaluation params when present, matching the pattern used
    # elsewhere in this file (the original conditional returned the training
    # params on both branches).
    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    environment = get_environment_from_params(environment_params)
    return environment

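# For reference, a minimal `environment_params` dict of the shape that
# get_environment_from_params consumes; the values mirror the gym-style
# entries used elsewhere in this file and are illustrative only.
example_environment_params = {
    'universe': 'gym',
    'domain': 'HumanoidSafe',
    'task': 'v2',
    'kwargs': {},
}
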
def get_ddl_goal_state_from_variant(variant):
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    if task in ['Valve3PickupFixed-v0']:
        try:
            env_path = os.path.join(
                goal_directory,
                GOAL_PATH_PER_UNIVERSE_DOMAIN_TASK[universe][domain][task])
            pkl_path = os.path.join(env_path, 'positives.pkl')
            with open(pkl_path, 'rb') as f:
                goal_state = pickle.load(f)
        except KeyError:
            raise NotImplementedError
    else:
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][domain]
        gen_func = domain_generators.get(
            task, domain_generators[DEFAULT_TASK_KEY])
        goal_state = gen_func(
            env,
            include_transitions=False,
            num_total_examples=1,
            goal_threshold=0.0)

    goal_state = {key: val[0] for key, val in goal_state.items()}
    return goal_state

def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)
    evaluation_environment.seed(variant['run_params']['seed'])

    if args.record_video:
        video_dir = os.path.join(experiment_path, 'test-video')
        evaluation_environment._env = wrappers.Monitor(
            evaluation_environment._env, video_dir, force=True)

    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

    if not args.record_video:
        evaluation_metrics = evaluate_rollouts(paths, evaluation_environment)
        evaluation_file_path = os.path.join(experiment_path, 'final_eval.csv')
        with open(evaluation_file_path, 'w') as f:
            w = csv.DictWriter(f, evaluation_metrics.keys())
            w.writeheader()
            w.writerow(evaluation_metrics)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 1 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths

def build(self):
    environment_params = self.variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    replay_pool = self.replay_pool = (
        get_replay_pool_from_variant(self.variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(self.variant)
    Qs = self.Qs = get_Q_function_from_variant(
        self.variant, training_environment)
    policy = self.policy = get_policy_from_variant(
        self.variant, training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = static[domain.lower()]
    ####

    log_path = './log/%s' % (self.variant['algorithm_params']['domain'])
    if not os.path.exists(log_path):
        os.makedirs(log_path)

    self.algorithm = get_algorithm_from_variant(
        variant=self.variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=self._session,
        log_file='./log/%s/%d.log' % (
            self.variant['algorithm_params']['domain'], time.time()))

    initialize_tf_variables(self._session, only_uninitialized=True)

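# A minimal sketch of what an entry in the `static` termination-function
# registry typically provides under the MBPO convention (a per-domain
# StaticFns class); the "never terminate" rule shown here is an assumption
# for illustration, not taken from this file.
class StaticFns:
    @staticmethod
    def termination_fn(obs, act, next_obs):
        # Batched inputs, one row per transition; returns a (batch, 1)
        # boolean array of done flags.
        done = np.zeros((obs.shape[0], 1), dtype=bool)
        return done
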
def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    seed = variant['run_params']['seed']
    training_environment.seed(seed)
    # Set a different seed for the evaluation env to ensure the policy is
    # not just memorizing action sequences for seen initial states.
    evaluation_environment.seed(seed + 10)

    replay_pool = self.replay_pool = (
        get_replay_pool_from_variant(variant, training_environment))
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant, training_environment)
    policy = self.policy = get_policy_from_variant(
        variant, training_environment, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', training_environment))

    self.algorithm = get_algorithm_from_variant(
        variant=self._variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        sampler=sampler,
        session=self._session)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True

def load_policy_and_environment(picklable, variant):
    # Prefer the evaluation params when present, matching the pattern used
    # elsewhere in this file (the original conditional returned the training
    # params on both branches).
    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, environment)
    policy.set_weights(picklable['policy_weights'])

    return policy, environment

def load_policy(fpath, itr='last', deterministic=False):
    # handle which epoch to load from
    if itr == 'last':
        saves = [int(x[11:]) for x in os.listdir(fpath)
                 if 'simple_save' in x and len(x) > 11]
        itr = '%d' % max(saves) if len(saves) > 0 else ''
    else:
        itr = '%d' % itr

    # load the things!
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(sess)
    sess = tf.keras.backend.get_session()
    saver = Saver()
    model = saver.restore_tf_graph(sess, fpath)

    # get the correct op for executing actions
    if deterministic and 'mu' in model.keys():
        # 'deterministic' is only a valid option for SAC policies
        print('Using deterministic action op.')
        action_op = model['mu']
    else:
        print('Using default action op.')
        action_op = model['pi']

    # make function for producing an action given a single state
    get_action = lambda x: sess.run(action_op, feed_dict={model['x']: x})

    # try to load environment from save
    # (sometimes this will fail because the environment could not be pickled)
    try:
        state = joblib.load(osp.join(fpath, 'vars' + itr + '.pkl'))
        env = state['env']
    except Exception:
        environment_params = {
            'universe': 'gym',
            'task': 'v2',
            'domain': 'HumanoidSafe',
            'kwargs': {},
        }
        env = get_environment_from_params(environment_params)

    return env, get_action, sess

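# Usage sketch for load_policy above (path and horizon are placeholders).
# The returned get_action feeds its argument straight into model['x'], so
# the caller adds and strips a batch dimension of one.
env, get_action, sess = load_policy('/path/to/run', deterministic=True)
obs = env.reset()
for _ in range(1000):
    action = get_action(obs[None])[0]
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
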
def get_ddl_goal_state_from_variant(variant):
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][domain]
    gen_func = domain_generators.get(task, domain_generators[DEFAULT_TASK_KEY])
    goal_state = gen_func(
        env,
        include_transitions=False,
        num_total_examples=1,
        goal_threshold=0.0)
    goal_state = {key: val[0] for key, val in goal_state.items()}
    return goal_state

def get_policy(checkpoint_path):
    checkpoint_path = checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, evaluation_environment, Qs=[None])
    training_environment = get_environment_from_params_custom(environment_params)

    return policy, training_environment

def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.json')
    with open(variant_path, 'r') as f:
        variant = json.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, evaluation_environment, Qs=[None])
    policy.set_weights(picklable['policy_weights'])

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_mode=args.render_mode)

    #### print rewards
    rewards = [path['rewards'].sum() for path in paths]
    print('Rewards: {}'.format(rewards))
    print('Mean: {}'.format(np.mean(rewards)))
    ####

    if args.render_mode != 'human':
        import pdb
        pdb.set_trace()

    return paths

def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])
    evaluation_environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        for i, path in enumerate(paths):
            video_save_dir = os.path.expanduser('/tmp/simulate_policy/')
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.avi')
            save_video(path['images'], video_save_path)

    return paths

def _build(self):
    variant = copy.deepcopy(self._variant)

    env = self.env = get_environment_from_params(
        variant['environment_params']['training'])
    replay_pool = self.replay_pool = (
        get_replay_pool_from_variant(variant, env))
    sampler = self.sampler = get_sampler_from_variant(variant)
    Qs = self.Qs = get_Q_function_from_variant(variant, env)
    policy = self.policy = get_policy_from_variant(variant, env, Qs)
    initial_exploration_policy = self.initial_exploration_policy = (
        get_policy('UniformPolicy', env))

    algorithm_kwargs = {
        'variant': self._variant,
        'env': self.env,
        'policy': policy,
        'initial_exploration_policy': initial_exploration_policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
        'session': self._session,
    }

    if self._variant['algorithm_params']['type'] in CLASSIFIER_RL_ALGS:
        reward_classifier = self.reward_classifier = (
            get_reward_classifier_from_variant(self._variant, env))
        algorithm_kwargs['classifier'] = reward_classifier

        goal_examples_train, goal_examples_validation = (
            get_goal_example_from_variant(variant))
        algorithm_kwargs['goal_examples'] = goal_examples_train
        algorithm_kwargs['goal_examples_validation'] = (
            goal_examples_validation)

    self.algorithm = get_algorithm_from_variant(**algorithm_kwargs)

    initialize_tf_variables(self._session, only_uninitialized=True)

    self._built = True

def get_goal_transitions_from_variant(variant):
    """Returns SQIL goal transitions (s, a, s', r = 1)."""
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    try:
        # TODO: Add goal generation kwargs (goal threshold, etc.)
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][domain]
        gen_func = domain_generators.get(
            task, domain_generators[DEFAULT_TASK_KEY])
        goal_transitions = gen_func(env, include_transitions=True)
    except KeyError:
        raise NotImplementedError

    return goal_transitions

def main():
    import sys
    example_args = get_parser().parse_args(sys.argv[1:])

    variant_spec = get_variant_spec(example_args)
    command_line_args = example_args
    print('variant spec: {}'.format(variant_spec))

    params = variant_spec.get('algorithm_params')
    local_dir = os.path.join(params.get('log_dir'), params.get('domain'))

    resources_per_trial = _normalize_trial_resources(
        command_line_args.resources_per_trial,
        command_line_args.trial_cpus,
        command_line_args.trial_gpus,
        command_line_args.trial_extra_cpus,
        command_line_args.trial_extra_gpus)
    experiment_id = params.get('exp_name')

    #### add pool_load_max_size to experiment_id
    if 'pool_load_max_size' in variant_spec['algorithm_params']['kwargs']:
        max_size = variant_spec['algorithm_params']['kwargs'][
            'pool_load_max_size']
        experiment_id = '{}_{}e3'.format(experiment_id, int(max_size / 1000))
    ####

    variant_spec = add_command_line_args_to_variant_spec(
        variant_spec, command_line_args)

    if command_line_args.video_save_frequency is not None:
        assert 'algorithm_params' in variant_spec
        variant_spec['algorithm_params']['kwargs']['video_save_frequency'] = (
            command_line_args.video_save_frequency)

    variant = variant_spec

    # init
    set_seed(variant['run_params']['seed'])
    gpu_options = tf.GPUOptions(allow_growth=True)
    session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(session)

    # build
    variant = copy.deepcopy(variant)
    tester.set_hyper_param(**variant)
    tester.add_record_param(['run_params.seed', 'info'])
    tester.configure(task_name='policy_learn',
                     private_config_path=os.path.join(
                         get_package_path(), 'rla_config.yaml'),
                     run_file='main.py',
                     log_root=get_package_path())
    tester.log_files_gen()
    tester.print_args()

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(
        environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'](variant))
        if 'evaluation' in environment_params
        else training_environment)

    replay_pool = get_replay_pool_from_variant(variant, training_environment)
    sampler = get_sampler_from_variant(variant)
    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment, Qs)
    initial_exploration_policy = get_policy(
        'UniformPolicy', training_environment)

    #### get termination function
    domain = environment_params['training']['domain']
    static_fns = mopo.static[domain.lower()]
    ####

    print("[ DEBUG ] KWARGS: {}".format(variant['algorithm_params']['kwargs']))

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        initial_exploration_policy=initial_exploration_policy,
        Qs=Qs,
        pool=replay_pool,
        static_fns=static_fns,
        sampler=sampler,
        session=session)
    print('[ DEBUG ] finished constructing the model, start training')

    # train
    list(algorithm.train())

def _build(self):
    '''
    Each variant['something params'] entry holds the construction parameters
    for `something`; it contains variant['something params']['class_name']
    and variant['something params']['config']. Together these two entries
    are enough to instantiate the object.
    '''
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    # Instantiate the Q-functions (neural networks) from their config.
    Qs = self.Qs = tree.flatten(value_functions.get(variant['Q_params']))

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    # As with value_functions.get, instantiate from the config.
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    # Build the lower-level objects from their configs in `variant`, then
    # write them into the higher-level config.
    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    # Build the higher-level object from that config.
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    # Again: fill the top-level config with the lower-level objects...
    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    # ...and build the top-level object: the RL algorithm that ties all of
    # the components together.
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True

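# A minimal sketch of the `variant` convention documented in the docstring
# above: every '*_params' entry pairs a 'class_name' with a 'config' that
# the matching `.get(...)` factory consumes. The class names and config
# values here are illustrative assumptions, not taken from this file.
example_variant = {
    'Q_params': {
        'class_name': 'double_feedforward_Q_function',
        'config': {'hidden_layer_sizes': (256, 256)},
    },
    'policy_params': {
        'class_name': 'FeedforwardGaussianPolicy',
        'config': {'hidden_layer_sizes': (256, 256)},
    },
}
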
from softlearning.environments.utils import get_environment_from_params
import gym

params = {
    'universe': 'gym',
    'domain': 'Point2D',
    'task': 'Fixed-v0',
    'kwargs': {
        'normalize': False,
        'init_pos_range': ((0, 0), (0, 0)),
        'target_pos_range': ((-2, -2), (2, 2)),
        'observation_keys': ('state_observation', 'state_desired_goal'),
    }
}
env = get_environment_from_params(params)

# for _ in range(100):
env.reset()
for _ in range(10):
    env.step(env.action_space.sample())
    env.render()

def main(variant_in):
    variant = copy.deepcopy(variant_in)

    environment_params = variant['environment_params']
    training_environment = get_environment_from_params(
        environment_params['training'])
    evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = replay_pools.get(variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = samplers.get(variant['sampler_params'])

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    print("Initialization finished")

    # Iterate through 'n_epochs' epochs. Each epoch samples 'epoch_length'
    # steps into the pool (resets are not counted) and trains on every step
    # once the replay pool holds more than 'min_pool_size' samples.
    train_generator = None
    for i in count():
        if train_generator is None:
            train_generator = algorithm.train()
        diagnostics = next(train_generator)

        # Check 'done' before printing to avoid a double print on the last
        # epoch.
        if diagnostics.get('done'):
            break

        eval_reward = diagnostics["evaluation"]["episode-reward-mean"]
        print(f"Evaluation: reward mean is {eval_reward}")
        # train_reward = diagnostics["training"]["episode-reward-mean"]
        # print(f"Training: reward mean is {train_reward}")

    print("Finished")
    return policy

def get_goal_example_from_variant(variant):
    train_env_params = variant['environment_params']['training']
    env = get_environment_from_params(train_env_params)

    total_goal_examples = (
        variant['data_params']['n_goal_examples']
        + variant['data_params']['n_goal_examples_validation_max'])

    universe = train_env_params['universe']
    domain = train_env_params['domain']
    task = train_env_params['task']

    if task in DOOR_TASKS:
        goal_examples = generate_door_goal_examples(total_goal_examples, env)
    elif task in PUSH_TASKS:
        goal_examples = generate_push_goal_examples(total_goal_examples, env)
    elif task in PICK_TASKS:
        goal_examples = generate_pick_goal_examples(
            total_goal_examples, env, variant['task'])
    elif SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK.get(universe, {}).get(domain, None):
        domain_generators = SUPPORTED_ENVS_UNIVERSE_DOMAIN_TASK[universe][domain]
        gen_func = domain_generators.get(
            task, domain_generators[DEFAULT_TASK_KEY])
        include_transitions = (
            variant['algorithm_params']['type'] == 'VICEDynamicsAware')
        goal_examples = gen_func(
            env,
            include_transitions=include_transitions,
            num_total_examples=total_goal_examples)
    else:
        try:
            env_path = os.path.join(
                goal_directory,
                GOAL_PATH_PER_UNIVERSE_DOMAIN_TASK[universe][domain][task])
            pkl_path = os.path.join(env_path, 'positives.pkl')
            with open(pkl_path, 'rb') as f:
                goal_examples = pickle.load(f)
        except KeyError:
            raise NotImplementedError

    n_goal_examples = variant['data_params']['n_goal_examples']
    # total_samples = len(goal_examples[next(iter(goal_examples))])

    # Shuffle the goal images before assigning training/validation
    shuffle = np.random.permutation(total_goal_examples)
    train_indices = shuffle[:n_goal_examples]
    valid_indices = shuffle[n_goal_examples:]

    goal_examples_train = dict([
        (key, {obs_key: value[obs_key][train_indices] for obs_key in value})
        if isinstance(value, dict)
        else (key, value[train_indices])
        for key, value in goal_examples.items()
    ])
    goal_examples_validation = dict([
        (key, {obs_key: value[obs_key][valid_indices] for obs_key in value})
        if isinstance(value, dict)
        else (key, value[valid_indices])
        for key, value in goal_examples.items()
    ])

    return goal_examples_train, goal_examples_validation

def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    with session.as_default():
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)

    environment_params = (
        variant['environment_params']['evaluation']
        if 'evaluation' in variant['environment_params']
        else variant['environment_params']['training'])

    if args.use_state_estimator:
        environment_params['kwargs'].update({
            'pixel_wrapper_kwargs': {
                'pixels_only': False,
                'normalize': False,
                'render_kwargs': {
                    'width': 32,
                    'height': 32,
                    'camera_id': -1,
                },
            },
            'camera_settings': {
                'azimuth': 180,
                'distance': 0.35,
                'elevation': -55,
                'lookat': (0, 0, 0.03),
            },
        })

    evaluation_environment = get_environment_from_params(environment_params)

    policy = get_policy_from_variant(variant, evaluation_environment)
    policy.set_weights(picklable['policy_weights'])

    dump_path = os.path.join(checkpoint_path, 'policy_params.pkl')
    with open(dump_path, 'wb') as f:
        pickle.dump(picklable['policy_weights'], f)

    render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

    from softlearning.preprocessors.utils import get_state_estimator_preprocessor
    state_estimator = get_state_estimator_preprocessor(
        state_estimator_path='/home/justinvyu/dev/softlearning-vice/softlearning/models/state_estimators/state_estimator_fixed_antialias.h5',
        num_hidden_units=256,
        num_hidden_layers=2)
    sampler_kwargs = {
        'state_estimator': state_estimator,
        'replace_state': True,
    }

    with policy.set_deterministic(args.deterministic):
        paths = rollouts(args.num_rollouts,
                         evaluation_environment,
                         policy,
                         path_length=args.max_path_length,
                         render_kwargs=render_kwargs,
                         sampler_kwargs=sampler_kwargs)

    if args.render_kwargs.get('mode') == 'rgb_array':
        fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
        for i, path in enumerate(paths):
            video_save_dir = args.checkpoint_path
            video_save_path = os.path.join(video_save_dir, f'episode_{i}.mp4')
            save_video(path['images'], video_save_path, fps=fps)

    return paths

def simulate_policy(args):
    session = tf.keras.backend.get_session()
    checkpoint_path = args.checkpoint_path.rstrip('/')
    experiment_path = os.path.dirname(checkpoint_path)

    variant_path = os.path.join(experiment_path, 'params.pkl')
    with open(variant_path, 'rb') as f:
        variant = pickle.load(f)

    checkpoint_paths = [
        checkpoint_dir for checkpoint_dir in sorted(
            glob.iglob(os.path.join(experiment_path, 'checkpoint_*')),
            key=lambda d: float(d.split("checkpoint_")[1]))
    ]

    dump_dir = os.path.join(experiment_path, 'evaluations/')
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)

    all_paths = []
    for checkpoint_dir in checkpoint_paths[::2]:
        with session.as_default():
            pickle_path = os.path.join(checkpoint_dir, 'checkpoint.pkl')
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)

        environment_params = (
            variant['environment_params']['evaluation']
            if 'evaluation' in variant['environment_params']
            else variant['environment_params']['training'])
        environment_params['kwargs']['device_path'] = '/dev/ttyUSB0'
        environment_params['kwargs']['camera_config'] = {
            'topic': '/kinect2_001144463747/qhd/image_color',
            'image_shape': (256, 256, 3),
        }
        environment_params['kwargs']['init_pos_range'] = list(
            (np.array([0, -np.pi / 4, -np.pi / 2, -3 * np.pi / 4, -np.pi,
                       np.pi / 4, np.pi / 2, np.pi * 3 / 4])
             + (-75 * np.pi / 180)) % (2 * np.pi) - np.pi)
        environment_params['kwargs']['target_pos_range'] = [-75 * np.pi / 180]
        environment_params['kwargs']['cycle_inits'] = True

        evaluation_environment = get_environment_from_params(environment_params)

        policy = get_policy_from_variant(variant, evaluation_environment)
        policy_weights = picklable['policy_weights']
        if variant['algorithm_params']['type'] in ('MultiSAC', 'MultiVICEGAN'):
            policy_weights = policy_weights[0]
        policy.set_weights(policy_weights)

        render_kwargs = {**DEFAULT_RENDER_KWARGS, **args.render_kwargs}

        with policy.set_deterministic(args.deterministic):
            paths = rollouts(args.num_rollouts,
                             evaluation_environment,
                             policy,
                             path_length=args.max_path_length,
                             render_kwargs=render_kwargs)

        if render_kwargs.get('mode') == 'rgb_array':
            fps = 2 // getattr(evaluation_environment, 'dt', 1 / 30)
            for i, path in enumerate(paths):
                video_save_path = os.path.join(checkpoint_dir, f'episode_{i}.mp4')
                save_video(path['images'], video_save_path, fps=fps)

        all_paths.append(paths)

    with open(os.path.join(dump_dir, 'evaluation_paths.pkl'), 'wb') as f:
        pickle.dump(all_paths, f)

    return paths

def _build(self):
    variant = copy.deepcopy(self._variant)

    environment_params = variant['environment_params']
    training_environment = self.training_environment = (
        get_environment_from_params(environment_params['training']))
    evaluation_environment = self.evaluation_environment = (
        get_environment_from_params(environment_params['evaluation'])
        if 'evaluation' in environment_params
        else training_environment)

    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape),
    })
    Qs = self.Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = self.policy = policies.get(variant['policy_params'])

    variant['replay_pool_params']['config'].update({
        'environment': training_environment,
    })
    replay_pool = self.replay_pool = replay_pools.get(
        variant['replay_pool_params'])

    variant['sampler_params']['config'].update({
        'environment': training_environment,
        'policy': policy,
        'pool': replay_pool,
    })
    sampler = self.sampler = samplers.get(variant['sampler_params'])

    set_random_seed(variant['run_params']['seed'])

    save_path = os.path.join(
        os.path.dirname(__file__), "..", "..", "results", "logs", "sac",
        f"HalfCheetahBulletEnv-v0_{variant['run_params']['seed']}")
    print("this is the save path: " + save_path)
    os.makedirs(save_path, exist_ok=True)

    # create wrapped environment
    eval_env_wrapped = TimeLimit(evaluation_environment, 1000)
    eval_callback = EvalCallback(
        eval_env_wrapped,
        callback_on_new_best=None,
        best_model_save_path=None,
        n_eval_episodes=10,
        log_path=save_path,
        eval_freq=10000,  # TODO: change hardcoded value
        deterministic=True,
        verbose=1,
    )
    eval_callback.init_callback(policy)
    sampler.set_callback(eval_callback)

    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': replay_pool,
        'sampler': sampler,
    })
    self.algorithm = algorithms.get(variant['algorithm_params'])

    self._built = True