def main(policy_file, seed, n_test_rollouts, render):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()
    for _ in range(n_test_rollouts):
        evaluator.generate_rollouts()

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)

def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval,
           clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    # Added the following line
    dims = config.configure_dims(params)
    params['replay_k'] = config.DEFAULT_PARAMS['replay_k']
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)

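# Hedged illustration (not part of any launcher above): the per-rank seeding scheme that these
# launch()/learn() variants share. Each MPI worker offsets the base seed by a large multiple of its
# rank, so workers collect different rollouts while a given (seed, rank) pair stays reproducible.
# The example values below are assumptions for demonstration only.
def rank_seed_for(base_seed: int, rank: int) -> int:
    # Same arithmetic as `rank_seed = seed + 1000000 * rank` used throughout these scripts.
    return base_seed + 1000000 * rank

assert [rank_seed_for(0, r) for r in range(3)] == [0, 1000000, 2000000]
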
def launch(env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           binding, logging, version, n_cycles, note, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if logging:
        logdir = 'logs/' + str(env_name) + '-replay_strategy' + str(replay_strategy) + \
                 '-n_epochs' + str(n_epochs) + '-num_cpu' + str(num_cpu) + '-seed' + str(seed) + \
                 '-n_cycles' + str(n_cycles) + '-version' + str(version) + '-T-' + \
                 datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    else:
        logdir = osp.join(tempfile.gettempdir(),
                          datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()  # use temp folder for other rank
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu
    params['version'] = version
    params['n_cycles'] = n_cycles
    params['num_cpu'] = num_cpu
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
        params.update(**override_params)

    if params['load_weight']:
        if type(params['load_weight']) is list:
            params['load_weight'] = params['load_weight'][seed]
        base = os.path.splitext(params['load_weight'])[0]
        policy_weight_file = open(base + '_weight.pkl', 'rb')
        pretrain_weights = pickle.load(policy_weight_file)
        policy_weight_file.close()
    else:
        pretrain_weights = None

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, pretrain_weights=pretrain_weights,
                                   clip_return=clip_return)

    render = False
    if params['collect_video']:
        render = 'rgb_array'

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'render': render,
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          num_cpu=num_cpu, collect_data=params['collect_data'],
          collect_video=params['collect_video'], goal_generation=params['goal_generation'],
          num_skills=params['num_skills'], use_skill_n=params['use_skill_n'],
          batch_size=params['_batch_size'], mi_r_scale=params['mi_r_scale'],
          mi_end_epoch=params['mi_end_epoch'], sk_r_scale=params['sk_r_scale'],
          no_train_mi=params['no_train_mi'])

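# Hedged sketch of the log-directory naming used by the launcher above: when logging is enabled the
# run directory encodes the main hyperparameters plus a timestamp, otherwise a throwaway directory
# under the system temp dir is used. The default argument values here are assumptions for illustration.
import datetime
import os.path as osp
import tempfile

def example_logdir(env_name='FetchPickAndPlace-v1', replay_strategy='future', n_epochs=50,
                   num_cpu=1, seed=0, n_cycles=50, version=0, logging=True):
    if logging:
        return ('logs/' + env_name + '-replay_strategy' + str(replay_strategy) +
                '-n_epochs' + str(n_epochs) + '-num_cpu' + str(num_cpu) + '-seed' + str(seed) +
                '-n_cycles' + str(n_cycles) + '-version' + str(version) + '-T-' +
                datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    return osp.join(tempfile.gettempdir(),
                    datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

print(example_logdir())  # e.g. logs/FetchPickAndPlace-v1-replay_strategyfuture-...-T-2018-01-01-00-00-00
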
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    def make_env():
        return PnPEnv()

    env = make_env()
    test_env = make_env()
    env.reset()
    # for _ in range(1000):
    #     env.render()
    #     import pdb; pdb.set_trace()
    #     env.step(env.action_space.sample())

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v['batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v['n_test_rollouts']  # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']
    params['scope'] = 'ddpg3'

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']

    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']
    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)

    # prepare GAIL
    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['o'], dims['g'], 0.,
                             gail_loss=v['gail_reward'], use_s_p=True, only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['u'], dims['g'], 0.,
                             gail_loss=v['gail_reward'], only_s=v['only_s'])

    params['discriminator'] = discriminator

    # configure replay buffer for expert buffer
    params_expert = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_expert['replay_strategy'] = 'future' if v['relabel_expert'] else 'none'

    params_policy_buffer = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_policy_buffer['replay_strategy'] = 'future'

    params_empty = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'replay_strategy']}

    policy = config.configure_ddpg(dims=dims, params=params, clip_return=v['clip_return'],
                                   reuse=tf.AUTO_REUSE, env=env, to_goal=v['to_goal'])

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker([env], policy, dims, logger_b, **rollout_params)
    # rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker([env], policy, dims, logger_b, **eval_params)
    # evaluator.seed(rank_seed)

    n_traj = v['n_evaluation_traj']

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()
    inner_log_dir = osp.join(log_dir, 'inner_iters')
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)
    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    logger.log("Starting the outer iterations")

    logger.log("Generating heat map")

    def evaluate_pnp(env, policy, n_rollouts=100):
        goal_reached = []
        distance_to_goal = []
        for i in range(n_rollouts):
            traj = rollout(env, policy, max_path_length=v['horizon'], using_gym=True)
            goal_reached.append(np.max(traj['env_infos']['goal_reached']))
            distance_to_goal.append(np.min(traj['env_infos']['distance']))
        return np.mean(goal_reached), np.mean(distance_to_goal)

    from sandbox.experiments.goals.pick_n_place.pnp_expert import PnPExpert
    expert_policy = PnPExpert(env)

    expert_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': False,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate'],
        'to_goal': v['to_goal']
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        expert_params[name] = params[name]

    expert_params['noise_eps'] = v['expert_noise']
    expert_params['random_eps'] = v['expert_eps']

    expert_worker = RolloutWorker([env], expert_policy, dims, logger_b, **expert_params)

    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {key: (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], 3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['ag'] = (v['horizon'] + 1, 3 if not v['full_space_as_goal'] else 6)
    buffer_shapes['successes'] = (v['horizon'],)
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)

    for _ in range(v['num_demos']):
        # rollout is generated by expert policy
        episode = expert_worker.generate_rollouts(slice_goal=(3, 6) if v['full_space_as_goal'] else None)
        # and is stored into the current expert buffer
        expert_buffer.store_episode(episode)

    # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    max_success, min_distance = evaluate_pnp(env, policy)

    outer_iter = 0
    logger.record_tabular("Outer_iter", outer_iter)
    logger.record_tabular("Outer_Success", max_success)
    logger.record_tabular("MinDisToGoal", min_distance)
    logger.dump_tabular()

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(inner_log_dir, outer_iter, snapshot_mode='last', hold_outter_log=True):
            train(policy, discriminator, rollout_worker, v['inner_iters'], v['n_cycles'], v['n_batches'],
                  v['n_batches_dis'], policy.buffer, expert_buffer,
                  empty_buffer=empty_buffer if v['on_policy_dis'] else None,
                  num_rollouts=v['num_rollouts'],
                  feasible_states=feasible_states if v['query_expert'] else None,
                  expert_policy=expert_policy if v['query_expert'] else None,
                  agent_policy=policy if v['query_agent'] else None,
                  train_dis_per_rollout=v['train_dis_per_rollout'],
                  noise_expert=v['noise_dis_agent'],
                  noise_agent=v['noise_dis_expert'],
                  sample_transitions_relabel=sample_transitions_relabel if v['relabel_for_policy'] else None,
                  outer_iter=outer_iter,
                  annealing_coeff=v['annealing_coeff'],
                  q_annealing=v['q_annealing'])

        print("evaluating policy performance")

        logger.log("Generating heat map")

        success, min_distance = evaluate_pnp(env, policy)
        logger.record_tabular("Outer_iter", outer_iter)
        logger.record_tabular("Outer_Success", max_success)
        logger.record_tabular("MinDisToGoal", min_distance)
        logger.dump_tabular()

        if success > max_success:
            print("% f >= %f, saving policy to params_best" % (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
            max_success = success

        report.save()
        report.new_row()

def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    if v['control_mode'] == 'linear':
        from sandbox.envs.maze.point_maze_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
    elif v['control_mode'] == 'pos':
        from sandbox.envs.maze.point_maze_pos_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'], control_mode=v['control_mode']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    feasible_states = sample_unif_feas(inner_env, 10)
    if v['unif_starts']:
        starts = np.random.permutation(np.array(feasible_states))[:300]
    else:
        starts = np.array([[0, 0]])

    uniform_start_generator = UniformListStateGenerator(
        starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
    )

    # Prepare params.
    def make_env(inner_env=inner_env, terminal_eps=v['terminal_eps'], terminate_env=v['terminate_env']):
        return GoalStartExplorationEnv(
            env=inner_env,
            goal_generator=uniform_goal_generator,
            obs2goal_transform=lambda x: x[:v['goal_size']],
            start_generator=uniform_start_generator,
            obs2start_transform=lambda x: x[:v['goal_size']],
            terminal_eps=terminal_eps,
            distance_metric=v['distance_metric'],
            extend_dist_rew=v['extend_dist_rew'],
            only_feasible=v['only_feasible'],
            terminate_env=terminate_env,
            goal_weight=v['goal_weight'],
            inner_weight=0,
            append_goal_to_observation=False
        )

    env = make_env()
    test_env = make_env(inner_env=inner_env_test, terminal_eps=1., terminate_env=True)

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v['batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v['n_test_rollouts']  # changed from 10 to 3 # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']
    params['nearby_action_penalty'] = v['nearby_action_penalty']
    params['nearby_penalty_weight'] = v['nearby_penalty_weight']
    params['nearby_p'] = v['nearby_p']
    params['perturb_scale'] = v['perturb_scale']
    params['cells_apart'] = v['cells_apart']
    params['perturb_to_feasible'] = v['perturb_to_feasible']

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']

    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)

    # prepare GAIL
    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['o'], dims['g'], 0.,
                             gail_loss=v['gail_reward'], use_s_p=True, only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['u'], dims['g'], 0.,
                             gail_loss=v['gail_reward'], only_s=v['only_s'])

    params['discriminator'] = discriminator

    # configure replay buffer for expert buffer
    params_expert = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_expert['replay_strategy'] = 'future' if v['relabel_expert'] else 'none'
    params_expert['sample_g_first'] = v['relabel_expert'] and v['sample_g_first']
    params_expert['zero_action_p'] = v['zero_action_p']

    params_policy_buffer = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight', 'two_rs', 'with_termination']}
    params_policy_buffer['replay_strategy'] = 'future'
    params_policy_buffer['sample_g_first'] = False

    policy = config.configure_ddpg(dims=dims, params=params, clip_return=v['clip_return'], reuse=tf.AUTO_REUSE, env=env)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    expert_rollout_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        expert_rollout_params[name] = params[name]

    expert_rollout_params['noise_eps'] = v['expert_noise']
    expert_rollout_params['random_eps'] = v['expert_eps']

    rollout_worker = RolloutWorker([env], policy, dims, logger_b, **rollout_params)

    # prepare expert policy, rollout worker
    import joblib
    if v['expert_policy'] == 'planner':
        from sandbox.experiments.goals.maze.expert.maze_expert import MazeExpert
        expert_policy = MazeExpert(inner_env, step_size=0.2)
    else:
        expert_policy = joblib.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), v['expert_policy']))['policy']

    expert_rollout_worker = RolloutWorker([env], expert_policy, dims, logger_b, **expert_rollout_params)

    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {key: (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], 2)
    buffer_shapes['ag'] = (v['horizon'] + 1, 2)
    buffer_shapes['successes'] = (v['horizon'],)
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)
    normal_sample_transitions = policy.sample_transitions
    empty_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], normal_sample_transitions)

    if not v['query_expert'] or not 'gail' in v['mode']:
        for i in range(v['num_demos']):
            # rollout is generated by expert policy
            episode = expert_rollout_worker.generate_rollouts(reset=not v['no_resets'])
            # and is stored into the expert buffer
            expert_buffer.store_episode(episode)
            if i <= 20:
                path_length = np.argmax(episode['info_goal_reached'][0])
                path_length = v['horizon'] - 1 if path_length == 0 else path_length
                plot_path(episode['o'][0][:path_length], report=report, obs=True, goal=episode['g'][0][0],
                          limit=v['goal_range'], center=v['goal_center'])
        report.new_row()

    # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')

    def evaluate_performance(env):
        four_rooms = np.array([[-2, -2], [-13, -13]])
        if v['unif_starts']:
            mean_rewards, successes = [], []
            for pos in four_rooms:
                env.update_start_generator(FixedStateGenerator(np.array(pos)))
                mr, scs = test_and_plot_policy(policy, env, horizon=v['horizon'], max_reward=v['max_reward'],
                                               sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter,
                                               report=report, limit=v['goal_range'], center=v['goal_center'],
                                               using_gym=True, noise=v['action_noise'], n_processes=8, log=False)
                mean_rewards.append(mr)
                successes.append(scs)
            with logger.tabular_prefix('Outer_'):
                logger.record_tabular('iter', outer_iter)
                logger.record_tabular('MeanRewards', np.mean(mean_rewards))
                logger.record_tabular('Success', np.mean(successes))
        else:
            env.update_start_generator(FixedStateGenerator(np.array([0, 0])))
            _, scs = test_and_plot_policy(policy, env, horizon=v['horizon'], max_reward=v['max_reward'],
                                          sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter,
                                          report=report, limit=v['goal_range'], center=v['goal_center'],
                                          using_gym=True, noise=v['action_noise'], n_processes=8)
        report.new_row()
        env.update_start_generator(uniform_start_generator)
        return scs

    logger.dump_tabular(with_prefix=False)

    import cloudpickle
    max_success = 0.

    if not v['query_expert'] and v['num_demos'] > 0:
        if not v['relabel_expert']:
            goals = goals_filtered = expert_buffer.buffers['g'][:v['num_demos'], 0, :]
        else:
            # collect all states visited by the expert
            goals = None
            for i in range(v['num_demos']):
                terminate_index = np.argmax(expert_buffer.buffers['successes'][i])
                if np.logical_not(np.any(expert_buffer.buffers['successes'][i])):
                    terminate_index = v['horizon']
                cur_goals = expert_buffer.buffers['o'][i, :terminate_index, :2]
                if goals is None:
                    goals = cur_goals
                else:
                    goals = np.concatenate([goals, cur_goals])
            goal_state_collection = StateCollection(distance_threshold=v['coll_eps'])
            goal_state_collection.append(goals)
            goals_filtered = goal_state_collection.states
    else:
        goals_filtered = goals = np.random.permutation(np.array(feasible_states))[:300]

    if v['agent_state_as_goal']:
        goals = goals
    else:
        feasible_states = sample_unif_feas(inner_env, 10)
        goals = np.random.permutation(np.array(feasible_states))[:300]

    evaluate_performance(test_env)
    logger.dump_tabular(with_prefix=False)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            if v['unif_goals']:
                env.update_goal_generator(
                    UniformListStateGenerator(
                        goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
                    )
                )
            else:
                env.update_goal_generator(FixedStateGenerator(v['final_goal']))

            logger.log("Training the algorithm")
            train(policy, discriminator, rollout_worker, v['inner_iters'], v['n_cycles'], v['n_batches'],
                  v['n_batches_dis'], policy.buffer, expert_buffer,
                  empty_buffer=empty_buffer if v['on_policy_dis'] else None,
                  num_rollouts=v['num_rollouts'],
                  reset=not v['no_resets'],
                  feasible_states=feasible_states if v['query_expert'] else None,
                  expert_policy=expert_policy if v['query_expert'] else None,
                  agent_policy=policy if v['query_agent'] else None,
                  train_dis_per_rollout=v['train_dis_per_rollout'],
                  noise_expert=v['noise_dis_agent'],
                  noise_agent=v['noise_dis_expert'],
                  sample_transitions_relabel=sample_transitions_relabel if v['relabel_for_policy'] else None,
                  q_annealing=v['q_annealing'],
                  outer_iter=outer_iter,
                  annealing_coeff=v['annealing_coeff'])

        # logger.record_tabular('NonZeroRewProp', nonzeros)
        logger.log('Generating the Heatmap...')
        success = evaluate_performance(test_env)

        if success > max_success:
            print("% f >= %f, saving policy to params_best" % (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
            max_success = success

        report.new_row()
        logger.dump_tabular(with_prefix=False)

def learn(*, network, env, total_timesteps, seed=None, eval_env=None, replay_strategy='future',
          policy_save_interval=5, clip_return=True, demo_file=None, override_params=None,
          load_path=None, save_path=None, params=None, **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = {
        # env
        'max_u': 1.,  # max absolute value of actions on different coordinates
        # ddpg
        'layers': 3,  # number of layers in the critic/actor networks
        'hidden': 256,  # number of neurons in each hidden layers
        'network_class': 'baselines.her.actor_critic:ActorCritic',
        'Q_lr': 0.001,  # critic learning rate
        'pi_lr': 0.001,  # actor learning rate
        'buffer_size': int(1E6),  # for experience replay
        'polyak': 0.95,  # polyak averaging coefficient
        'action_l2': 1.0,  # quadratic penalty on actions (before rescaling by max_u)
        'clip_obs': 200.,
        'scope': 'ddpg',  # can be tweaked for testing
        'relative_goals': False,
        # training
        'n_cycles': 50,  # per epoch
        'rollout_batch_size': 2,  # per mpi thread
        'n_batches': 40,  # training batches per cycle
        'batch_size': 256,  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
        'n_test_rollouts': 10,  # number of test rollouts per epoch, each consists of rollout_batch_size rollouts
        'test_with_polyak': False,  # run test episodes with the target network
        # exploration
        'random_eps': 0.2,  # percentage of time a random action is taken
        'noise_eps': 0.3,  # std of gaussian noise added to not-completely-random actions as a percentage of max_u
        # HER
        'replay_strategy': 'future',  # supported modes: future, none
        'replay_k': 4,  # number of additional goals used for replay, only used if off_policy_data=future
        # normalization
        'norm_eps': 0.01,  # epsilon used for observation normalization
        'norm_clip': 5,  # normalized observations are cropped to this values
        'bc_loss': 0,  # whether or not to use the behavior cloning loss as an auxilliary loss
        'q_filter': 0,  # whether or not a Q value filter should be used on the Actor outputs
        'num_demo': 25,  # number of expert demo episodes
        'demo_batch_size': 128,  # number of samples to be used from the demonstrations buffer, per mpi thread 128/1024 or 32/256
        'prm_loss_weight': 0.001,  # weight corresponding to the primary loss
        'aux_loss_weight': 0.0078,  # weight corresponding to the auxilliary loss, also called the cloning loss
        'perturb': kwargs['pert_type'],
        'n_actions': kwargs['n_actions'],
    }

    params['replay_strategy'] = replay_strategy
    if env is not None:
        env_name = env.spec.id
        params['env_name'] = env_name
        if env_name in config.DEFAULT_ENV_PARAMS:
            params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    else:
        params['env_name'] = 'NuFingers_Experiment'
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    if demo_file is not None:
        params['bc_loss'] = 1
        params['q_filter'] = 1
        params['n_cycles'] = 20
        params['random_eps'] = 0.1  # chip: ON
        params['noise_eps'] = 0.1  # chip: ON
        # params['batch_size']: 1024
    params = config.prepare_params(params)
    params['rollout_batch_size'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    if env is not None:
        dims = config.configure_dims(params)
    else:
        dims = dict(o=15, u=4, g=7, info_is_success=1)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    print("NAME={}".format(params['env_name']))
    print(rollout_params)

    if params['env_name'].find('NuFingers_Experiment') == -1:
        rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
        evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)
    else:
        rollout_worker = RolloutNuFingers(policy, dims, logger, monitor=True, **rollout_params)
        evaluator = RolloutNuFingers(policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path, policy=policy, rollout_worker=rollout_worker,
                 evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'], n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval, demo_file=demo_file)

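# Hedged sketch of how learn() above converts a timestep budget into epochs: each epoch runs
# n_cycles cycles, and each cycle collects rollout_batch_size episodes of length T per worker.
# The numbers below are assumed, illustrative values, not taken from a specific run.
total_timesteps = 2_500_000
n_cycles, T, rollout_batch_size = 50, 50, 1
n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
assert n_epochs == 1000  # 2.5M steps / (50 cycles * 50 steps * 1 episode) per epoch
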
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           override_params=None, save_policies=True):
    """
    launch training with mpi

    :param env: (str) environment ID
    :param logdir: (str) the log directory
    :param n_epochs: (int) the number of training epochs
    :param num_cpu: (int) the number of CPUs to run on
    :param seed: (int) the initial random seed
    :param replay_strategy: (str) the type of replay strategy ('future' or 'none')
    :param policy_save_interval: (int) the interval with which policy pickles are saved.
        If set to 0, only the best and latest policy will be pickled.
    :param clip_return: (float) clip returns to be in [-clip_return, clip_return]
    :param override_params: (dict) override any parameter for training
    :param save_policies: (bool) whether or not to save the policies
    """
    if override_params is None:
        override_params = {}
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        tf_util.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(folder=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as file_handler:
        json.dump(params, file_handler)
    params = config.prepare_params(params)
    config.log_params(params, logger_input=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        # 'use_demo_states': True,
        'compute_q': False,
        'time_horizon': params['time_horizon'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        # 'use_demo_states': False,
        'compute_q': True,
        'time_horizon': params['time_horizon'],
    }

    for name in ['time_horizon', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(policy=policy, rollout_worker=rollout_worker, evaluator=evaluator, n_epochs=n_epochs,
          n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'],
          n_batches=params['n_batches'], policy_save_interval=policy_save_interval,
          save_policies=save_policies)

def prepare_agent(_env, eval_env, active, exploration='eps_greedy', action_l2=None, scope=None,
                  ss=False, load_path=None):
    # Prepare params.
    _params = copy.deepcopy(config.DEFAULT_PARAMS)
    _kwargs = copy.deepcopy(kwargs)
    _override_params = copy.deepcopy(override_params)
    env_name = _env.spec.id
    _params['env_name'] = env_name
    _params['replay_strategy'] = replay_strategy
    _params['ss'] = ss
    if action_l2 is not None:
        _params['action_l2'] = action_l2
    if not active:
        _params["buffer_size"] = 1
    if env_name in config.DEFAULT_ENV_PARAMS:
        _params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    _params.update(**_override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(_params, f)
    _params = config.prepare_params(_params)
    _params['rollout_batch_size'] = _env.num_envs

    if demo_file is not None:
        _params['bc_loss'] = 1
    _params.update(_kwargs)

    config.log_params(_params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims, coord_dict = config.configure_dims(_params)
    _params['ddpg_params']['scope'] = scope
    policy, reward_fun = config.configure_ddpg(dims=dims, params=_params, active=active, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)
        print(f"Loaded model: {load_path}")

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'exploration': exploration
    }

    eval_params = {
        'exploit': True,
        'use_target_net': _params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = _params[name]
        eval_params[name] = _params[name]

    eval_env = eval_env or _env

    rollout_worker = RolloutWorker(_env, policy, dims, logger, active, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, active, **eval_params)

    return policy, rollout_worker, evaluator, _params, coord_dict, reward_fun

def main(policy_file, seed, n_test_rollouts, render, exploit, compute_q, collect_data,
         goal_generation, note):
    set_global_seeds(seed)

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['note'] = note or params['note']
    if note:
        with open('params/' + env_name + '/' + note + '.json', 'r') as file:
            override_params = json.loads(file.read())
        params.update(**override_params)
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    goal_generation = params['goal_generation']
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': exploit,  # eval: True, train: False
        'use_target_net': params['test_with_polyak'],  # eval/train: False
        'compute_Q': compute_q,  # eval: True, train: False
        'rollout_batch_size': 1,
        'render': render,
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation.
    evaluator.clear_history()

    num_skills = params['num_skills']

    if goal_generation == 'Zero':
        generated_goal = np.zeros(evaluator.g.shape)
    else:
        generated_goal = False

    for z in range(num_skills):
        assert evaluator.rollout_batch_size == 1
        z_s_onehot = np.zeros([evaluator.rollout_batch_size, num_skills])
        z_s_onehot[0, z] = 1

        base = os.path.splitext(policy_file)[0]
        for i_test_rollouts in range(n_test_rollouts):
            if render == 'rgb_array' or render == 'human':
                imgs, episode = evaluator.generate_rollouts(generated_goal=generated_goal, z_s_onehot=z_s_onehot)
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.avi'.format(i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                save_video(imgs[0], test_filename, lib='cv2')
            else:
                episode = evaluator.generate_rollouts(generated_goal=generated_goal, z_s_onehot=z_s_onehot)

            if collect_data:
                end = '_test_{:02d}_exploit_{}_compute_q_{}_skill_{}.txt'.format(i_test_rollouts, exploit, compute_q, z)
                test_filename = base + end
                with open(test_filename, 'w') as file:
                    file.write(json.dumps(episode['o'].tolist()))

    # record logs
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()

def launch(env_name, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           temperature, prioritization, binding, logging, version, dump_buffer, n_cycles,
           rank_method, fit_interval, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu, binding)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if logging:
        logdir = 'logs/' + str(env_name) + '-prioritization' + str(prioritization) + \
                 '-replay_strategy' + str(replay_strategy) + '-n_epochs' + str(n_epochs) + \
                 '-num_cpu' + str(num_cpu) + '-seed' + str(seed) + '-version' + str(version)
    else:
        logdir = osp.join(tempfile.gettempdir(),
                          datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))

    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    params['temperature'] = temperature
    params['prioritization'] = prioritization
    params['binding'] = binding
    params['max_timesteps'] = n_epochs * params['n_cycles'] * params['n_batches'] * num_cpu
    params['version'] = version
    params['dump_buffer'] = dump_buffer
    params['n_cycles'] = n_cycles
    params['rank_method'] = rank_method
    params['fit_interval'] = fit_interval
    params['n_epochs'] = n_epochs
    params['num_cpu'] = num_cpu
    if params['dump_buffer']:
        params['alpha'] = 0

    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          num_cpu=num_cpu, dump_buffer=dump_buffer, rank_method=rank_method,
          fit_interval=fit_interval, prioritization=prioritization)

def learn(*, network, env, total_timesteps, seed=None, eval_env=None, replay_strategy='future',
          policy_save_interval=5, clip_return=True, demo_file=None, override_params=None,
          load_path=None, save_path=None, **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.spec.id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(save_path=save_path, policy=policy, rollout_worker=rollout_worker,
                 evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
                 n_cycles=params['n_cycles'], n_batches=params['n_batches'],
                 policy_save_interval=policy_save_interval, demo_file=demo_file)

def launch(env, trial_id, n_epochs, num_cpu, seed, policy_save_interval, clip_return, normalize_obs,
           structure, task_selection, goal_selection, goal_replay, task_replay, perturb,
           save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        save_dir = find_save_path('./save/' + env + "/", trial_id)
        logger.configure(dir=save_dir)
    else:
        save_dir = None

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params, add main function arguments and log all parameters
    if structure == 'curious' or structure == 'task_experts':
        params = config.MULTI_TASK_PARAMS
    else:
        params = config.DEFAULT_PARAMS

    time = str(datetime.datetime.now())
    params['time'] = time
    params['env_name'] = env
    params['task_selection'] = task_selection
    params['goal_selection'] = goal_selection
    params['task_replay'] = task_replay
    params['goal_replay'] = goal_replay
    params['structure'] = structure
    params['normalize_obs'] = normalize_obs
    params['num_cpu'] = num_cpu
    params['clip_return'] = clip_return
    params['trial_id'] = trial_id
    params['seed'] = seed
    if rank == 0:
        with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
            json.dump(params, f)
    params = config.prepare_params(params)
    params['ddpg_params']['normalize_obs'] = normalize_obs
    if rank == 0:
        config.log_params(params, logger=logger)

    if num_cpu != 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Colas et al. (2018, https://arxiv.org/abs/1810.06284) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)

    buffers = config.configure_buffer(dims=dims, params=params)

    # creates several policies with shared buffers in the task-experts structure, otherwise use just one policy
    if structure == 'task_experts':
        policy = [config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return, t_id=i)
                  for i in range(params['nb_tasks'])]
    else:
        policy = config.configure_ddpg(dims=dims, params=params, buffers=buffers, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': False,
        'eps_task': params['eps_task']
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
        'structure': structure,
        'task_selection': task_selection,
        'goal_selection': goal_selection,
        'queue_length': params['queue_length'],
        'eval': True,
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    if structure == 'task_experts':
        # create one rollout worker per policy/task
        rollout_worker = [RolloutWorker(params['make_env'], policy[i], dims, logger, unique_task=i, **rollout_params)
                          for i in range(params['nb_tasks'])]
        for i in range(params['nb_tasks']):
            rollout_worker[i].seed(rank_seed + i)
    else:
        rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
        rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed + 100)

    train(logdir=save_dir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'], n_cycles=params['n_cycles'],
          n_batches=params['n_batches'], perturbation_study=perturb,
          policy_save_interval=policy_save_interval, save_policies=save_policies,
          structure=structure, task_selection=task_selection, params=params)

def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(
        logdir=logdir, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, save_policies=save_policies)
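
# --- Seeding sketch (not part of the original training script) ---------------
# Every launch() variant in this file derives one seed per MPI rank
# (rank_seed = seed + 1000000 * rank) so that workers collect different
# rollouts while the whole run stays reproducible for a fixed base seed.
# The numbers below are purely illustrative.
base_seed = 0
num_cpu = 4  # hypothetical worker count
rank_seeds = [base_seed + 1000000 * rank for rank in range(num_cpu)]
print(rank_seeds)  # [0, 1000000, 2000000, 3000000]
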
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    # if rank == 0:
    #     if logdir or logger.get_dir() is None:
    #         logger.configure(dir=logdir)
    # else:
    #     logger.configure()
    # logdir = logger.get_dir()
    # assert logdir is not None
    # os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    # with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
    #     json.dump(params, f)
    # params at this point:
    # {'Q_lr': 0.001, 'action_l2': 1.0, 'batch_size': 256, 'buffer_size': 1000000, 'clip_obs': 200.0,
    #  'env_name': u'GazeboSmartBotPinc...Kinect-v0', 'hidden': 256, 'layers': 3, 'max_u': 1.0, 'n_batches': 40,
    #  'n_cycles': 50, 'n_test_rollouts': 10, 'network_class': 'baselines.her.actor...torCritic', 'noise_eps': 0.2, ...}
    params = config.prepare_params(params)
    # params after prepare_params():
    # {'T': 200, '_Q_lr': 0.001, '_action_l2': 1.0, '_batch_size': 256, '_buffer_size': 1000000, '_clip_obs': 200.0,
    #  '_hidden': 256, '_layers': 3, '_max_u': 1.0, '_network_class': 'baselines.her.actor...torCritic',
    #  '_norm_clip': 5, '_norm_eps': 0.01, '_pi_lr': 0.001, '_polyak': 0.95, ...}
    config.log_params(params, logger=logger)
    # Logged output:
    # T: 200
    # _Q_lr: 0.001
    # _action_l2: 1.0
    # _batch_size: 256
    # _buffer_size: 1000000
    # _clip_obs: 200.0
    # _hidden: 256
    # _layers: 3
    # _max_u: 1.0
    # _network_class: baselines.her.actor_critic:ActorCritic
    # _norm_clip: 5
    # _norm_eps: 0.01
    # _pi_lr: 0.001
    # _polyak: 0.95
    # _relative_goals: False
    # _scope: ddpg
    # ddpg_params: {'layers': 3, 'norm_clip': 5, 'action_l2': 1.0, 'pi_lr': 0.001, 'norm_eps': 0.01,
    #               'batch_size': 256, 'polyak': 0.95, 'clip_obs': 200.0,
    #               'network_class': 'baselines.her.actor_critic:ActorCritic', 'max_u': 1.0, 'Q_lr': 0.001,
    #               'scope': 'ddpg', 'buffer_size': 1000000, 'hidden': 256, 'relative_goals': False}
    # env_name: GazeboSmartBotPincherKinect-v0
    # gamma: 0.995
    # make_env: <function make_env at 0x7f7de3cb2050>
    # n_batches: 40
    # n_cycles: 50
    # n_test_rollouts: 10
    # noise_eps: 0.2
    # random_eps: 0.3
    # replay_k: 4
    # replay_strategy: future
    # rollout_batch_size: 2
    # test_with_polyak: False

    # if num_cpu == 1:
    #     logger.warn()
    #     logger.warn('*** Warning ***')
    #     logger.warn(
    #         'You are running HER with just a single MPI worker. This will work, but the ' +
    #         'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
    #         'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
    #         'are looking to reproduce those results, be aware of this. Please also refer to ' +
    #         'https://github.com/openai/baselines/issues/314 for further details.')
    #     logger.warn('****************')
    #     logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
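
# --- Usage sketch (not part of the original training script) -----------------
# Because this launch() does params.update(**override_params), any key in
# config.DEFAULT_PARAMS can be replaced per run. The call below is hypothetical;
# the argument values are assumptions chosen only for illustration.
launch(
    env='GazeboSmartBotPincherKinect-v0',  # env id as it appears in the logged params above
    logdir=None,
    n_epochs=50,
    num_cpu=1,
    seed=0,
    replay_strategy='future',
    policy_save_interval=5,
    clip_return=1,
    override_params={'n_cycles': 10, 'rollout_batch_size': 1},
)
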
def learn(*, network, env, total_timesteps,
          seed=None,
          eval_env=None,
          replay_strategy='future',
          policy_save_interval=5,
          clip_return=True,
          demo_file=None,
          override_params=None,
          load_path=None,
          save_path=None,
          **kwargs):
    override_params = override_params or {}
    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
        num_cpu = MPI.COMM_WORLD.Get_size()
    else:
        # Fall back to single-process values when mpi4py is unavailable.
        rank, num_cpu = 0, 1

    # Seed everything.
    rank_seed = seed + 1000000 * rank if seed is not None else None
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    env_name = env.specs[0].id
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    params['rollout_batch_size'] = env.num_envs

    if demo_file is not None:
        params['bc_loss'] = 1
    params.update(kwargs)

    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)
    if load_path is not None:
        tf_util.load_variables(load_path)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    eval_env = eval_env or env

    rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params)
    evaluator = RolloutWorker(eval_env, policy, dims, logger, **eval_params)

    n_cycles = params['n_cycles']
    n_epochs = total_timesteps // n_cycles // rollout_worker.T // rollout_worker.rollout_batch_size

    return train(
        save_path=save_path, policy=policy, rollout_worker=rollout_worker,
        evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
        n_cycles=params['n_cycles'], n_batches=params['n_batches'],
        policy_save_interval=policy_save_interval, demo_file=demo_file)
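
# --- Worked example (not part of the original module) -------------------------
# learn() converts the timestep budget into epochs with
#   n_epochs = total_timesteps // n_cycles // T // rollout_batch_size.
# The numbers below are illustrative, not taken from any particular run.
total_timesteps = 5_000_000
n_cycles = 50            # optimization cycles per epoch
T = 50                   # episode horizon (params['T'])
rollout_batch_size = 2   # parallel envs per rollout worker
n_epochs = total_timesteps // n_cycles // T // rollout_batch_size
print(n_epochs)  # 1000
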
def main(policy_file, seed, n_test_rollouts, render, fname):
    set_global_seeds(seed)
    obs_npy, achieved_goal_npy, goal_npy, action_npy, reward_npy, info_npy = [], [], [], [], [], []

    # Load policy.
    with open(policy_file, 'rb') as f:
        policy = pickle.load(f)
    env_name = policy.info['env_name']

    # Prepare params.
    params = config.DEFAULT_PARAMS
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params['env_name'] = env_name
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'compute_Q': True,
        'rollout_batch_size': 1,
        'render': bool(render),
    }

    for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
        eval_params[name] = params[name]

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(seed)

    # Run evaluation and collect the rollouts.
    evaluator.clear_history()
    print("Progress Rate ::")
    for i in tqdm(range(n_test_rollouts)):
        episode = evaluator.generate_rollouts()
        obs, goals, achieved_goals, acts, reward_arr, successes = clean_data(episode)
        obs_npy.append(np.array(obs))
        goal_npy.append(np.array(goals))
        achieved_goal_npy.append(np.array(achieved_goals))
        action_npy.append(np.array(acts))
        reward_npy.append(np.array(reward_arr))
        info_npy.append(np.array(successes))

    # Save the collected rollouts to a single .npz file.
    np.savez(fname, obs=obs_npy, g=goal_npy, ag=achieved_goal_npy,
             acts=action_npy, rew=reward_npy, info=info_npy)

    # Record logs.
    for key, val in evaluator.logs('test'):
        logger.record_tabular(key, np.mean(val))
    logger.dump_tabular()
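
# --- Usage sketch (not part of the original script) ---------------------------
# Minimal sketch of reading back the rollout file written by main() above.
# 'rollouts.npz' stands in for whatever fname was passed in; allow_pickle=True
# covers the case where episodes of different lengths were stored as object arrays.
import numpy as np

data = np.load('rollouts.npz', allow_pickle=True)
obs, goals = data['obs'], data['g']
achieved, actions = data['ag'], data['acts']
rewards, successes = data['rew'], data['info']
print(len(obs), obs[0].shape)  # number of episodes and shape of the first one
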