def main(config):
    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()
    env = normalize(HalfCheetahRandDirecEnv())
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )
    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )
    sample_processor = DiceMAMLSampleProcessor(
        baseline=reward_baseline,
        max_path_length=config['max_path_length'],
        discount=config['discount'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
        return_baseline=return_baseline,
    )
    algo = VPG_DICEMAML(
        policy=policy,
        max_path_length=config['max_path_length'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        inner_lr=config['inner_lr'],
        learning_rate=config['learning_rate'],
    )
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],  # This is repeated in MAMLPPO, it's confusing
    )
    trainer.train()
def main(config):
    baseline = LinearFeatureBaseline()
    env = normalize(HopperRandParamsEnv())
    obs_dim = np.prod(env.observation_space.shape)
    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=obs_dim,
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
    )
    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],  # This batch_size is confusing
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=5,
    )
    sample_processor = SingleSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )
    algo = PPO(
        policy=policy,
        learning_rate=config['learning_rate'],
        max_epochs=config['max_epochs'],
    )
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
    )
    trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())
        Qs = [ValueFunction(name="q_fun_%d" % i,
                            obs_dim=int(np.prod(env.observation_space.shape)),
                            action_dim=int(np.prod(env.action_space.shape)))
              for i in range(2)]
        Q_targets = [ValueFunction(name="q_fun_target_%d" % i,
                                   obs_dim=int(np.prod(env.observation_space.shape)),
                                   action_dim=int(np.prod(env.action_space.shape)))
                     for i in range(2)]
        policy = TanhGaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )
        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )
        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = SAC(
            policy=policy,
            discount=kwargs['discount'],
            learning_rate=kwargs['learning_rate'],
            env=env,
            Qs=Qs,
            Q_targets=Q_targets,
            reward_scale=kwargs['reward_scale'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )
        trainer.train()
    # explicitly close the session once training is done
    sess.close()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())
        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
            init_std=kwargs['init_std'],
        )
        # Load policy here
        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )
        sample_processor = SingleSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            sess=sess,
        )
        trainer.train()
def run_experiment(**config):
    set_seed(config['seed'])
    original_saved_path = config['saved_path']
    if original_saved_path is not None:
        saved_model = joblib.load(config['saved_path'])
        if 'config' in saved_model:
            if not config['override_old_config']:
                config = saved_model['config']
    arguments = {
        "start_loc": 'all',
        "include_holdout_obj": False,
        "persist_goal": config['persist_goal'],
        "persist_objs": config['persist_objs'],
        "persist_agent": config['persist_agent'],
        "feedback_type": config["feedback_type"],
        "feedback_always": config["feedback_always"],
        "feedback_freq": config["feedback_freq"],
        "cartesian_steps": config["cartesian_steps"],
        "num_meta_tasks": config["rollouts_per_meta_task"],
        "intermediate_reward": config["intermediate_reward"],
    }
    advice_start_index = 160
    if original_saved_path is not None:
        set_seed(config['seed'])
        policy = saved_model['policy']
        optimizer = saved_model['optimizer']
        policy.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # TODO: is this necessary?
        policy.hidden_state = None
        baseline = saved_model['baseline']
        curriculum_step = saved_model['curriculum_step']
        env = rl2env(normalize(Curriculum(config['advance_curriculum_func'],
                                          start_index=curriculum_step,
                                          **arguments)),
                     ceil_reward=config['ceil_reward'])
        start_itr = saved_model['itr']
        reward_predictor = saved_model['reward_predictor']
        reward_predictor.hidden_state = None
        if 'supervised_model' in saved_model:
            supervised_model = saved_model['supervised_model']
        else:
            supervised_model = None
        teacher_train_dict = {}
        for teacher_name in config['feedback_type']:
            teacher_train_dict[teacher_name] = True
    else:
        teacher_train_dict = {}
        for teacher_name in config['feedback_type']:
            teacher_train_dict[teacher_name] = True
        optimizer = None
        baseline = None
        env = rl2env(normalize(Curriculum(config['advance_curriculum_func'],
                                          start_index=config['level'],
                                          **arguments)),
                     ceil_reward=config['ceil_reward'])
        obs = env.reset()
        obs_dim = 100  # TODO: consider changing this with 'additional' and adding it!
        advice_size = sum([np.prod(obs[adv_k].shape) for adv_k in teacher_train_dict.keys()])
        image_dim = 128
        memory_dim = config['memory_dim']
        instr_dim = config['instr_dim']
        use_instr = True
        instr_arch = 'bigru'
        use_mem = True
        arch = 'bow_endpool_res'
        advice_dim = 128  # TODO: move this to the config
        policy = ACModel(obs_space=obs_dim,
                         action_space=env.action_space,
                         env=env,
                         image_dim=image_dim,
                         memory_dim=memory_dim,
                         instr_dim=instr_dim,
                         lang_model=instr_arch,
                         use_instr=use_instr,
                         use_memory=use_mem,
                         arch=arch,
                         advice_dim=advice_dim,
                         advice_size=advice_size,
                         num_modules=config['num_modules'])
        reward_predictor = ACModel(
            obs_space=obs_dim - 1,  # TODO: change into Discrete(3) and do 3-way classification
            action_space=spaces.Discrete(2),
            env=env,
            image_dim=image_dim,
            memory_dim=memory_dim,
            instr_dim=instr_dim,
            lang_model=instr_arch,
            use_instr=use_instr,
            use_memory=use_mem,
            arch=arch,
            advice_dim=advice_dim,
            advice_size=advice_size,
            num_modules=config['num_modules'])
        if config['self_distill'] and not config['distill_same_model']:
            obs_dim = env.reset()['obs'].shape[0]
            image_dim = 128
            memory_dim = config['memory_dim']
            instr_dim = config['instr_dim']
            use_instr = True
            instr_arch = 'bigru'
            use_mem = True
            arch = 'bow_endpool_res'
            supervised_model = ACModel(obs_space=obs_dim - 1,
                                       action_space=env.action_space,
                                       env=env,
                                       image_dim=image_dim,
                                       memory_dim=memory_dim,
                                       instr_dim=instr_dim,
                                       lang_model=instr_arch,
                                       use_instr=use_instr,
                                       use_memory=use_mem,
                                       arch=arch,
                                       advice_dim=advice_dim,
                                       advice_size=advice_size,
                                       num_modules=config['num_modules'])
        elif config['self_distill']:
            supervised_model = policy
        else:
            supervised_model = None
        start_itr = 0
        curriculum_step = env.index
    parser = ArgumentParser()
    args = parser.parse_args([])
    args.entropy_coef = config['entropy_bonus']
    args.model = 'default_il'
    args.lr = config['learning_rate']
    args.recurrence = config['backprop_steps']
    args.clip_eps = config['clip_eps']
    if supervised_model is not None:
        il_trainer = ImitationLearning(supervised_model, env, args,
                                       distill_with_teacher=config['distill_with_teacher'])
    else:
        il_trainer = None
    rp_trainer = ImitationLearning(reward_predictor, env, args,
                                   distill_with_teacher=True,
                                   reward_predictor=True)
    teacher_null_dict = env.teacher.null_feedback()
    obs_preprocessor = make_obs_preprocessor(teacher_null_dict)
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
        envs_per_task=1,
        reward_predictor=reward_predictor,
        supervised_model=supervised_model,
        obs_preprocessor=obs_preprocessor,
    )
    sample_processor = RL2SampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )
    envs = [copy.deepcopy(env) for _ in range(20)]
    algo = PPOAlgo(policy, envs,
                   config['frames_per_proc'],
                   config['discount'],
                   args.lr,
                   args.beta1,
                   args.beta2,
                   config['gae_lambda'],
                   args.entropy_coef,
                   config['value_loss_coef'],
                   config['max_grad_norm'],
                   args.recurrence,
                   args.optim_eps,
                   config['clip_eps'],
                   config['epochs'],
                   config['meta_batch_size'],
                   parallel=config['parallel'],
                   rollouts_per_meta_task=config['rollouts_per_meta_task'],
                   obs_preprocessor=obs_preprocessor)
    if optimizer is not None:
        algo.optimizer.load_state_dict(optimizer)
    EXP_NAME = get_exp_name(config)
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + "_" + str(config['seed'])
    if original_saved_path is None:
        if os.path.isdir(exp_dir):
            shutil.rmtree(exp_dir)
    log_formats = ['stdout', 'log', 'csv']
    is_debug = config['prefix'] == 'DEBUG'
    if not is_debug:
        log_formats.append('tensorboard')
        log_formats.append('wandb')
    logger.configure(dir=exp_dir, format_strs=log_formats,
                     snapshot_mode=config['save_option'],
                     snapshot_gap=50,
                     step=start_itr,
                     name=config['prefix'] + str(config['seed']),
                     config=config)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    advice_end_index, advice_dim = 161, 1
    if config['distill_with_teacher']:  # TODO: generalize this for multiple feedback types at once!
        teacher_info = []
    else:
        null_val = np.zeros(advice_end_index - advice_start_index)
        if len(null_val) > 0:
            null_val[-1] = 1
        teacher_info = [{
            "indices": np.arange(advice_start_index, advice_end_index),
            "null": null_val,
        }]
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=deepcopy(env),
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        start_itr=start_itr,
        success_threshold=config['success_threshold'],
        accuracy_threshold=config['accuracy_threshold'],
        exp_name=exp_dir,
        curriculum_step=curriculum_step,
        config=config,
        advance_without_teacher=True,
        teacher_info=teacher_info,
        sparse_rewards=not config['intermediate_reward'],
        distill_only=config['distill_only'],
        il_trainer=il_trainer,
        source=config['source'],
        batch_size=config['meta_batch_size'],
        train_with_teacher=config['feedback_type'] is not None,
        distill_with_teacher=config['distill_with_teacher'],
        supervised_model=supervised_model,
        reward_predictor=reward_predictor,
        rp_trainer=rp_trainer,
        advance_levels=config['advance_levels'],
        is_debug=is_debug,
        teacher_train_dict=teacher_train_dict,
        obs_preprocessor=obs_preprocessor,
    )
    trainer.train()
def run_experiment(**kwargs):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
        logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
        json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
        # Instantiate classes
        set_seed(kwargs['seed'])
        env = normalize(kwargs['env']())  # Wrappers?
        baseline = NNValueFun(
            'value-function',
            env,
            hidden_nonlinearity=kwargs['vfun_hidden_nonlinearity'],
            hidden_sizes=kwargs['vfun_hidden_sizes'],
            output_nonlinearity=kwargs['vfun_output_nonlinearity'],
            learning_rate=kwargs['vfun_learning_rate'],
            batch_size=kwargs['vfun_batch_size'],
            buffer_size=kwargs['vfun_buffer_size'],
            normalize_input=False,
        )
        policy = GaussianMLPPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )
        dynamics_model = MLPDynamicsModel(
            'prob-dynamics',
            env=env,
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
            normalize_input=False,
        )
        assert kwargs['num_rollouts'] % kwargs['n_parallel'] == 0
        sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )
        sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = SVG1(
            policy=policy,
            dynamics_model=dynamics_model,
            value_function=baseline,
            tf_reward=env.tf_reward,
            learning_rate=kwargs['svg_learning_rate'],
            num_grad_steps=kwargs['num_rollouts'] * kwargs['max_path_length'] // kwargs['svg_batch_size'],
            batch_size=kwargs['svg_batch_size'],
            discount=kwargs['discount'],
            kl_penalty=kwargs['kl_penalty'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            dynamics_model=dynamics_model,
            value_function=baseline,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            vfun_max_epochs=kwargs['vfun_max_epochs'],
            sess=sess,
        )
        trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # Wrappers?
        policy = GaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )
        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )
        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['n_parallel'],
        )
        model_sampler = METRPOSampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['imagined_num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )
        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        model_sample_processor = SampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = PPO(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            clip_eps=kwargs['clip_eps'],
            max_epochs=kwargs['num_ppo_steps'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )
        trainer.train()
env_class = getattr(iclr19_levels, args.class_name)
# env_args = {
#     'start_loc': args.start_loc,
#     'include_holdout_obj': args.holdout_obj,
# }
# if args.grid_size is not None:
#     env_args['room_size'] = args.grid_size
# if args.num_dists is not None:
#     env_args['num_dists'] = args.num_dists
# e_new = env_class(**env_args)
e_new = env_class(**arguments)
e_new.use_teacher = args.use_teacher
if args.use_teacher:
    teacher = PostActionAdvice(Bot, e_new)
    e_new.teacher = teacher
    e_new.teacher.set_feedback_type(args.feedback_type)
env = rl2env(normalize(e_new))
video_filename = os.path.join(args.path, 'saved_video.mp4')
paths, accuracy = rollout(env, policy,
                          max_path_length=max_path_length,
                          animated=args.animated,
                          speedup=args.speedup,
                          video_filename=video_filename,
                          save_video=True,
                          ignore_done=args.ignore_done,
                          batch_size=1,
                          stochastic=args.stochastic,
                          num_rollouts=args.num_rollouts,
                          reset_every=args.reset_every,
                          record_teacher=True,
                          reward_predictor=reward_predictor,
                          dense_rewards=args.dense_rewards)
print('Average Returns: ', np.mean([sum(path['rewards']) for path in paths]))
print('Average Path Length: ', np.mean([path['env_infos'][-1]['episode_length'] for path in paths]))
print('Average Success Rate: ', np.mean([path['env_infos'][-1]['success'] for path in paths]))
tf.reset_default_graph()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        if not kwargs['use_images']:
            env = normalize(kwargs['env']())
        else:
            vae = VAE(latent_dim=8)
            env = image_wrapper(normalize(kwargs['env']()), vae=vae, latent_dim=32)
        policy = NNPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            normalization=None,
        )
        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )
        # dynamics_model = None
        assert kwargs['rollouts_per_policy'] % kwargs['num_models'] == 0
        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            n_parallel=kwargs['num_rollouts'],
        )
        # TODO: I'm not sure if it works with more than one rollout per model
        model_sampler = ARSSampler(
            env=env,
            policy=policy,
            dynamics_model=dynamics_model,
            rollouts_per_policy=kwargs['rollouts_per_policy'],
            max_path_length=kwargs['horizon'],
            num_deltas=kwargs['num_deltas'],
            n_parallel=1,
        )
        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        ars_sample_processor = ARSSamplerProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
            uncertainty_coeff=kwargs['uncertainty_coeff'],
        )
        algo = RandomSearchOptimizer(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            num_deltas=kwargs['num_deltas'],
            percentile=kwargs['percentile'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            ars_sample_processor=ars_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            num_deltas=kwargs['num_deltas'],
            n_itr=kwargs['n_itr'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            delta_std=kwargs['delta_std'],
            sess=sess,
            initial_random_samples=True,
            sample_from_buffer=kwargs['sample_from_buffer'],
        )
        trainer.train()
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + config.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    env = normalize(config['env']())
    if config['recurrent']:
        dynamics_model = RNNDynamicsEnsemble(
            name="dyn_model",
            env=env,
            hidden_sizes=config['hidden_sizes_model'],
            learning_rate=config['learning_rate'],
            backprop_steps=config['backprop_steps'],
            cell_type=config['cell_type'],
            num_models=config['num_models'],
            batch_size=config['batch_size_model'],
            normalize_input=True,
        )
        policy = RNNMPCController(
            name="policy",
            env=env,
            dynamics_model=dynamics_model,
            discount=config['discount'],
            n_candidates=config['n_candidates'],
            horizon=config['horizon'],
            use_cem=config['use_cem'],
            num_cem_iters=config['num_cem_iters'],
            use_reward_model=config['use_reward_model'],
        )
    else:
        dynamics_model = MLPDynamicsEnsemble(
            name="dyn_model",
            env=env,
            learning_rate=config['learning_rate'],
            hidden_sizes=config['hidden_sizes_model'],
            weight_normalization=config['weight_normalization_model'],
            num_models=config['num_models'],
            valid_split_ratio=config['valid_split_ratio'],
            rolling_average_persitency=config['rolling_average_persitency'],
            hidden_nonlinearity=config['hidden_nonlinearity_model'],
            batch_size=config['batch_size_model'],
        )
        policy = MPCController(
            name="policy",
            env=env,
            dynamics_model=dynamics_model,
            discount=config['discount'],
            n_candidates=config['n_candidates'],
            horizon=config['horizon'],
            use_cem=config['use_cem'],
            num_cem_iters=config['num_cem_iters'],
        )
    sampler = BaseSampler(
        env=env,
        policy=policy,
        num_rollouts=config['num_rollouts'],
        max_path_length=config['max_path_length'],
    )
    sample_processor = ModelSampleProcessor()
    algo = Trainer(
        env=env,
        policy=policy,
        dynamics_model=dynamics_model,
        sampler=sampler,
        dynamics_sample_processor=sample_processor,
        n_itr=config['n_itr'],
        initial_random_samples=config['initial_random_samples'],
        dynamics_model_max_epochs=config['dynamic_model_epochs'],
    )
    algo.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # Wrappers?
        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['hidden_sizes'],
            learn_std=kwargs['learn_std'],
            hidden_nonlinearity=kwargs['hidden_nonlinearity'],
            output_nonlinearity=kwargs['output_nonlinearity'],
        )
        # Load policy here
        sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            parallel=kwargs['parallel'],
        )
        sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            sess=sess,
        )
        trainer.train()
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    # Instantiate classes
    set_seed(config['seed'])
    baseline = config['baseline']()
    env = normalize(config['env']())  # Wrappers?
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=config['meta_batch_size'],
        hidden_sizes=config['hidden_sizes'],
        learn_std=config['learn_std'],
        hidden_nonlinearity=config['hidden_nonlinearity'],
        output_nonlinearity=config['output_nonlinearity'],
    )
    # Load policy here
    sampler = MetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=config['rollouts_per_meta_task'],
        meta_batch_size=config['meta_batch_size'],
        max_path_length=config['max_path_length'],
        parallel=config['parallel'],
    )
    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=config['discount'],
        gae_lambda=config['gae_lambda'],
        normalize_adv=config['normalize_adv'],
        positive_adv=config['positive_adv'],
    )
    algo = PPOMAML(
        policy=policy,
        inner_lr=config['inner_lr'],
        meta_batch_size=config['meta_batch_size'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
        learning_rate=config['learning_rate'],
        num_ppo_steps=config['num_ppo_steps'],
        num_minibatches=config['num_minibatches'],
        clip_eps=config['clip_eps'],
        clip_outer=config['clip_outer'],
        target_outer_step=config['target_outer_step'],
        target_inner_step=config['target_inner_step'],
        init_outer_kl_penalty=config['init_outer_kl_penalty'],
        init_inner_kl_penalty=config['init_inner_kl_penalty'],
        adaptive_outer_kl_penalty=config['adaptive_outer_kl_penalty'],
        adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'],
        anneal_factor=config['anneal_factor'],
    )
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=config['n_itr'],
        num_inner_grad_steps=config['num_inner_grad_steps'],
    )
    trainer.train()
def run_experiment(**kwargs):
    num = Num()
    exp_name = EXP_NAME + str(num.EXP_NUM)
    exp_dir = os.getcwd() + '/data/video_peg/' + EXP_NAME + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['csv', 'stdout', 'log'],
                     snapshot_mode='all')  # change to all
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    Num.EXP_NUM += 1
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        env = normalize(kwargs['env']())  # Wrappers?
        policy = MetaGaussianMLPPolicy(
            name="meta-policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=kwargs['meta_batch_size'],
            hidden_sizes=kwargs['policy_hidden_sizes'],
            learn_std=kwargs['policy_learn_std'],
            hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
            output_nonlinearity=kwargs['policy_output_nonlinearity'],
        )
        dynamics_model = MLPDynamicsEnsemble(
            'dynamics-ensemble',
            env=env,
            num_models=kwargs['num_models'],
            hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
            hidden_sizes=kwargs['dynamics_hidden_sizes'],
            output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
            learning_rate=kwargs['dynamics_learning_rate'],
            batch_size=kwargs['dynamics_batch_size'],
            buffer_size=kwargs['dynamics_buffer_size'],
        )
        env_sampler = BaseSampler(
            env=env,
            policy=policy,
            # rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
            num_rollouts=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            sleep_reset=2.5,
            # parallel=kwargs['parallel'],
            # parallel=False
        )
        model_sampler = MBMPOSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
            meta_batch_size=kwargs['meta_batch_size'],
            max_path_length=kwargs['max_path_length'],
            dynamics_model=dynamics_model,
            deterministic=kwargs['deterministic'],
        )
        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        model_sample_processor = MAMLSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = TRPOMAML(
            policy=policy,
            step_size=kwargs['step_size'],
            inner_type=kwargs['inner_type'],
            inner_lr=kwargs['inner_lr'],
            meta_batch_size=kwargs['meta_batch_size'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            exploration=kwargs['exploration'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            model_sample_processor=model_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            dynamics_model=dynamics_model,
            n_itr=kwargs['n_itr'],
            num_inner_grad_steps=kwargs['num_inner_grad_steps'],
            dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
            log_real_performance=kwargs['log_real_performance'],
            meta_steps_per_iter=kwargs['meta_steps_per_iter'],
            sample_from_buffer=True,
            sess=sess,
        )
        trainer.train()
parser.add_argument(
    '--stochastic',
    action='store_true',
    help='Apply stochastic actions instead of deterministic ones')
args = parser.parse_args()

# If the snapshot file uses tensorflow, do:
#   import tensorflow as tf
#   with tf.Session():
#       [rest of the code]
with tf.Session() as sess:
    pkl_path = args.param
    print("Testing policy %s" % pkl_path)
    data = joblib.load(pkl_path)
    policy = data['policy']
    env = normalize(BlueReacherEnv(side='right'))
    goal = data['env'].goal_right
    env.goal[0] = -goal[1]
    env.goal[1] = goal[2]
    env.goal[2] = goal[0]
    real_rewards = np.array([])
    act_rewards = np.array([])
    pos_rewards = np.array([])
    mujoco_env_mimic_act = data['env']
    for _ in range(args.num_rollouts):
        path = rollout(env, policy,
                       max_path_length=args.max_path_length,
                       animated=False,
                       speedup=args.speedup,
                       video_filename=args.video_filename,
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    sess = tf.Session(config=config)
    with sess.as_default() as sess:
        # Instantiate classes
        set_seed(kwargs['seed'])
        baseline = kwargs['baseline']()
        if not kwargs['use_images']:
            env = normalize(kwargs['env'](policytask=kwargs['task']))
            vae = None
        else:
            vae = VAE(latent_dim=kwargs['latent_dim'],
                      channels=3 * kwargs['time_steps'])
            env = image_wrapper(normalize(kwargs['env']()),
                                latent_dim=kwargs['latent_dim'],
                                time_steps=kwargs['time_steps'])
        policy = NNPolicy(
            name="policy",
            obs_dim=np.prod(env.observation_space.shape),
            action_dim=np.prod(env.action_space.shape),
            hidden_sizes=kwargs['hidden_sizes'],
            normalization=kwargs['normalization'],
        )
        env_sampler = Sampler(
            env=env,
            policy=policy,
            num_rollouts=kwargs['num_rollouts'],
            max_path_length=kwargs['max_path_length'],
            vae=vae,
        )
        model_sampler = ARSSampler(
            env=env,
            policy=policy,
            rollouts_per_policy=kwargs['rollouts_per_policy'],
            max_path_length=kwargs['max_path_length'],
            num_deltas=kwargs['num_deltas'],
            n_parallel=kwargs['num_deltas'],
            vae=vae,
        )
        dynamics_sample_processor = ModelSampleProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        ars_sample_processor = ARSSamplerProcessor(
            baseline=baseline,
            discount=kwargs['discount'],
            gae_lambda=kwargs['gae_lambda'],
            normalize_adv=kwargs['normalize_adv'],
            positive_adv=kwargs['positive_adv'],
        )
        algo = RandomSearchOptimizer(
            policy=policy,
            learning_rate=kwargs['learning_rate'],
            num_deltas=kwargs['num_deltas'],
            percentile=kwargs['percentile'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            model_sampler=model_sampler,
            env_sampler=env_sampler,
            ars_sample_processor=ars_sample_processor,
            dynamics_sample_processor=dynamics_sample_processor,
            num_deltas=kwargs['num_deltas'],
            n_itr=kwargs['n_itr'],
            log_real_performance=kwargs['log_real_performance'],
            steps_per_iter=kwargs['steps_per_iter'],
            delta_std=kwargs['delta_std'],
            sess=sess,
        )
        trainer.train()
def run_experiment(**kwargs):
    print()
    exp_dir = os.getcwd() + '/data/parallel_mb_ppo/' + EXP_NAME + '/' + kwargs.get('exp_name', '')
    print("\n---------- experiment with dir {} ---------------------------".format(exp_dir))
    logger.configure(dir=exp_dir, format_strs=['csv', 'stdout', 'log'], snapshot_mode='last')
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # Wrappers?
    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )
    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )

    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=False,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()

    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }
    worker_model_feed_dict = {}
    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'dynamics_model': dynamics_model,
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        },
    }
    trainer = ParallelTrainer(
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[worker_data_feed_dict, worker_model_feed_dict, worker_policy_feed_dict],
        n_itr=kwargs['n_itr'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        steps_per_iter=kwargs['steps_per_iter'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=kwargs['simulation_sleep'],
    )
    trainer.train()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--policy', type=str, default=None, help='policy to load')
    args = parser.parse_args(sys.argv[1:])
    sess = tf.InteractiveSession()
    policy = joblib.load(args.policy)['policy']
    policy.switch_to_pre_update()
    baseline = LinearFeatureBaseline()
    env = normalize(AntRandGoalEnv())
    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=BATCH_SIZE,
        meta_batch_size=META_BATCH_SIZE,
        max_path_length=PATH_LENGTH,
        parallel=True,
        envs_per_task=20,
    )
    sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=0.99,
        gae_lambda=1,
def run_experiment(**config):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(config, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    set_seed(config['seed'])
    config_sess = tf.ConfigProto()
    config_sess.gpu_options.allow_growth = True
    config_sess.gpu_options.per_process_gpu_memory_fraction = config.get('gpu_frac', 0.95)
    sess = tf.Session(config=config_sess)
    with sess.as_default() as sess:
        baseline = config['baseline']()
        # timeskip = config['timeskip']
        # log_rand = config['log_rand']
        # env = rl2env(normalize(config['env'](log_rand=log_rand)))  # timeskip=timeskip)))
        env = rl2env(normalize(HalfCheetahRandVelEnv()))
        obs_dim = np.prod(env.observation_space.shape) + np.prod(env.action_space.shape) + 1 + 1  # obs + act + rew + done
        policy = GaussianRNNPolicy(
            name="meta-policy",
            obs_dim=obs_dim,
            action_dim=np.prod(env.action_space.shape),
            meta_batch_size=config['meta_batch_size'],
            hidden_sizes=config['hidden_sizes'],
            cell_type=config['cell_type'],
        )
        sampler = MetaSampler(
            env=env,
            policy=policy,
            rollouts_per_meta_task=config['rollouts_per_meta_task'],
            meta_batch_size=config['meta_batch_size'],
            max_path_length=config['max_path_length'],
            parallel=config['parallel'],
            envs_per_task=1,
        )
        sample_processor = RL2SampleProcessor(
            baseline=baseline,
            discount=config['discount'],
            gae_lambda=config['gae_lambda'],
            normalize_adv=config['normalize_adv'],
            positive_adv=config['positive_adv'],
        )
        algo = PPO(
            policy=policy,
            learning_rate=config['learning_rate'],
            max_epochs=config['max_epochs'],
            backprop_steps=config['backprop_steps'],
        )
        trainer = Trainer(
            algo=algo,
            policy=policy,
            env=env,
            sampler=sampler,
            sample_processor=sample_processor,
            n_itr=config['n_itr'],
            sess=sess,
        )
        trainer.train()
parser.add_argument(
    '--stochastic',
    action='store_true',
    help='Apply stochastic actions instead of deterministic ones')
args = parser.parse_args()

# If the snapshot file uses tensorflow, do:
#   import tensorflow as tf
#   with tf.Session():
#       [rest of the code]
with tf.Session() as sess:
    pkl_path = args.param
    print("Testing policy %s" % pkl_path)
    data = joblib.load(pkl_path)
    policy = data['policy']
    env = normalize(ArmReacherEnv(side='right'))
    goal = data['env'].goal
    real_rewards = np.array([])
    act_rewards = np.array([])
    pos_rewards = np.array([])
    for i in range(args.num_rollouts):
        path = rollout(env, policy,
                       max_path_length=args.max_path_length,
                       animated=False,
                       speedup=args.speedup,
                       video_filename=args.video_filename,
                       save_video=False,
                       ignore_done=args.ignore_done,
parser.add_argument(
    '--stochastic',
    action='store_true',
    help='Apply stochastic actions instead of deterministic ones')
args = parser.parse_args()

# If the snapshot file uses tensorflow, do:
#   import tensorflow as tf
#   with tf.Session():
#       [rest of the code]
with tf.Session() as sess:
    pkl_path = args.param
    print("Testing policy %s" % pkl_path)
    data = joblib.load(pkl_path)
    policy = data['policy']
    env = normalize(ArmReacherEnv(side='right'))
    real_rewards = np.array([])
    act_rewards = np.array([])
    pos_rewards = np.array([])
    for _ in range(args.num_rollouts):
        path = rollout(env, policy,
                       max_path_length=args.max_path_length,
                       animated=False,
                       speedup=args.speedup,
                       video_filename=args.video_filename,
                       save_video=False,
                       ignore_done=args.ignore_done,
                       stochastic=args.stochastic)
def run_base(exp_dir, **kwargs):
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = kwargs.get('gpu_frac', 0.95)
    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    if kwargs['env'] == 'Ant':
        env = normalize(AntEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'HalfCheetah':
        env = normalize(HalfCheetahEnv())
        simulation_sleep = 0.05 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Hopper':
        env = normalize(HopperEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    elif kwargs['env'] == 'Walker2d':
        env = normalize(Walker2dEnv())
        simulation_sleep = 0.008 * kwargs['num_rollouts'] * kwargs['max_path_length'] * kwargs['simulation_sleep_frac']
    else:
        raise NotImplementedError
    policy = GaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )
    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
        rolling_average_persitency=kwargs['rolling_average_persitency'],
    )

    '''-------- dumps and reloads -----------------'''

    baseline_pickle = pickle.dumps(baseline)
    env_pickle = pickle.dumps(env)

    receiver, sender = Pipe()
    p = Process(
        target=init_vars,
        name="init_vars",
        args=(sender, config, policy, dynamics_model),
        daemon=True,
    )
    p.start()
    policy_pickle, dynamics_model_pickle = receiver.recv()
    receiver.close()

    '''-------- following classes depend on baseline, env, policy, dynamics_model -----------'''

    worker_data_feed_dict = {
        'env_sampler': {
            'num_rollouts': kwargs['num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'n_parallel': kwargs['n_parallel'],
        },
        'dynamics_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
    }
    worker_model_feed_dict = {}
    worker_policy_feed_dict = {
        'model_sampler': {
            'num_rollouts': kwargs['imagined_num_rollouts'],
            'max_path_length': kwargs['max_path_length'],
            'deterministic': kwargs['deterministic'],
        },
        'model_sample_processor': {
            'discount': kwargs['discount'],
            'gae_lambda': kwargs['gae_lambda'],
            'normalize_adv': kwargs['normalize_adv'],
            'positive_adv': kwargs['positive_adv'],
        },
        'algo': {
            'learning_rate': kwargs['learning_rate'],
            'clip_eps': kwargs['clip_eps'],
            'max_epochs': kwargs['num_ppo_steps'],
        },
    }
    trainer = ParallelTrainer(
        exp_dir=exp_dir,
        algo_str=kwargs['algo'],
        policy_pickle=policy_pickle,
        env_pickle=env_pickle,
        baseline_pickle=baseline_pickle,
        dynamics_model_pickle=dynamics_model_pickle,
        feed_dicts=[worker_data_feed_dict, worker_model_feed_dict, worker_policy_feed_dict],
        n_itr=kwargs['n_itr'],
        flags_need_query=kwargs['flags_need_query'],
        config=config,
        simulation_sleep=simulation_sleep,
        sampler_str=kwargs['sampler'],
    )
    trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    # Instantiate classes
    set_seed(kwargs['seed'])
    reward_baseline = LinearTimeBaseline()
    return_baseline = LinearFeatureBaseline()
    env = normalize(kwargs['env']())  # Wrappers?
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),  # Todo...?
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['hidden_sizes'],
        learn_std=kwargs['learn_std'],
        hidden_nonlinearity=kwargs['hidden_nonlinearity'],
        output_nonlinearity=kwargs['output_nonlinearity'],
    )
    # Load policy here
    sampler = MAMLSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
        envs_per_task=int(kwargs['rollouts_per_meta_task'] / 2),
    )
    sample_processor = DiceMAMLSampleProcessor(
        baseline=reward_baseline,
        max_path_length=kwargs['max_path_length'],
        discount=kwargs['discount'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
        return_baseline=return_baseline,
    )
    algo = VPG_DICEMAML(
        policy=policy,
        max_path_length=kwargs['max_path_length'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        inner_lr=kwargs['inner_lr'],
        learning_rate=kwargs['learning_rate'],
    )
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        sampler=sampler,
        sample_processor=sample_processor,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
    )
    trainer.train()
def run_experiment(**kwargs):
    exp_dir = os.getcwd() + '/data/' + EXP_NAME
    logger.configure(dir=exp_dir, format_strs=['stdout', 'log', 'csv'],
                     snapshot_mode='last_gap', snapshot_gap=50)
    json.dump(kwargs, open(exp_dir + '/params.json', 'w'), indent=2, sort_keys=True, cls=ClassEncoder)
    # Instantiate classes
    set_seed(kwargs['seed'])
    baseline = kwargs['baseline']()
    env = normalize(kwargs['env']())  # Wrappers?
    policy = MetaGaussianMLPPolicy(
        name="meta-policy",
        obs_dim=np.prod(env.observation_space.shape),
        action_dim=np.prod(env.action_space.shape),
        meta_batch_size=kwargs['meta_batch_size'],
        hidden_sizes=kwargs['policy_hidden_sizes'],
        learn_std=kwargs['policy_learn_std'],
        hidden_nonlinearity=kwargs['policy_hidden_nonlinearity'],
        output_nonlinearity=kwargs['policy_output_nonlinearity'],
    )
    dynamics_model = MLPDynamicsEnsemble(
        'dynamics-ensemble',
        env=env,
        num_models=kwargs['num_models'],
        hidden_nonlinearity=kwargs['dyanmics_hidden_nonlinearity'],
        hidden_sizes=kwargs['dynamics_hidden_sizes'],
        output_nonlinearity=kwargs['dyanmics_output_nonlinearity'],
        learning_rate=kwargs['dynamics_learning_rate'],
        batch_size=kwargs['dynamics_batch_size'],
        buffer_size=kwargs['dynamics_buffer_size'],
    )
    env_sampler = SingleMetaSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['real_env_rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        parallel=kwargs['parallel'],
    )
    model_sampler = MBMPOSampler(
        env=env,
        policy=policy,
        rollouts_per_meta_task=kwargs['rollouts_per_meta_task'],
        meta_batch_size=kwargs['meta_batch_size'],
        max_path_length=kwargs['max_path_length'],
        dynamics_model=dynamics_model,
    )
    dynamics_sample_processor = ModelSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )
    model_sample_processor = MAMLSampleProcessor(
        baseline=baseline,
        discount=kwargs['discount'],
        gae_lambda=kwargs['gae_lambda'],
        normalize_adv=kwargs['normalize_adv'],
        positive_adv=kwargs['positive_adv'],
    )
    algo = TRPOMAML(
        policy=policy,
        step_size=kwargs['step_size'],
        inner_type=kwargs['inner_type'],
        inner_lr=kwargs['inner_lr'],
        meta_batch_size=kwargs['meta_batch_size'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        exploration=kwargs['exploration'],
    )
    trainer = Trainer(
        algo=algo,
        policy=policy,
        env=env,
        model_sampler=model_sampler,
        env_sampler=env_sampler,
        model_sample_processor=model_sample_processor,
        dynamics_sample_processor=dynamics_sample_processor,
        dynamics_model=dynamics_model,
        n_itr=kwargs['n_itr'],
        num_inner_grad_steps=kwargs['num_inner_grad_steps'],
        dynamics_model_max_epochs=kwargs['dynamics_max_epochs'],
        log_real_performance=kwargs['log_real_performance'],
        meta_steps_per_iter=kwargs['meta_steps_per_iter'],
        initial_random_samples=True,
        sample_from_buffer=True,
    )
    trainer.train()