def experiment(log_dir, variant_overwrite, cpu=False):
    """Reload a trained run from ``log_dir`` and continue/evaluate training.

    Args:
        log_dir: Directory of the experiment to reload.
        variant_overwrite: Dict of variant keys overriding the stored ones.
        cpu: When False (default), enable GPU mode before doing anything else.
    """
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # Smoke check: the reloaded env's goal must match the stored goal prior.
    assert all([
        a == b
        for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])
    ])

    # Build a unique evaluation log directory under the original run.
    algo_kwargs = variant['algo_kwargs']
    shaped = ','.join(variant_overwrite['env_kwargs']['shaped_rewards'])
    nhp = variant['historical_policies_kwargs']['num_historical_policies']
    exp_id = create_exp_name(
        f"eval/ne{algo_kwargs['num_episodes']}"
        f"-mpl{algo_kwargs['max_path_length']}"
        f"-{shaped}-rs{algo_kwargs['reward_scale']}/nhp{nhp}"
    )
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Restore the trained networks from the snapshot.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        SMMHook(
            base_algorithm=algorithm,
            discriminator=data['discriminator'],
            density_model=data['density_model'],
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    """Train SAC from scratch with an optional intrinsic-reward hook.

    Builds env + networks from ``variant``, then (depending on
    ``variant['intrinsic_reward']``) wraps the algorithm with one of the
    SMM / ICM / count / pseudocount hooks before training.

    Args:
        variant: Experiment configuration dict (env_id, net_size,
            algo_kwargs, and per-intrinsic-reward kwargs sub-dicts).
    """
    intrinsic_reward = variant['intrinsic_reward']

    # Create environment. SMM augments the observation with a skill one-hot,
    # so only then is num_skills nonzero.
    # NOTE: use the already-bound `intrinsic_reward` instead of re-reading
    # variant['intrinsic_reward'] (was inconsistent in the original).
    num_skills = variant['smm_kwargs']['num_skills'] if intrinsic_reward == 'smm' else 0
    env, training_env = create_env(variant['env_id'], variant['env_kwargs'],
                                   num_skills)

    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Initialize networks.
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        hidden_sizes=[net_size, net_size],
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'])

    # Each hook constructor overwrites the appropriate functions of
    # `algorithm` as a side effect; the instances need not be kept
    # (unused `smm_algorithm_hook` / `count_algorithm_hook` locals removed).
    if intrinsic_reward == 'smm':
        discriminator = FlattenMlp(
            input_size=obs_dim - num_skills,
            hidden_sizes=[net_size, net_size],
            output_size=num_skills,
        )
        density_model = VAEDensity(input_size=obs_dim,
                                   num_skills=num_skills,
                                   code_dim=128,
                                   **variant['vae_density_kwargs'])
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])
    elif intrinsic_reward == 'icm':
        embedding_model = FlattenMlp(
            input_size=obs_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        forward_model = FlattenMlp(
            input_size=net_size + action_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        inverse_model = FlattenMlp(
            input_size=net_size + net_size,
            hidden_sizes=[],
            output_size=action_dim,
        )
        ICMHook(base_algorithm=algorithm,
                embedding_model=embedding_model,
                forward_model=forward_model,
                inverse_model=inverse_model,
                **variant['icm_kwargs'])
    elif intrinsic_reward == 'count':
        CountHook(base_algorithm=algorithm, **variant['count_kwargs'])
    elif intrinsic_reward == 'pseudocount':
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=0,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )
        PseudocountHook(base_algorithm=algorithm,
                        density_model=density_model,
                        **variant['pseudocount_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
def experiment(log_dir, variant_overwrite, cpu=False):
    """Reload a trained run, collect evaluation rollouts, and dump the
    observation trajectories to ``./outtem/out<i>.npy`` files.

    Args:
        log_dir: Directory of the experiment to reload.
        variant_overwrite: Dict of variant keys overriding the stored ones.
        cpu: When False (default), enable GPU mode before doing anything else.
    """
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # NOTE(review): the goal-consistency assert present in the sibling
    # experiment() was disabled here (its commented-out form was garbled).

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)

    # Evaluate only (training deliberately disabled in this variant).
    samples = algorithm.get_eval_paths()
    print(env.reset())
    print(samples[0]['observations'])

    # Dump each rollout's observations; make sure the output dir exists
    # (np.save would otherwise raise FileNotFoundError).
    os.makedirs('./outtem', exist_ok=True)
    for i, path in enumerate(samples):
        np.save('./outtem/out%i.npy' % i, path['observations'])

    # Re-sample with a short-horizon in-place sampler.
    from rlkit.samplers.in_place import InPlacePathSampler
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=algorithm.eval_policy,
        max_samples=100,
        max_path_length=50,
    )
    # BUG FIX: the original called algorithm.eval_sampler.obtain_samples(),
    # leaving the locally constructed sampler (with max_path_length=50)
    # completely unused. Use the local sampler, as clearly intended.
    path = eval_sampler.obtain_samples()
    print(path[0]['observations'])
def experiment(args):
    """Evaluate a trained SMM/SAC model specified by command-line ``args``.

    Rebuilds the stored experiment under ``args.logdir`` with
    evaluation-only overrides (no gradient updates, fixed test goal),
    optionally enables historical-policy averaging, and runs the
    episodic evaluation loop.
    """
    if not args.cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    overrides = dict(
        # Evaluate model on num_episodes.
        algo_kwargs=dict(
            reward_scale=args.reward_scale,
            collection_mode='episodic',
            num_episodes=args.num_episodes,
            max_path_length=args.max_path_length,
            render=args.render,
            # Evaluate without additional training
            num_updates_per_episode=0,
            min_num_steps_before_training=(
                args.max_path_length * args.num_episodes + 1),
        ),
        # Environment settings
        env_kwargs=dict(
            sample_goal=False,
            goal_prior=args.test_goal,
            shaped_rewards=[
                'object_off_table', 'object_goal_indicator',
                'object_gripper_indicator', 'action_penalty'
            ],
            terminate_upon_success=False,
            terminate_upon_failure=False,
        ),
        # SMM settings
        smm_kwargs=dict(
            # Posterior adaptation of latent skills p(z)
            update_p_z_prior_coeff=args.update_p_z_prior_coeff,
            # Turn off SMM reward.
            state_entropy_coeff=0,
            latent_entropy_coeff=0,
            latent_conditional_entropy_coeff=0,
            discriminator_lr=0,
        ),
    )

    # Load experiment from file.
    env, _, data, variant = load_experiment(args.logdir, overrides)
    # Smoke check: the env must actually be using the requested test goal.
    assert all([a == b for a, b in zip(env.sampled_goal, args.test_goal)])
    variant.update(test_goal=list(env.sampled_goal))

    if args.num_historical_policies > 0:
        variant.update(historical_policies_kwargs=dict(
            log_dir=args.logdir,
            num_historical_policies=args.num_historical_policies,
            sample_strategy=args.sample_strategy,
            on_policy_prob=args.on_policy_prob,
        ))

    # Build a unique evaluation log directory under the original run.
    shaped = ','.join(overrides['env_kwargs']['shaped_rewards'])
    run_name = create_exp_name(
        f"eval/ne{args.num_episodes}-mpl{args.max_path_length}"
        f"-{shaped}-rs{args.reward_scale}"
        f"/nhp{args.num_historical_policies}"
        f"-{args.sample_strategy}-opp{args.on_policy_prob}"
    )
    out_dir = os.path.join(args.logdir, run_name)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Restore the trained networks from the snapshot.
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=data['policy'],
        qf=data['qf'],
        vf=data['vf'],
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if 'smm_kwargs' in variant:
        SMMHook(
            base_algorithm=algorithm,
            discriminator=data['discriminator'],
            density_model=data['density_model'],
            **variant['smm_kwargs'],
        )

    # Overwrite algorithm for historical averaging.
    if args.num_historical_policies > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()