action_scale = {'beta': 3/8, 'phi': pi/8}
to_learn = {'beta': True, 'phi': True}

train_batch_size = 10
eval_batch_size = 1000
learn_residuals = True

train_episode_length = lambda x: env_kwargs['T']
eval_episode_length = lambda x: env_kwargs['T']

# Create drivers for data collection
from rl_tools.agents import dynamic_episode_driver_sim_env

collect_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, train_batch_size, action_script, action_scale,
    to_learn, train_episode_length, learn_residuals)

eval_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs_eval, eval_batch_size, action_script, action_scale,
    to_learn, eval_episode_length, learn_residuals)

PPO.train_eval(
    root_dir=root_dir,
    random_seed=4,
    num_epochs=300,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
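With learn_residuals = True, the agent's output is treated as a correction added on top of the scripted baseline action rather than a replacement for it. The following is a minimal sketch of that composition, assuming agent outputs lie in [-1, 1] and are scaled by action_scale; the names compose_action, script_action, and agent_output are hypothetical illustrations, not rl_tools' actual wrapper API.

from math import pi

action_scale = {'beta': 3/8, 'phi': pi/8}
to_learn = {'beta': True, 'phi': True}

def compose_action(script_action, agent_output):
    """Add a scaled agent residual to each scripted parameter marked as
    learnable; leave the other parameters at their scripted values."""
    composed = {}
    for name, base in script_action.items():
        if to_learn.get(name, False):
            # agent_output[name] is assumed to be in [-1, 1], so
            # action_scale bounds the largest residual the agent can apply.
            composed[name] = base + action_scale[name] * agent_output[name]
        else:
            composed[name] = base
    return composed

# Example: the agent nudges the scripted beta by at most 3/8.
print(compose_action({'beta': 1.0, 'phi': 0.0}, {'beta': 0.5, 'phi': -1.0}))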
    'stabilizer_translations': [sqrt(pi) + 0j, 2j * sqrt(pi)]
}

# Params for action wrapper
action_script = 'v1_phase_estimation_X_prep_4round'
action_scale = {'alpha': 1, 'beta': 1, 'phi': pi}
to_learn = {'alpha': True, 'beta': False, 'phi': True}

train_batch_size = 1000
eval_batch_size = 1000

# Create drivers for data collection
from rl_tools.agents import dynamic_episode_driver_sim_env

collect_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, train_batch_size, action_script, action_scale,
    to_learn)

eval_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, eval_batch_size, action_script, action_scale,
    to_learn)

PPO.train_eval(
    root_dir=root_dir,
    random_seed=0,
    num_epochs=10000,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
    lr=1e-4,
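In this run learn_residuals is not set, and the to_learn mask keeps beta on-script while the agent supplies alpha and phi directly. A minimal sketch of that direct mode, under the same assumptions as the residual sketch above (select_action, script_action, and agent_output are again hypothetical names):

from math import pi

action_scale = {'alpha': 1, 'beta': 1, 'phi': pi}
to_learn = {'alpha': True, 'beta': False, 'phi': True}

def select_action(script_action, agent_output):
    # Learnable parameters are taken from the scaled agent output;
    # parameters marked False keep their scripted values.
    return {name: action_scale[name] * agent_output[name]
            if to_learn[name] else base
            for name, base in script_action.items()}

# Example: alpha and phi come from the agent, beta stays on-script.
print(select_action({'alpha': 0.2, 'beta': 1.0, 'phi': 0.0},
                    {'alpha': 0.1, 'beta': 0.9, 'phi': 0.5}))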