PPO.train_eval(
    root_dir=save_dir,
    random_seed=0,
    # Params for collect
    num_iterations=4000,
    train_batch_size=1000,
    replay_buffer_capacity=70000,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
    lr=1e-4,
    lr_schedule=None,
    num_policy_epochs=20,
    initial_adaptive_kl_beta=0.0,
    kl_cutoff_factor=0,
    importance_ratio_clipping=0.1,
    value_pred_loss_coef=0.005,
    # Params for log, eval, save
    eval_batch_size=600,
    eval_interval=100,
    save_interval=1000,
    checkpoint_interval=5000,
    summary_interval=100,
    # Params for environment
    simulate=simulate,
    horizon=1,
    clock_period=6,
    attention_step=1,
    train_episode_length=lambda x: 36 if x < 1000 else 64,
    eval_episode_length=64,
    reward_mode='pauli',
    encoding='hexagonal',
    quantum_circuit_type='v2',
    action_script='hexagonal_phase_estimation_symmetric_6round',
    to_learn=to_learn,
    # Policy and value networks
    ActorNet=actor_distribution_network_gkp.ActorDistributionNetworkGKP,
    actor_fc_layers=(100, 50),
    value_fc_layers=(100, 50),
    use_rnn=False,
    **kwargs)
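
# The call above assumes `save_dir`, `simulate`, `to_learn`, and the extra
# `kwargs` are bound beforehand. Below is a minimal, purely illustrative
# sketch of those bindings: the `to_learn` dict mirrors the explicit one in
# the universal-gate-set configuration further down, while the `save_dir`
# path, the `simulate` value, and the empty `kwargs` are placeholders, not
# values taken from the source.
import os

save_dir = os.path.join('results', 'hexagonal_phase_estimation')  # placeholder path

simulate = 'hexagonal_phase_estimation'  # placeholder; assumed to be a string selecting the
                                         # simulator, by analogy with the third configuration

to_learn = {'alpha': True, 'beta': True, 'phi': True}  # which action components the agent learns

kwargs = {}  # any additional overrides forwarded via **kwargs (contents unknown here)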
PPO.train_eval(
    root_dir=root_dir,
    random_seed=0,
    num_epochs=3000,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
    lr=1e-4,
    lr_schedule=None,
    num_policy_updates=20,
    initial_adaptive_kl_beta=0.0,
    kl_cutoff_factor=0,
    importance_ratio_clipping=0.1,
    value_pred_loss_coef=0.005,
    # Params for log, eval, save
    eval_interval=10,
    save_interval=100,
    checkpoint_interval=10000,
    summary_interval=10,
    # Params for data collection
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    collect_driver=collect_driver,
    eval_driver=eval_driver,
    replay_buffer_capacity=2000,
    # Policy and value networks
    ActorNet=actor_distribution_network_gkp.ActorDistributionNetworkGKP,
    actor_fc_layers=(),
    value_fc_layers=(),
    use_rnn=False,
    actor_lstm_size=(12,),
    value_lstm_size=(12,))
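
# Unlike the other two configurations, this variant passes pre-built data
# collection and evaluation drivers instead of environment parameters. It
# assumes `root_dir`, `train_batch_size`, `eval_batch_size`,
# `collect_driver`, and `eval_driver` already exist. The placeholders below
# only indicate the expected kinds of objects; the actual drivers are built
# from the project's environment/driver setup, which is not shown here.
root_dir = 'results/driver_based_example'  # placeholder output directory
train_batch_size = 1000                    # placeholder value
eval_batch_size = 100                      # placeholder value
collect_driver = None                      # placeholder for the training-data collection driver
eval_driver = None                         # placeholder for the evaluation driver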
PPO.train_eval(
    root_dir=root_dir,
    random_seed=0,
    # Params for collect
    num_iterations=100000,
    train_batch_size=100,
    replay_buffer_capacity=15000,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
    lr=3e-4,
    lr_schedule=None,
    num_policy_epochs=20,
    initial_adaptive_kl_beta=0.0,
    kl_cutoff_factor=0,
    importance_ratio_clipping=0.1,
    value_pred_loss_coef=0.005,
    # Params for log, eval, save
    eval_batch_size=1000,
    eval_interval=100,
    save_interval=500,
    checkpoint_interval=5000,
    summary_interval=100,
    # Params for environment
    simulate='Alec_universal_gate_set',
    horizon=1,
    clock_period=6,
    attention_step=1,
    train_episode_length=lambda x: 6,
    eval_episode_length=6,
    init_state='vac',
    reward_kwargs=reward_kwargs,
    encoding='square',
    action_script='Alec_universal_gate_set_6round',
    to_learn={'alpha': True, 'beta': True, 'phi': True},
    # Policy and value networks
    ActorNet=actor_distribution_network_gkp.ActorDistributionNetworkGKP,
    actor_fc_layers=(),
    value_fc_layers=(),
    use_rnn=True,
    actor_lstm_size=(12,),
    value_lstm_size=(12,),
    **kwargs)
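
# This configuration assumes `root_dir`, `reward_kwargs`, and `kwargs` are
# defined beforehand. The sketch below is purely illustrative: the key and
# value inside `reward_kwargs` are not taken from the source. The first
# configuration selects its reward with a flat reward_mode='pauli'
# argument, so this dict is assumed to carry an analogous reward
# specification for the gate-set task.
root_dir = 'results/universal_gate_set'      # placeholder output directory
reward_kwargs = {'reward_mode': 'fidelity'}  # hypothetical key and value, illustrative only
kwargs = {}                                  # additional overrides forwarded via **kwargs (contents unknown here)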