def __init__(self, env_kwargs, reward_kwargs, batch_size, action_script,
             action_scale, to_learn, episode_length, learn_residuals=False,
             remote=False):
    """
    Args:
        env_kwargs (dict): optional parameters for training environment.
        reward_kwargs (dict): optional parameters for reward function.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will
            select actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action dimensions
            to scaling factors. Action wrapper will rescale actions produced
            by the agent's neural net policy by these factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions to
            bool flags. Specifies if the action should be learned or scripted.
        episode_length (callable: int -> int): function that defines the
            schedule for training episode durations. Takes as argument int
            epoch number and returns int episode duration for this epoch.
        learn_residuals (bool): flag to learn residual over the scripted
            protocol. If False, will learn actions from scratch. If True,
            will learn a residual to be added to the scripted protocol.
        remote (bool): flag for remote environment to close the connection
            to a client upon finishing the training.
    """
    self.episode_length = episode_length
    self.remote = remote

    # Create training env and wrap it
    env = env_init(batch_size=batch_size, reward_kwargs=reward_kwargs,
                   **env_kwargs)
    module_name = 'rl_tools.action_script.' + action_script
    action_script = importlib.import_module(module_name)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn,
                                 learn_residuals=learn_residuals)

    # create dummy placeholder policy to initialize parent class
    dummy_policy = PolicyPlaceholder(env.time_step_spec(), env.action_spec())
    super().__init__(env, dummy_policy, num_episodes=batch_size)
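# A hedged usage sketch (not from the source): how a driver exposing the
# constructor above might be instantiated. The class name 'PPODriver' and all
# keyword values below are illustrative assumptions; only the argument
# structure follows the docstring.
#
# def episode_length_schedule(epoch):
#     # grow episodes from 10 to 50 time steps over the first 200 epochs
#     return min(50, 10 + epoch // 5)
#
# driver = PPODriver(
#     env_kwargs=dict(control_circuit='snap_and_displacement', init='vac',
#                     T=5, N=100),
#     reward_kwargs=dict(reward_mode='fidelity'),
#     batch_size=1000,
#     action_script='snap_and_displacements',
#     action_scale={'alpha': 4, 'theta': pi},
#     to_learn={'alpha': True, 'theta': True},
#     episode_length=episode_length_schedule,
#     learn_residuals=False)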
def __init__(self, env_kwargs_list, rew_kwargs_list, batch_size, action_script,
             action_scale, to_learn, episode_length_list, env_schedule=None):
    """
    Args:
        env_kwargs_list (list[dict]): list of parameters for training
            environments.
        rew_kwargs_list (list[dict]): list of parameters for reward functions.
            Should correspond to 'env_kwargs_list'.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will
            select actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action dimensions
            to scaling factors. Action wrapper will rescale actions produced
            by the agent's neural net policy by these factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions to
            bool flags. Specifies if the action should be learned or scripted.
        episode_length_list (list[callable: int -> int]): list of schedule
            functions for episode durations. Schedule functions take as
            argument int epoch number and return int episode duration for
            this epoch. The list should correspond to 'env_kwargs_list'.
        env_schedule (callable): function mapping epoch number to the index
            of the environment from the list to use during this epoch.
    """
    self.env_list, self.driver_list = [], []
    self.episode_length_list = episode_length_list

    for env_kwargs, rew_kwargs in zip(env_kwargs_list, rew_kwargs_list):
        # Create training env and wrap it
        env = env_init(batch_size=batch_size, reward_kwargs=rew_kwargs,
                       **env_kwargs)
        action_script_m = action_scripts.__getattribute__(action_script)
        env = wrappers.ActionWrapper(env, action_script_m, action_scale,
                                     to_learn)

        # create dummy placeholder policy to initialize driver
        dummy_policy = PolicyPlaceholder(
            env.time_step_spec(), env.action_spec())

        # create driver for this environment
        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env, dummy_policy, num_episodes=batch_size)

        self.env_list.append(env)
        self.driver_list.append(driver)

    if env_schedule is None:
        # regularly switch between environments
        self.env_schedule = lambda epoch: epoch % len(self.env_list)
    else:
        self.env_schedule = env_schedule
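# A hedged usage sketch (not from the source): how the multi-environment
# driver above might be configured. The class name 'MultiEnvDriver' and all
# keyword values are illustrative assumptions; only the argument structure
# follows the docstring. This env_schedule dwells 3 consecutive epochs on each
# environment instead of the default round-robin switching.
#
# driver = MultiEnvDriver(
#     env_kwargs_list=[dict(control_circuit='oscillator', encoding='square'),
#                      dict(control_circuit='oscillator', encoding='hexagonal')],
#     rew_kwargs_list=[dict(reward_mode='fidelity')] * 2,
#     batch_size=500,
#     action_script='phase_estimation_symmetric_with_trim_4round',
#     action_scale={'beta': 1, 'phi': pi},
#     to_learn={'beta': True, 'phi': False},
#     episode_length_list=[lambda epoch: 30, lambda epoch: 60],
#     env_schedule=lambda epoch: (epoch // 3) % 2)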
# Define Kerr sweep range and Kerr-dependent parameters
Kerr = np.linspace(1, 51, 11)
t_gate = 1.2e-6 / np.sqrt(Kerr)  # assume gate time can scale as 1/chi
rotation_angle = 2 * np.pi * Kerr * (1.2e-6 + t_gate) * 20  # simple heuristic

states = ['X+', 'Y+', 'Z+']
lifetimes = {state: np.zeros(len(Kerr)) for state in states}

savepath = r'E:\VladGoogleDrive\Qulab\GKP\sims\Kerr\hexagonal_sweep\no_rotation_perfect_qubit'

# Initialize environment and policy
env = env_init(control_circuit='oscillator', encoding='hexagonal', init='X+',
               H=1, batch_size=2000, episode_length=200,
               reward_mode='fidelity', quantum_circuit_type='v2')

from rl_tools.action_script import hexagonal_phase_estimation_symmetric_6round as action_script
policy = plc.ScriptedPolicy(env.time_step_spec(), action_script)

for k in range(len(Kerr)):
    env = env_init(control_circuit='oscillator', encoding='hexagonal',
                   init='X+', H=1, batch_size=2000,
reward_kwargs = {'reward_mode': 'zero'}

# Params for environment
env_kwargs = {
    'control_circuit': 'snap_and_displacement',
    'init': 'vac',
    'T': min_T,
    'N': 100
}

# Params for action wrapper
action_script = 'snap_and_displacements'
action_scale = {'alpha': 4, 'theta': pi}
to_learn = {'alpha': True, 'theta': True}

env = env_init(batch_size=1, reward_kwargs=reward_kwargs, **env_kwargs,
               episode_length=env_kwargs['T'])
action_script_obj = importlib.import_module('rl_tools.action_script.' + action_script)
env = wrappers.ActionWrapper(env, action_script_obj, action_scale, to_learn)

action_names = list(to_learn.keys())
all_actions = {a: [] for a in action_names}

time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
max_alpha, max_n = 0, 0
while not time_step.is_last():
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
self.beta = [b_amp + 0j, 1j * b_amp] * 2 + [eps + 0j, 1j * eps]
self.alpha = [a_amp + 0j] + [-1j * delta, delta + 0j] * 2 + [-1j * a_amp]
self.phi = [pi / 2] * 6

#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
env = env_init(control_circuit='oscillator', init='Z+', H=1, batch_size=800,
               episode_length=60, reward_mode='fidelity',
               quantum_circuit_type='v2', encoding='square')

savepath = r'E:\VladGoogleDrive\Qulab\GKP\sims\osc_sims\test'
feedback_amps = np.linspace(0.15, 0.24, 10, dtype=complex)
trim_amps = np.linspace(0.15, 0.24, 10, dtype=complex)
states = ['Z+']
make_figure = False
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------

lifetimes = np.zeros((len(feedback_amps), len(trim_amps)))
data_dir = os.path.join(train_dir, 'K' + str(K), 'policy')
policy_dir = os.path.join(data_dir, '000080000')
policy = tf.compat.v2.saved_model.load(policy_dir)

# Additional simulation parameters
kwargs = {'K_osc': K,
          't_gate': 1.2e-6 / np.sqrt(np.sqrt(K)),
          'T1_osc': 250e-6}

if 'perfect' in names[i]:
    kwargs['control_circuit'] = 'oscillator'
else:
    kwargs['control_circuit'] = 'oscillator_qubit'
    T1_qb = int(names[i][:names[i].find('us')])
    kwargs['T1_qb'] = T1_qb * 1e-6

# Initialize environment
env = env_init(init='X+', H=1, T=6, attn_step=1, batch_size=3000,
               episode_length=200, reward_mode='fidelity',
               quantum_circuit_type='v2', encoding='hexagonal', **kwargs)
env = wrappers.ActionWrapper(env, action_script, to_learn)

# Fit logical lifetime
fit_params = hf.fit_logical_lifetime(env, policy, plot=False, reps=1,
                                     states=['X+'], save_dir=data_dir)
T1[names[i]].append(fit_params['X+'][1] * 1e6)  # convert to us

# Plot things
fig, ax = plt.subplots(1, 1, figsize=(7, 4))
ax.set_title(r'Hexagonal code, $t_{gate}\propto 1\,/\,\sqrt[4]{Kerr}$')
ax.set_ylabel(r'Logical lifetime ($\,\mu s\, $)')
ax.set_xlabel('Kerr (Hz)')

for i in range(len(names)):
    color = palette(i//2)
# Evaluation environment params
eval_env_kwargs = {
    'control_circuit': 'ECD_control_remote',
    'init': 'vac',
    'T': 11,
    'N': 100
}

# Create a target state with a quick simulation of the ECDC sequence
from rl_tools.tf_env import env_init
from rl_tools.tf_env import policy as plc

env = env_init(control_circuit='ECD_control',
               reward_kwargs=dict(reward_mode='zero'),
               init='vac', T=env_kwargs['T'], batch_size=1, N=100,
               episode_length=env_kwargs['T'])

from rl_tools.action_script import ECD_control_residuals_GKP_plusZ as action_script
policy = plc.ScriptedPolicy(env.time_step_spec(), action_script)

time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
while not time_step.is_last()[0]:
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
    time_step = env.step(action_step.action)
target_state = env.info['psi_cached']
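# A hedged sketch (assumption, not from this script): the cached target state
# above could be fed back into an overlap-style reward for the evaluation
# environment, mirroring the 'overlap' reward_kwargs pattern that appears
# elsewhere in this repo. The exact keys ('target_state', 'postselect_0') and
# this evaluation call are illustrative assumptions.
# eval_reward_kwargs = dict(reward_mode='overlap',
#                           target_state=target_state,
#                           postselect_0=False)
# eval_env = env_init(batch_size=1, reward_kwargs=eval_reward_kwargs,
#                     episode_length=eval_env_kwargs['T'], **eval_env_kwargs)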
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = 'true' os.environ["CUDA_VISIBLE_DEVICES"] = "0" import numpy as np import matplotlib.pyplot as plt from rl_tools.tf_env import policy as plc from rl_tools.tf_env import env_init from time import time import tensorflow as tf from math import sqrt, pi # initialize environment and policy env = env_init(control_circuit='oscillator', init='Z+', H=1, batch_size=2000, episode_length=30, reward_mode='fidelity', quantum_circuit_type='v2') from rl_tools.action_script import phase_estimation_symmetric_with_trim_4round as action_script policy = plc.ScriptedPolicy(env.time_step_spec(), action_script) # collect trajectories all_obs = [] reps = 5 # serialize if batch size is small due to memory issues for i in range(reps): time_step = env.reset() policy_state = policy.get_initial_state(env.batch_size) counter = 0
mpl.rcParams['lines.markersize'] = markersize
mpl.rcParams['lines.markeredgewidth'] = linewidth / 2
mpl.rcParams['legend.markerscale'] = 2.0
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
### Initialize the environment and simulation/training parameters
N = 40
env = env_init(control_circuit='snap_and_displacement',
               channel='quantum_jumps', init='vac', H=1, T=3, attn_step=1,
               batch_size=1, N=N, episode_length=3, phase_space_rep='wigner')

action_script = 'snap_and_displacements'
action_scale = {'alpha': 4, 'theta': pi}
to_learn = {'alpha': True, 'theta': True}

action_script = action_scripts.__getattribute__(action_script)
env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

root_dir = {
    'bin0': r'E:\data\gkp_sims\PPO\examples\bin0_state_prep_lr3e-4',
    'bin1': r'E:\data\gkp_sims\PPO\examples\bin1_state_prep_lr3e-4'
    'alpha': [delta + 0j, -1j * delta],
    'beta': [b_amp + 0j, 1j * b_amp],
    'phi': [pi / 2] * 2,
    'theta': [0.0] * 2
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
env = env_init(control_circuit='oscillator', init='Z+', H=1, batch_size=1000,
               episode_length=200, reward_mode='fidelity',
               channel='diffusion', quantum_circuit_type='v2',
               encoding='square', N=200)

savepath = r'E:\VladGoogleDrive\Qulab\GKP\sims\diffusion_channel'
params = [0j] + list(np.linspace(0.0, 0.5, 11, dtype=complex))
make_figure = True
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------

lifetimes = np.zeros(len(params))
from time import time
from tensorflow.keras.backend import batch_dot
from math import sqrt, pi
import tensorflow as tf
from scipy.optimize import curve_fit
from rl_tools.tf_env import helper_functions as hf
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# initialize environment and policy
env = env_init(control_circuit='oscillator', init='X+', H=1, batch_size=6000,
               episode_length=31, reward_mode='zero',
               quantum_circuit_type='v2', encoding='square')

from rl_tools.action_script import phase_estimation_symmetric_with_trim_4round as action_script
policy = plc.ScriptedPolicy(env.time_step_spec(), action_script)

translations = np.linspace(-sqrt(pi), sqrt(pi), 100)
T = env.episode_length
R = np.zeros([len(translations), T, T])  # correlation matrix (empty)

for k, a in enumerate(translations):
    # collect trajectories
    time_step = env.reset()
from rl_tools.tf_env import tf_env_wrappers as wrappers
from rl_tools.tf_env import env_init
from rl_tools.tf_env import policy as plc
import rl_tools.action_script as action_scripts
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
### Initialize env and policy
env = env_init(control_circuit='gkp_qec_autonomous_sBs_osc_qb',
               reward_kwargs={'reward_mode': 'fidelity',
                              'code_flips': True},
               init='X+', H=1, T=2, attn_step=1, batch_size=100,
               episode_length=12, encoding='square')

action_script = 'gkp_qec_autonomous_sBs_2round'
action_scale = {'beta': 1, 'phi': pi, 'eps1': 1, 'eps2': 1}
to_learn = {'beta': True, 'phi': False, 'eps1': True, 'eps2': True}

action_script = action_scripts.__getattribute__(action_script)
env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

root_dir = r'E:\data\gkp_sims\PPO\examples\gkp_qec_autonomous_sBs'
policy_dir = r'policy\001100'
policy = tf.compat.v2.saved_model.load(os.path.join(root_dir, policy_dir))
@author: Vladimir Sivak
"""
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = 'true'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# append parent 'gkp-rl' directory to path
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

import qutip as qt
from rl_tools.tf_env import env_init
from rl_tools.remote_env_tools.remote_env_tools import Client

# Create environment that will produce mock measurement outcomes
env = env_init(control_circuit='ECD_control',
               reward_kwargs={'reward_mode': 'zero'},
               init='vac', T=8, batch_size=10, N=100, episode_length=8)

# connect to the agent
client_socket = Client()
(host, port) = '172.28.142.46', 5555
client_socket.connect((host, port))

# training loop
done = False
while not done:
    # receive action data from the agent
    message, done = client_socket.recv_data()
    if done:
        break
    action_batch = message['action_batch']
    mini_buffer = message['mini_buffer']
    N_msmt = message['N_msmt']
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from time import time
from math import pi
from rl_tools.tf_env import tf_env_wrappers as wrappers
from rl_tools.tf_env import policy as plc
from rl_tools.tf_env import env_init
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
env = env_init(control_circuit='gkp_qec_autonomous_sBs_osc_qb',
               reward_kwargs={'reward_mode': 'zero'},
               init='vac', H=1, T=2, attn_step=1, batch_size=2000,
               episode_length=60, encoding='square')

from rl_tools.action_script import gkp_qec_autonomous_sBs_2round as action_script
policy = plc.ScriptedPolicy(env.time_step_spec(), action_script)
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
# # What operators to measure
# names = [r'Re($S_1$)', r'Re($S_2$)',
#          r'Im($S_1$)', r'Im($S_2$)']
# # Translation amplitudes
# stabilizers = [np.sqrt(pi), 2j*np.sqrt(pi)]*2
# # Qubit measurement angles
# angles = [0]*2 + [-pi/2]*2
import os
import numpy as np
import tensorflow as tf
from math import pi, sqrt
import matplotlib.pyplot as plt
from rl_tools.tf_env import policy as plc
from rl_tools.tf_env import helper_functions as hf
from rl_tools.tf_env import tf_env_wrappers as wrappers
from rl_tools.tf_env import env_init
from simulator.utils import expectation
import rl_tools.action_script as action_scripts
import importlib

if 1:
    env = env_init(control_circuit='snap_and_displacement',
                   encoding='gkp_square',
                   reward_kwargs={'reward_mode': 'zero'},
                   init='Z+', T=4, batch_size=1, N=150, episode_length=4)

    action_script = 'snap_and_displacements'
    action_scale = {'alpha': 4, 'theta': pi}
    to_learn = {'alpha': True, 'theta': True}

    module_name = 'rl_tools.action_script.' + action_script
    action_script = importlib.import_module(module_name)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

    root_dir = r'E:\data\gkp_sims\PPO\paper_data\gates\test\seed2'
    policy_dir = r'policy\004000'
    policy = tf.compat.v2.saved_model.load(os.path.join(root_dir, policy_dir))
action_script = 'snap_and_displacements'
action_scale = {'alpha': 4, 'theta': pi}
to_learn = {'alpha': True, 'theta': True}
action_script = action_scripts.__getattribute__(action_script)

protocol = 'ideal'
max_epochs = 3000
gate_times = [0.4e-6, 3.4e-6]
seeds = ['seed2']

rewards = {t: {} for t in gate_times}
norms = {t: {} for t in gate_times}

for t in gate_times:
    env = env_init(**env_kwargs, reward_kwargs=reward_kwargs)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)
    env._env.SNAP_miscalibrated.T = t
    env._env.bit_string = None  # '00000'

    # collect episodes with different policies
    for sim_name in seeds:  # os.listdir(root_dir[protocol]):
        print(sim_name)
        rewards[t][sim_name] = []
        norms[t][sim_name] = []
        sim_dir = os.path.join(root_dir[protocol], sim_name)

        for policy_name in os.listdir(os.path.join(sim_dir, 'policy')):
            if int(policy_name) > max_epochs:
                break
            policy_dir = os.path.join(sim_dir, 'policy', policy_name)
            policy = tf.compat.v2.saved_model.load(policy_dir)
avg_ideal_stabilizer, delta_effective = {}, {}

for Delta in deltas:
    # from rl_tools.tf_env import helper_functions as hf
    # target_state = hf.GKP_1D_state(False, 200, Delta*sqrt(2))
    # reward_kwargs = {'reward_mode' : 'overlap',
    #                  'target_state' : target_state,
    #                  'postselect_0' : False}

    reward_kwargs = {'reward_mode': 'stabilizers_v2',
                     'Delta': 0.0,
                     'beta': sqrt(pi),
                     'sample': False}

    env = env_init(control_circuit='snap_and_displacement',
                   reward_kwargs=reward_kwargs,
                   init='vac', T=9, batch_size=1, N=200, episode_length=9)

    action_script = 'snap_and_displacements'
    action_scale = {'alpha': 6, 'theta': pi}
    to_learn = {'alpha': True, 'theta': True}

    module_name = 'rl_tools.action_script.' + action_script
    action_script = importlib.import_module(module_name)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

    delta_dir = os.path.join(root_dir, 'delta' + str(Delta))
    seed_dir = os.path.join(delta_dir, best_seed[Delta])
    policy_dir = r'policy\010000'
    policy = tf.compat.v2.saved_model.load(os.path.join(seed_dir, policy_dir))
lifetimes = np.zeros(len(params))
returns = np.zeros(len(params))

gfig, gax = plt.subplots(1, 1, dpi=300, figsize=(10, 6))
gax.set_title(r'Reward curves')
gax.set_ylabel(r'Reward')
gax.set_xlabel('Time')

for j in range(len(params)):
    t = time()
    env = env_init(control_circuit='oscillator_qubit', init='Z+', H=1,
                   batch_size=2500, episode_length=200,
                   reward_mode='fidelity', quantum_circuit_type='v2',
                   encoding='hexagonal', t_feedback=params[j])

    action_script = ActionScript()
    policy = plc.ScriptedPolicy(env.time_step_spec(), action_script)

    for state in states:
        if '_env' in env.__dir__():
            env._env.init = state
        else:
            env.init = state

        # Collect batch of episodes
        time_step = env.reset()