def get_policy(args, env):
    N = env.observation_space.shape[0]
    M = env.action_space.shape[0]
    if args.init_policy == 'optimal':
        K = env.optimal_controller()
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=False)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'linear_bias':
        K = np.random.randn(M, N)
        mean_network = nn.Linear(*K.shape[::-1], bias=True)
        mean_network.weight.data = tensor(K)
    elif args.init_policy == 'mlp':
        mean_network = get_mlp((N,) + tuple(args.hidden_sizes) + (M,), gate=nn.Tanh)
    else:
        raise Exception('unsupported policy type')
    return GaussianPolicy(N, M, mean_network, learn_std=not args.fix_std,
                          gate_output=args.gate_output)
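# Hypothetical usage sketch for get_policy above. The attribute names on `args`
# (init_policy, hidden_sizes, fix_std, gate_output) are the ones read inside the
# function; the Namespace construction and concrete values are illustrative only,
# not taken from the source.
from argparse import Namespace

args = Namespace(init_policy='mlp', hidden_sizes=(64, 64), fix_std=False, gate_output=False)
policy = get_policy(args, env)  # assumes `env` exposes Box observation/action spaces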
def zero_transition(t: Transition):
    result = Transition(
        state=tensor(torch.zeros(t['state'].shape)),
        policy=tensor(torch.ones(t['policy'].shape) / t['policy'].shape[0]),
        action=tensor(0).long(),
        reward=tensor(0.),
        new_state=tensor(torch.zeros(t['new_state'].shape)))
    return result
def min_noise_D_unfaithful(self, D):
    noise = cp.Variable(1)
    iden = np.eye(self.dim)
    state = self.make_state(noise)
    Na = utils._make_cp_matrix((self.dim, self.dim), False)
    Nb = utils._make_cp_matrix((self.dim, self.dim), False)
    constraints = [noise >= 0, noise <= 1, Na >> 0, Nb >> 0]
    constraints += [utils.tensor(Na, iden) + utils.tensor(iden, Nb) >> state]
    constraints += [cp.trace(Na) * iden >> (D - 1) * Na]
    constraints += [cp.trace(Nb) * iden >> (D - 1) * Nb]
    constraints += [cp.trace(Na) + cp.trace(Nb) == D - 1]
    pr = cp.Problem(cp.Minimize(noise), constraints)
    pr.solve(solver=cp.MOSEK)
    return noise.value[0]
def __call__(self, *args, **kwargs):
    this_op = self.compute(*args, **kwargs)
    if self.tensor_with is not None:
        ops = [T if T is not None else this_op for T in self.tensor_with]
        return tensor(ops)
    else:
        return this_op
def create_batch(batch, n):
    train_in = tensor(P.cuda_device, n, *P.image_input_size)
    labels_in = tensor_t(torch.LongTensor, P.cuda_device, n)
    for j, (im, lab, _) in enumerate(batch):
        train_in[j] = trans(im)
        labels_in[j] = labels.index(lab)
    return [train_in], [labels_in]
def get_embeddings(net, dataset, device, out_size):
    trans = P.test_trans
    if P.test_pre_proc:
        trans = transforms.Compose([])

    if not P.embeddings_classify:
        # remove classifier and add back later
        classifier = net.classifier
        net.classifier = nn.Sequential()

    def batch(last, i, is_final, batch):
        embeddings = last
        n = len(batch)
        test_in = tensor(P.cuda_device, n, *P.image_input_size)
        for j, (testIm, _, _) in enumerate(batch):
            test_in[j] = trans(testIm)
        out = net(Variable(test_in, volatile=True))
        # we have the classification values. just normalize
        out = NormalizeL2Fun()(out)
        out = out.data
        for j in range(n):
            embeddings[i + j] = out[j]
        return embeddings

    init = tensor(device, len(dataset), out_size)
    embeddings = fold_batches(batch, init, dataset, P.test_batch_size)
    if not P.embeddings_classify:
        net.classifier = classifier
    return embeddings
def get_embeddings(net, dataset, device, out_size):
    test_trans = P.test_trans
    if P.test_pre_proc:
        test_trans = transforms.Compose([])

    def batch(last, i, is_final, batch):
        embeddings = last
        im_trans = test_trans(batch[0][0])
        test_in = move_device(im_trans.unsqueeze(0), P.cuda_device)
        out = net(Variable(test_in, volatile=True))[0].data
        # first, determine location of highest maximal activation
        max_pred, _ = out.max(1)
        max_pred1, max_i1 = max_pred.max(2)
        _, max_i2 = max_pred1.max(3)
        i2 = max_i2.view(-1)[0]
        i1 = max_i1.view(-1)[i2]
        # we have the indexes of the highest maximal activation,
        # get the classification values at this point and normalize
        out = out[:, :, i1, i2]
        out = NormalizeL2Fun()(Variable(out, volatile=True))
        out = out.data
        embeddings[i] = out[0]
        return embeddings

    init = tensor(device, len(dataset), out_size)
    return fold_batches(batch, init, dataset, 1)
def create_batch(batch, n, epoch, similarities):
    # one image at a time. batch is always of size 1
    train_in1 = tensor(P.cuda_device, n, *P.image_input_size)
    train_in2 = tensor(P.cuda_device, n, *P.image_input_size)
    train_in3 = tensor(P.cuda_device, n, *P.image_input_size)
    labels_in = tensor_t(torch.LongTensor, P.cuda_device, n)
    # we get positive couples. find negatives for them
    for j, (lab, (i1, i2), (im1, im2)) in enumerate(batch):
        im3 = None
        # choose a semi-hard negative. see FaceNet
        # paper by Schroff et al for details.
        # essentially, choose hardest negative that is still
        # easier than the positive. this should avoid
        # collapsing the model at beginning of training
        ind_exl = lab_indicators[lab]
        sim_pos = similarities[i1, i2]
        if epoch < P.train_epoch_switch:
            # exclude all positives as well as any that are
            # more similar than sim_pos
            ind_exl = ind_exl | similarities[i1].ge(sim_pos)
        if ind_exl.sum() >= similarities.size(0):
            p = 'cant find semi-hard neg for'
            s = 'falling back to random neg'
            n_pos = lab_indicators[lab].sum()
            n_ge = similarities[i1].ge(sim_pos).sum()
            n_tot = similarities.size(0)
            print('{0} {1}-{2}-{3} (#pos:{4}, #ge:{5}, #total:{6}), {7}'.
                  format(p, i1, i2, lab, n_pos, n_ge, n_tot, s))
        else:
            # similarities must be in [-1, 1]
            # set all similarities of excluded indexes to -2
            # then take argmax (highest similarity not excluded)
            sims = similarities[i1].clone()
            sims[ind_exl] = -2
            _, k = sims.max(0)
            im3 = train_set[k[0]][0]
        if im3 is None:
            # default to random negative
            im3 = choose_rand_neg(train_set, lab)
        # one image at a time
        train_in1[j] = train_trans(im1)
        train_in2[j] = train_trans(im2)
        train_in3[j] = train_trans(im3)
        labels_in[j] = labels.index(lab)
    # return input tensors and labels
    return [train_in1, train_in2, train_in3], [labels_in]
def __init__(self, N, batch_shape, echo_pulse=True,
             name='conditional_displacement'):
    super().__init__(name=name)
    self.displace = ops.DisplacementOperator(
        N, tensor_with=[ops.identity(2), None])
    self.P = {
        i: utils.tensor([ops.projector(i, 2), ops.identity(N)])
        for i in [0, 1]
    }
    self.batch_shape = batch_shape
    self.qubit_op = utils.tensor([ops.sigma_x(), ops.identity(N)]) \
        if echo_pulse else ops.identity(2 * N)
def batch(last, i, is_final, batch):
    embeddings = last
    test_in = tensor(P.cuda_device, len(batch), *P.image_input_size)
    for j, (im, _, _) in enumerate(batch):
        test_in[j] = test_trans(im)
    out = net(Variable(test_in, volatile=True)).data
    for j, embedding in enumerate(out):
        embeddings[i + j] = embedding
    return embeddings
def batch(last, i, is_final, batch):
    embeddings = last
    n = len(batch)
    test_in = tensor(P.cuda_device, n, *P.image_input_size)
    for j, (testIm, _, _) in enumerate(batch):
        test_in[j] = trans(testIm)
    out = net(Variable(test_in, volatile=True))
    # we have the classification values. just normalize
    out = NormalizeL2Fun()(out)
    out = out.data
    for j in range(n):
        embeddings[i + j] = out[j]
    return embeddings
def __init__(
        self,
        state_dim,
        action_dim,
        mean_network,
        learn_std=True,
        gate_output=False,
):
    super().__init__()
    self._mean = mean_network
    if learn_std:
        self._std = nn.Parameter(torch.zeros(action_dim))
    else:
        self._std = tensor(np.ones(action_dim))
    self.gate_output = gate_output
    self.learn_std = learn_std
    self.to(Config.DEVICE)
def eval_batch_test(last, i, is_final, batch):
    correct, total = last
    n = len(batch)
    test_in = tensor(P.cuda_device, n, *P.image_input_size)
    for j, (testIm, _, _) in enumerate(batch):
        test_in[j] = trans(testIm)
    out = net(Variable(test_in, volatile=True)).data
    # first get all maximal values for classification
    # then, use the spatial region with the highest maximal value
    # to make a prediction
    _, predicted = torch.max(out, 1)
    total += n
    correct += sum(
        labels.index(testLabel) == predicted[j][0]
        for j, (_, testLabel, _) in enumerate(batch))
    return correct, total
def get_embeddings(net, dataset, device, out_size):
    test_trans = P.test_trans
    if P.test_pre_proc:
        test_trans = transforms.Compose([])

    def batch(last, i, is_final, batch):
        embeddings = last
        test_in = tensor(P.cuda_device, len(batch), *P.image_input_size)
        for j, (im, _, _) in enumerate(batch):
            test_in[j] = test_trans(im)
        out = net(Variable(test_in, volatile=True)).data
        for j, embedding in enumerate(out):
            embeddings[i + j] = embedding
        return embeddings

    init = tensor(device, len(dataset), out_size)
    return fold_batches(batch, init, dataset, P.test_batch_size)
def get_embeddings(net, dataset, device, out_size):
    test_trans = P.test_trans
    if P.test_pre_proc:
        test_trans = transforms.Compose([])

    def batch(last, i, is_final, batch):
        embeddings = last
        # one image at a time
        test_in = move_device(
            test_trans(batch[0][0]).unsqueeze(0), P.cuda_device)
        out = net(Variable(test_in, volatile=True)).data
        embeddings[i] = out[0]
        return embeddings

    init = tensor(device, len(dataset), out_size)
    return fold_batches(batch, init, dataset, 1)
def min_noise_to_have_rank_D_or_less(self, D):
    # Saves some memory by not redefining a cvxpy state here,
    # and rewriting the constraints
    dT = self.dim**2 * D**2
    iden = np.eye(self.dim)
    proj = utils.tensor(iden, np.sqrt(D) * utils.max_entangled_ket(D), iden)
    noise = cp.Variable(1)
    state = self.make_state(noise)
    sigma = utils._make_cp_matrix((dT, dT), False)
    constraints = [noise >= 0, noise <= 1]
    constraints += [sigma >> 0, cp.trace(sigma) == D]
    constraints += [proj @ sigma @ proj.T == state]
    constraints += [utils.partial_transpose(sigma, [0], [self.dim * D, self.dim * D]) >> 0]
    pr = cp.Problem(cp.Minimize(noise), constraints)
    pr.solve(solver=cp.MOSEK)
    return noise.value[0]
observation = env.reset()
buff = []
done = False
score = 0
enemy_score = 0
while not done:
    env.render()
    if len(buff) < 4:
        observation, reward, done, info = env.step(env.action_space.sample())
        buff.append(observation)
        continue
    x = tensor(preprocess(buff), args.device)[None]
    action = int(torch.argmax(model(x).detach().cpu()))
    observation, reward, done, info = env.step(action)
    buff.pop(0)
    buff.append(observation)
    if reward > 0:
        score += reward
    else:
        enemy_score -= reward
print('Enemy Score - {}, Our Score - {}'.format(enemy_score, score))
time.sleep(10)
def _step(self):
    config = self.config
    if self.state is None:
        self.random_process.reset_states()
        self.state = self.task.reset()
        self.state = config.state_normalizer(self.state)
    if self.total_steps < config.warm_up:
        action = to_np(self.task.action_space.sample())
    else:
        action = self.network(self.state)
        action = to_np(action)
        action += self.random_process.sample()
    action = np.clip(action, int(self.task.action_space.low),
                     int(self.task.action_space.high))
    next_state, reward, done, info = self.task.step(action)
    next_state = self.config.state_normalizer(next_state)
    reward = norm_reward = self.config.reward_normalizer(reward)
    experiences = list(
        zip(self.state, action, norm_reward, next_state, done))
    self.replay.feed_batch(experiences)
    if done[0]:
        self.random_process.reset_states()
    self.state = next_state
    self.total_steps += 1

    if self.replay.size() >= config.warm_up:
        experiences = self.replay.sample()
        states, actions, rewards, next_states, terminals = experiences
        states = tensor(states)
        actions = tensor(actions)
        rewards = tensor(rewards).unsqueeze(-1)
        next_states = tensor(next_states)
        mask = tensor(1 - terminals).unsqueeze(-1)

        phi_next = self.target_network.feature(next_states)
        a_next = self.target_network.actor(phi_next)
        q_next = self.target_network.critic(phi_next, a_next)
        q_next = self.config.discount * mask * q_next
        q_next.add_(rewards)
        q_next = q_next.detach()
        phi = self.network.feature(states)
        q = self.network.critic(phi, actions)
        critic_loss = (q - q_next).pow(2).mul(0.5).sum(-1).mean()
        self.network.zero_grad()
        critic_loss.backward()
        self.network.critic_opt.step()

        phi = self.network.feature(states)
        action = self.network.actor(phi)
        policy_loss = -self.network.critic(phi.detach(), action).mean()
        self.network.zero_grad()
        policy_loss.backward()
        self.network.actor_opt.step()

        self.soft_update(self.target_network, self.network)
    return reward, not all(done)
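# The DDPG update above ends with `self.soft_update(self.target_network, self.network)`,
# whose body is not shown in this snippet. A minimal sketch of the Polyak averaging such
# a method typically performs (the `tau` mixing rate below is an assumed value, not from
# the source):
def soft_update(target_network, source_network, tau=5e-3):
    for target_param, source_param in zip(target_network.parameters(),
                                          source_network.parameters()):
        # target <- (1 - tau) * target + tau * source
        target_param.data.mul_(1.0 - tau).add_(source_param.data, alpha=tau)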
def forward(self, obs, noise):
    obs = tensor(obs)
    action = self.mean(obs) + tensor(noise) * self.std
    return action.cpu().detach().numpy()
def distribution(self, obs):
    obs = tensor(obs)
    dist = torch.distributions.Normal(self.mean(obs), self.std)
    return dist
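# Self-contained sketch of how the Gaussian-policy pieces above fit together:
# a linear mean network plus a (fixed) std wrapped in torch.distributions.Normal,
# which is what `distribution` returns. All names below are illustrative, not
# taken from the original class.
import torch
import torch.nn as nn

state_dim, action_dim = 4, 2
mean_network = nn.Linear(state_dim, action_dim, bias=False)
std = torch.ones(action_dim)

obs = torch.randn(state_dim)
dist = torch.distributions.Normal(mean_network(obs), std)
action = dist.sample()                    # one stochastic action
log_prob = dist.log_prob(action).sum(-1)  # joint log-prob over action dimensions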
def train_rollout(self, total_step):
    storage = Storage(self.episode_C['rollout_length'])
    state = self.env._copy_state(*self.state)
    step_times = []
    # Sync.
    self.gnn.load_state_dict(self.shared_gnn.state_dict())
    for rollout_step in range(self.episode_C['rollout_length']):
        start_step_time = time.time()
        prediction = self.env.propagate(self.gnn, [state])
        action = prediction['a'].cpu().numpy()[0]
        next_state, reward, done, achieved_goal = self.env.step(action, self.ep_step, state)
        self.ep_step += 1
        if done:
            # Sync local model with shared model at start of each ep
            self.gnn.load_state_dict(self.shared_gnn.state_dict())
            self.ep_step = 0
        storage.add(prediction)
        storage.add({'r': tensor(reward, self.device).unsqueeze(-1).unsqueeze(-1),
                     'm': tensor(1 - done, self.device).unsqueeze(-1).unsqueeze(-1),
                     's': state})
        state = self.env._copy_state(*next_state)
        total_step += 1
        end_step_time = time.time()
        step_times.append(end_step_time - start_step_time)

    self.state = self.env._copy_state(*state)
    prediction = self.env.propagate(self.gnn, [state])
    storage.add(prediction)
    storage.placeholder()

    advantages = tensor(np.zeros((1, 1)), self.device)
    returns = prediction['v'].detach()
    for i in reversed(range(self.episode_C['rollout_length'])):
        # Disc. return
        returns = storage.r[i] + self.agent_C['discount'] * storage.m[i] * returns
        # GAE
        td_error = storage.r[i] + self.agent_C['discount'] * storage.m[i] * storage.v[i + 1] - storage.v[i]
        advantages = advantages * self.agent_C['gae_tau'] * self.agent_C['discount'] * storage.m[i] + td_error
        storage.adv[i] = advantages.detach()
        storage.ret[i] = returns.detach()
    # print(returns.shape, td_error.shape, advantages.shape, storage.adv[-1].shape, storage.ret[-1].shape)

    actions, log_probs_old, returns, advantages = storage.cat(['a', 'log_pi_a', 'ret', 'adv'])
    states = [storage.s[i] for i in range(storage.size)]
    actions = actions.detach()
    log_probs_old = log_probs_old.detach()
    advantages = (advantages - advantages.mean()) / advantages.std()

    # Train
    self.gnn.train()
    batch_times = []
    train_pred_times = []
    for _ in range(self.agent_C['optimization_epochs']):
        # Sync. at start of each epoch
        self.gnn.load_state_dict(self.shared_gnn.state_dict())
        sampler = random_sample(np.arange(len(states)), self.agent_C['minibatch_size'])
        for batch_indices in sampler:
            start_batch_time = time.time()

            batch_indices_tensor = tensor(batch_indices, self.device).long()
            # Important note: these are tensors but don't have a grad
            sampled_states = [states[i] for i in batch_indices]
            sampled_actions = actions[batch_indices_tensor]
            sampled_log_probs_old = log_probs_old[batch_indices_tensor]
            sampled_returns = returns[batch_indices_tensor]
            sampled_advantages = advantages[batch_indices_tensor]

            start_pred_time = time.time()
            prediction = self.env.propagate(self.gnn, sampled_states, sampled_actions)
            end_pred_time = time.time()
            train_pred_times.append(end_pred_time - start_pred_time)

            # Calc. loss
            ratio = (prediction['log_pi_a'] - sampled_log_probs_old).exp()
            obj = ratio * sampled_advantages
            obj_clipped = ratio.clamp(1.0 - self.agent_C['ppo_ratio_clip'],
                                      1.0 + self.agent_C['ppo_ratio_clip']) * sampled_advantages
            # policy loss and value loss are scalars
            policy_loss = -torch.min(obj, obj_clipped).mean() - self.agent_C['entropy_weight'] * prediction['ent'].mean()
            value_loss = self.agent_C['value_loss_coef'] * (sampled_returns - prediction['v']).pow(2).mean()

            self.opt.zero_grad()
            (policy_loss + value_loss).backward()
            if self.agent_C['clip_grads']:
                nn.utils.clip_grad_norm_(self.gnn.parameters(), self.agent_C['gradient_clip'])
            ensure_shared_grads(self.gnn, self.shared_gnn)
            self.opt.step()

            end_batch_time = time.time()
            batch_times.append(end_batch_time - start_batch_time)
    self.gnn.eval()
    return total_step, np.array(step_times).mean(), np.array(batch_times).mean(), np.array(train_pred_times).mean()
def plot_phase_space(state, tensorstate, phase_space_rep='wigner',
                     lim=4, pts=81, title=None):
    """
    Plot phase space representation of the state. Converts a batch of
    states to density matrix.

    Args:
        state (tf.Tensor([B,N], c64)): batched state vector
        tensorstate (bool): flag if tensored with qubit
        phase_space_rep (str): either 'wigner' or 'CF'
        lim (float): plot limit in displacement units
        pts (int): number of pixels in each direction
        title (str): figure title (optional)
    """
    assert len(state.shape) >= 2 and state.shape[1] > 1

    # create operators
    if tensorstate:
        N = int(state.shape[1] / 2)
        parity = utils.tensor([ops.identity(2), ops.parity(N)])
        D = ops.DisplacementOperator(N, tensor_with=[ops.identity(2), None])
    else:
        N = state.shape[1]
        D = ops.DisplacementOperator(N)
        parity = ops.parity(N)

    # project every trajectory onto |g> subspace
    if tensorstate:
        P0 = utils.tensor([ops.projector(0, 2), ops.identity(N)])
        state, _ = utils.normalize(tf.linalg.matvec(P0, state))

    # make a density matrix
    dm = utils.density_matrix(state)

    # Generate a grid of phase space points
    x = np.linspace(-lim, lim, pts)
    y = np.linspace(-lim, lim, pts)
    xs_mesh, ys_mesh = np.meshgrid(x, y, indexing='ij')
    grid = tf.cast(xs_mesh + 1j * ys_mesh, c64)
    grid_flat = tf.reshape(grid, [-1])

    matmul = tf.linalg.matmul

    # Calculate and plot the phase space representation
    if phase_space_rep == 'wigner':
        displaced_parity = matmul(D(grid_flat), matmul(parity, D(-grid_flat)))
        W = 1 / pi * tf.linalg.trace(matmul(displaced_parity, dm))
        W_grid = tf.reshape(W, grid.shape)

        fig, ax = plt.subplots(1, 1, dpi=200)
        fig.suptitle(title)
        ax.pcolormesh(x, y, np.transpose(W_grid.numpy().real),
                      cmap='RdBu_r', vmin=-1 / pi, vmax=1 / pi)
        ax.set_aspect('equal')

    if phase_space_rep == 'CF':
        C = tf.linalg.trace(matmul(D(grid_flat), dm))
        C_grid = tf.reshape(C, grid.shape)

        fig, axes = plt.subplots(1, 2, sharey=True, dpi=200)
        fig.suptitle(title)
        axes[0].pcolormesh(x, y, np.transpose(C_grid.numpy().real),
                           cmap='RdBu_r', vmin=-1, vmax=1)
        axes[1].pcolormesh(x, y, np.transpose(C_grid.numpy().imag),
                           cmap='RdBu_r', vmin=-1, vmax=1)
        axes[0].set_title('Re')
        axes[1].set_title('Im')
        axes[0].set_aspect('equal')
        axes[1].set_aspect('equal')

    plt.tight_layout()
def compare_cost(args):
    set_seed(args.seed)
    env = LQR(
        #N=20,
        #M=12,
        init_scale=1.0,
        max_steps=args.H,  # 10, 20
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=0.0,
    )
    K = env.optimal_controller()
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1], mean_network, learn_std=False,
                            gate_output=False)

    # mc
    mc_costs = []  # individual
    mc_means = []  # cumulative
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        _, _, rewards, _, _ = rollout(env, policy, noises)
        mc_costs.append(-rewards.sum())
        mc_means.append(np.mean(mc_costs))

    # rqmc
    rqmc_costs = []
    rqmc_means = []
    rqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'trajwise')
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        _, _, rewards, _, _ = rollout(env, policy, rqmc_noises[i])
        rqmc_costs.append(-rewards.sum())
        rqmc_means.append(np.mean(rqmc_costs))

    # array rqmc
    arqmc_costs_dict = {}
    arqmc_means_dict = {}
    arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'ssj')
    #arqmc_noises = get_rqmc_noises(args.n_trajs, env.max_steps, env.M, 'array')
    for sorter in args.sorter:
        arqmc_costs = []
        arqmc_means = []
        sort_f = get_sorter(sorter, env)
        data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            rewards = np.asarray(traj['rewards'])
            arqmc_costs.append(-rewards.sum())
            arqmc_means.append(np.mean(arqmc_costs))
        arqmc_costs_dict[sorter] = arqmc_costs
        arqmc_means_dict[sorter] = arqmc_means

    expected_cost = env.expected_cost(K, np.diag(np.ones(env.M)))
    mc_errors = np.abs(mc_means - expected_cost)
    rqmc_errors = np.abs(rqmc_means - expected_cost)
    arqmc_errors_dict = {
        sorter: np.abs(arqmc_means - expected_cost)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    logger.info('mc: {}, rqmc: {} '.format(mc_errors[-1], rqmc_errors[-1]) +
                ' '.join(['arqmc ({}): {}'.format(sorter, arqmc_errors[-1])
                          for sorter, arqmc_errors in arqmc_errors_dict.items()]))
    info = {
        **vars(args),
        'mc_costs': mc_costs,
        'rqmc_costs': rqmc_costs,
        'arqmc_costs': arqmc_costs,
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors, rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict, info=info), f)
    if args.show_fig:
        data = pd.concat([
            pd.DataFrame({
                'name': 'mc',
                'x': np.arange(len(mc_errors)),
                'error': mc_errors,
            }),
            pd.DataFrame({
                'name': 'rqmc',
                'x': np.arange(len(rqmc_errors)),
                'error': rqmc_errors,
            }),
            pd.concat([
                pd.DataFrame({
                    'name': 'arqmc_{}'.format(sorter),
                    'x': np.arange(len(arqmc_errors)),
                    'error': arqmc_errors,
                }) for sorter, arqmc_errors in arqmc_errors_dict.items()
            ]),
        ])
        plot = sns.lineplot(x='x', y='error', hue='name', data=data)
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info
prev_buff = []
done = False
while not done:
    prev_buff = buff
    if len(buff) < 4:
        observation, reward, done, info = env.step(env.action_space.sample())
        buff.append(observation)
        continue
    previous_state = preprocess(prev_buff)
    if args.resume_episode != 0 and args.epsilon < random.random():
        x = tensor(previous_state, args.device)[None]
        action = int(torch.argmax(model(x).detach().cpu()))
    else:
        action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    buff.pop(0)
    buff.append(observation)
    next_state = preprocess(buff)
    REPLAY_MEMORY.append([previous_state, action, reward, next_state, done])
    REPLAY_MEMORY = REPLAY_MEMORY[-args.replay_size:]
def min_noise_reducible(self):
    sigma = self.make_state(0)
    sigmaB = utils.partial_trace(sigma, [0], [self.dim, self.dim])
    sigmaB_ext = utils.tensor(np.eye(self.dim), sigmaB)
    eigval = min(np.real(np.linalg.eigvals(sigmaB_ext - sigma)))
    return eigval / (eigval - (self.dim - 1) / self.dim**2)
def feature(self, obs):
    obs = tensor(obs)
    return self.phi_body(obs)
def create_operators(self):
    N = self.N

    # oscillator fixed operators
    self.I = tensor([ops.identity(2), ops.identity(N)])
    self.a = tensor([ops.identity(2), ops.destroy(N)])
    self.a_dag = tensor([ops.identity(2), ops.create(N)])
    self.q = tensor([ops.identity(2), ops.position(N)])
    self.p = tensor([ops.identity(2), ops.momentum(N)])
    self.n = tensor([ops.identity(2), ops.num(N)])
    self.parity = tensor([ops.identity(2), ops.parity(N)])

    # qubit fixed operators
    self.sx = tensor([ops.sigma_x(), ops.identity(N)])
    self.sy = tensor([ops.sigma_y(), ops.identity(N)])
    self.sz = tensor([ops.sigma_z(), ops.identity(N)])
    self.sm = tensor([ops.sigma_m(), ops.identity(N)])
    self.H = tensor([ops.hadamard(), ops.identity(N)])

    # oscillator parameterized operators
    tensor_with = [ops.identity(2), None]
    self.displace = ops.DisplacementOperator(N, tensor_with=tensor_with)
    self.rotate = ops.RotationOperator(N, tensor_with=tensor_with)

    # qubit parameterized operators
    tensor_with = [None, ops.identity(N)]
    self.rotate_qb_xy = ops.QubitRotationXY(tensor_with=tensor_with)
    self.rotate_qb_z = ops.QubitRotationZ(tensor_with=tensor_with)

    # qubit sigma_z measurement projector
    self.P = {
        i: tensor([ops.projector(i, 2), ops.identity(N)])
        for i in [0, 1]
    }
optimizer = optim.SGD(params, lr=1e-3, momentum=.9, weight_decay=1e-6)
writer = SummaryWriter()
reward_normalizer = RewardNormalizer()
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
global_t = 0
for ep in range(10000):  # episode loop
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    trajectory = []
    ep_t = 0
    state = tensor(env.reset())
    ep_return = 0
    while True:
        with torch.no_grad():
            policy = policy_net(state)
        action = np.random.choice(env.action_space.n, p=policy.cpu().numpy())
        new_state, reward, done, _ = env.step(action)
        new_state = tensor(new_state)
        ep_return += reward
        # reward = reward_normalizer.transform_reward(reward)
        transition = Transition(state=state, policy=policy,
def compare_grad(args):
    set_seed(args.seed)
    env = LQR(
        N=args.xu_dim[0],
        M=args.xu_dim[1],
        lims=100,
        init_scale=1.0,
        max_steps=args.H,
        Sigma_s_kappa=1.0,
        Q_kappa=1.0,
        P_kappa=1.0,
        A_norm=1.0,
        B_norm=1.0,
        Sigma_s_scale=args.noise,
    )
    #K = env.optimal_controller()
    K = np.random.randn(env.M, env.N)
    mean_network = nn.Linear(*K.shape[::-1], bias=False)
    mean_network.weight.data = tensor(K)
    policy = GaussianPolicy(*K.shape[::-1], mean_network, learn_std=False,
                            gate_output=False)
    out_set = set()  # here
    Sigma_a = np.diag(np.ones(env.M))

    mc_grads = []
    for i in tqdm(range(args.n_trajs), 'mc'):
        noises = np.random.randn(env.max_steps, env.M)
        states, actions, rewards, _, _ = rollout(env, policy, noises)
        if len(states) < args.H:
            out_set.add('mc')
            break
        mc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    mc_grads = np.asarray(mc_grads)
    mc_means = np.cumsum(mc_grads, axis=0) / np.arange(
        1, len(mc_grads) + 1)[:, np.newaxis, np.newaxis]

    rqmc_grads = []
    #loc = torch.zeros(env.max_steps * env.M)
    #scale = torch.ones(env.max_steps * env.M)
    #rqmc_noises = Normal_RQMC(loc, scale).sample(torch.Size([args.n_trajs])).data.numpy()
    rqmc_noises = uniform2normal(
        random_shift(
            ssj_uniform(args.n_trajs, args.H * env.M).reshape(args.n_trajs, args.H, env.M),
            0,
        ))
    for i in tqdm(range(args.n_trajs), 'rqmc'):
        states, actions, rewards, _, _ = rollout(
            env, policy, rqmc_noises[i].reshape(env.max_steps, env.M))
        if len(states) < args.H:
            out_set.add('rqmc')
            break
        rqmc_grads.append(
            get_gaussian_policy_gradient(states, actions, rewards, policy,
                                         variance_reduced_loss))
    rqmc_grads = np.asarray(rqmc_grads)
    rqmc_means = np.cumsum(rqmc_grads, axis=0) / np.arange(
        1, len(rqmc_grads) + 1)[:, np.newaxis, np.newaxis]

    arqmc_means_dict = {}
    #arqmc_noises = get_rqmc_noises(args.n_trajs, args.H, env.M, 'array')
    uniform_noises = ssj_uniform(args.n_trajs, env.M)  # n_trajs, action_dim
    arqmc_noises = uniform2normal(
        random_shift(np.expand_dims(uniform_noises, 1).repeat(args.H, 1), 0))  # n_trajs, horizon, action_dim
    for sorter in args.sorter:
        arqmc_grads = []
        sort_f = get_sorter(sorter, env, K)
        data = ArrayRQMCSampler(env, args.n_trajs, sort_f=sort_f).sample(policy, arqmc_noises)
        for traj in data:
            states, actions, rewards = np.asarray(traj['states']), np.asarray(
                traj['actions']), np.asarray(traj['rewards'])
            if len(states) < args.H:
                out_set.add('arqmc_{}'.format(sorter))
                break
            arqmc_grads.append(
                get_gaussian_policy_gradient(states, actions, rewards, policy,
                                             variance_reduced_loss))
        arqmc_grads = np.asarray(arqmc_grads)
        arqmc_means = np.cumsum(arqmc_grads, axis=0) / np.arange(
            1, len(arqmc_grads) + 1)[:, np.newaxis, np.newaxis]
        arqmc_means_dict[sorter] = arqmc_means

    expected_grad = env.expected_policy_gradient(K, Sigma_a)

    mc_errors = [np.nan] if 'mc' in out_set else (
        (mc_means - expected_grad)**2).reshape(mc_means.shape[0], -1).mean(1)  # why the sign is reversed?
    rqmc_errors = [np.nan] if 'rqmc' in out_set else (
        (rqmc_means - expected_grad)**2).reshape(rqmc_means.shape[0], -1).mean(1)
    arqmc_errors_dict = {
        sorter: [np.nan] if 'arqmc_{}'.format(sorter) in out_set else
        ((arqmc_means - expected_grad)**2).reshape(arqmc_means.shape[0], -1).mean(1)
        for sorter, arqmc_means in arqmc_means_dict.items()
    }
    info = {
        **vars(args),
        'out': out_set,
        'expected_grad': expected_grad,
        'means': {
            'mc': mc_means,
            'rqmc': rqmc_means,
            **arqmc_means_dict,
        },
    }
    if args.save_fn is not None:
        with open(args.save_fn, 'wb') as f:
            dill.dump(
                dict(mc_errors=mc_errors, rqmc_errors=rqmc_errors,
                     arqmc_errors_dict=arqmc_errors_dict, info=info), f)
    if args.show_fig:
        mc_data = pd.DataFrame({
            'name': 'mc',
            'x': np.arange(len(mc_errors)),
            'error': mc_errors,
        })
        rqmc_data = pd.DataFrame({
            'name': 'rqmc',
            'x': np.arange(len(rqmc_errors)),
            'error': rqmc_errors,
        })
        arqmc_data = pd.concat([
            pd.DataFrame({
                'name': 'arqmc_{}'.format(sorter),
                'x': np.arange(len(arqmc_errors)),
                'error': arqmc_errors,
            }) for sorter, arqmc_errors in arqmc_errors_dict.items()
        ])
        plot = sns.lineplot(x='x', y='error', hue='name',
                            data=pd.concat([mc_data, rqmc_data, arqmc_data]))
        plot.set(yscale='log')
        plt.show()
    return mc_errors, rqmc_errors, arqmc_errors_dict, info