def test_stochastic_generator_preprocessor(self, uniform_sampling, eps):
    """Input & output check for stochastic_generator_preprocessor().

    First checks the output shapes of a single call, then calls the
    preprocessor repeatedly and compares the empirical frequency with which
    each generator (parameterized gate) is sampled against the expected
    distribution.

    The consistency of the estimated average gradient is checked by:
    //benchmarks/scripts/differentiators:convergence_test

    Args:
        uniform_sampling: Flag forwarded to the preprocessor; selects which
            expected sampling distribution is used below.
        eps: Absolute and relative tolerance for the distribution check.
    """
    n_qubits = 5
    n_programs = 3
    symbol_names = ['a', 'b']
    programs, symbol_values_tensor, n_symbols, n_shifts = \
        _example_circuit_helper(n_qubits, n_programs)

    new_programs_before, weights_before, shifts_before, \
        n_param_gates_before = parameter_shift_util.parse_programs(
            programs, symbol_names, symbol_values_tensor, n_symbols)

    new_programs, weights, shifts, n_param_gates = \
        sd_util.stochastic_generator_preprocessor(
            new_programs_before, weights_before, shifts_before, n_programs,
            n_symbols, n_param_gates_before, n_shifts, uniform_sampling)

    # n_param_gates should be 1 because only one generator is sampled.
    self.assertEqual(n_param_gates, 1, "n_param_gates should be 1")
    ground_truth_shape = np.array(
        [n_symbols, n_programs, n_param_gates, n_shifts], dtype=np.int32)
    for output in (new_programs, weights, shifts):
        tf.assert_equal(ground_truth_shape, tf.shape(output))

    # Estimate the probability of sampling each shift.
    # Indexed as [symbol][generator][shift].
    # NOTE(review): the [1.0, 1.0] entry presumably corresponds to a padded
    # (non-existent) generator for symbol 'b'; its expected probability is 0.
    ground_truth_shifts = [[[1.5707964, -1.5707964], [0.5235988, -0.5235988],
                            [0.31415927, -0.31415927]],
                           [[0.21460181, 1.7853982], [0.6073009, 1.3926991],
                            [1.0, 1.0]]]
    if uniform_sampling:
        ground_truth_pdist = [[0.333333, 0.333333, 0.333333],
                              [0.5, 0.5, 0.0]]
    else:
        ground_truth_pdist = [[0.111111, 0.333333, 0.555555],
                              [0.333333, 0.666666, 0.0]]

    # The histogram is indexed by [symbol, generator], so its second axis
    # must match the number of generators in the ground truth table, not the
    # number of programs (the two merely happen to be equal in this test).
    n_generators = len(ground_truth_shifts[0])
    shifts_hist = np.zeros((n_symbols, n_generators))
    n_samples = 700
    for _ in range(n_samples):
        _, _, sampled_shifts, _ = \
            sd_util.stochastic_generator_preprocessor(
                new_programs_before, weights_before, shifts_before,
                n_programs, n_symbols, n_param_gates_before, n_shifts,
                uniform_sampling)
        for i, shifts_per_symbol in enumerate(sampled_shifts):
            for s in shifts_per_symbol:  # per program
                # Axis 1 of the match location is the generator index.
                loc = np.where(np.isclose(ground_truth_shifts, s))[1][0]
                shifts_hist[i][loc] += 1.0

    # Every sample contributes one draw per program for each symbol.
    shifts_pdist = shifts_hist / n_samples / n_programs
    self.assertAllClose(ground_truth_pdist,
                        shifts_pdist,
                        atol=eps,
                        rtol=eps)
def get_gradient_circuits(self, programs, symbol_names, symbol_values):
    """See base class description."""
    # Frequently used sizes.
    n_symbols = tf.gather(tf.shape(symbol_names), 0)
    n_programs = tf.gather(tf.shape(programs), 0)

    # cirq.decompose() is assumed to generate gates with at most two
    # distinct eigenvalues, hence two parameter shifts per gate.
    n_shifts = 2

    # Parameter-shifted programs and their coefficients; each tensor has
    # shape [n_symbols, n_programs, n_param_gates, n_shifts].
    parsed = parameter_shift_util.parse_programs(programs, symbol_names,
                                                 symbol_values, n_symbols)
    shifted_programs, shift_weights, shift_values, n_param_gates = parsed

    m_tile = n_shifts * n_param_gates * n_symbols
    terms_per_symbol = n_param_gates * n_shifts

    # Bring the program axis to the front,
    # [n_programs, n_symbols, n_param_gates, n_shifts],
    # and then flatten to the batch layout.
    program_major = [1, 0, 2, 3]
    programs_t = tf.transpose(shifted_programs, program_major)
    weights_t = tf.transpose(shift_weights, program_major)
    shifts_t = tf.transpose(shift_values, program_major)

    batch_programs = tf.reshape(programs_t, [n_programs, m_tile])
    batch_weights = tf.reshape(weights_t,
                               [n_programs, n_symbols, terms_per_symbol])
    shift_column = tf.reshape(shifts_t, [n_programs, m_tile, 1])

    # The impurity symbol is appended after the user supplied symbol names.
    impurity_name = tf.constant(
        [parameter_shift_util.PARAMETER_IMPURITY_NAME])
    new_symbol_names = tf.concat([symbol_names, impurity_name], 0)

    # Input symbol values are repeated once per entry of `batch_programs`
    # and the shift value is appended as the last column.
    repeated_values = tf.tile(tf.expand_dims(symbol_values, 1),
                              [1, m_tile, 1])
    batch_symbol_values = tf.concat([repeated_values, shift_column], 2)

    # Row s of the mapper lists the indices into the m_tile axis that
    # belong to symbol s; the same mapping is repeated for every program.
    flat_indices = tf.range(n_symbols * n_param_gates * n_shifts)
    mapper_one_program = tf.reshape(flat_indices,
                                    [n_symbols, terms_per_symbol])
    batch_mapper = tf.tile(tf.expand_dims(mapper_one_program, 0),
                           [n_programs, 1, 1])

    return (batch_programs, new_symbol_names, batch_symbol_values,
            batch_weights, batch_mapper)
def differentiate_analytic(self, programs, symbol_names, symbol_values,
                           pauli_sums, forward_pass_vals, grad):
    """Calculate the gradient.

    The gradient calculations follows the following steps:

    1. Compute the decomposition of the incoming circuits so that we have
        their generator information (done using cirq in a tf.py_function)
    2. Use formula (31) from paper inside of TensorFlow to calculate
        gradients from all the decomposed circuits.
    3. Sum up terms and reshape for the total gradient that is compatible
        with TensorFlow.

    **CAUTION**
    Analytic gradient measurements based on this ParameterShift generally
    run at least K(=2) times SLOWER than the original circuit.
    On top of it, since all parameters of gates are shifted individually,
    the time complexity is linear in the number of parameterized gates L.
    So, you will see O(KL) slower time & space complexity than the original
    forward pass measurements.

    Args:
        programs: `tf.Tensor` of strings with shape [batch_size] containing
            the string representations of the circuits to be executed.
        symbol_names: `tf.Tensor` of strings with shape [n_params], which
            is used to specify the order in which the values in
            `symbol_values` should be placed inside of the circuits in
            `programs`.
        symbol_values: `tf.Tensor` of real numbers with shape
            [batch_size, n_params] specifying parameter values to resolve
            into the circuits specified by programs, following the ordering
            dictated by `symbol_names`.
        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
            containing the string representation of the operators that will
            be used on all of the circuits in the expectation calculations.
        forward_pass_vals: `tf.Tensor` of real numbers with shape
            [batch_size, n_ops] containing the output of the forward pass
            through the op you are differentiating. Not used by this
            method.
        grad: `tf.Tensor` of real numbers with shape [batch_size, n_ops]
            representing the gradient backpropagated to the output of the
            op you are differentiating through.

    Returns:
        Backward gradient values for each program & each pauli sum. It has
        the shape of [batch_size, n_symbols].
    """
    # these get used a lot
    n_symbols = tf.gather(tf.shape(symbol_names), 0)
    n_programs = tf.gather(tf.shape(programs), 0)
    n_ops = tf.gather(tf.shape(pauli_sums), 1)
    # Assume cirq.decompose() generates gates with at most two distinct
    # eigenvalues, which results in two parameter shifts.
    n_shifts = 2

    # STEP 1: Generate required inputs for executor
    # Deserialize programs and parse the whole parameterized gates.
    # The returned tensors have the shape
    # [n_symbols, n_programs, n_param_gates, n_shifts] and new_programs
    # holds the programs to which the parameter-shift rule was applied.
    (new_programs, weights, shifts,
     n_param_gates) = parameter_shift_util.parse_programs(
         programs, symbol_names, symbol_values, n_symbols)

    # Transpose new_programs, weights and shifts to fit into the input
    # format of the tensorflow_quantum simulator:
    # [n_symbols, n_param_gates, n_shifts, n_programs]
    new_programs = tf.transpose(new_programs, [0, 2, 3, 1])
    weights = tf.transpose(weights, [0, 2, 3, 1])
    shifts = tf.transpose(shifts, [0, 2, 3, 1])

    # reshape everything to fit into expectation op correctly
    n_tile = n_shifts * n_param_gates * n_symbols
    total_programs = n_programs * n_tile
    flat_programs = tf.reshape(new_programs, [total_programs])
    flat_shifts = tf.reshape(shifts, [total_programs])

    # tile up and then reshape so that the symbol values and ops line up
    # with flat_programs (the program index is the fastest varying one)
    flat_perturbations = tf.concat([
        tf.reshape(
            tf.tile(tf.expand_dims(symbol_values, 0),
                    tf.stack([n_tile, 1, 1])), [total_programs, n_symbols]),
        tf.expand_dims(flat_shifts, axis=1)
    ],
                                   axis=1)
    flat_ops = tf.reshape(
        tf.tile(tf.expand_dims(pauli_sums, 0), tf.stack([n_tile, 1, 1])),
        [total_programs, n_ops])

    # Append impurity symbol into symbol name
    new_symbol_names = tf.concat([
        symbol_names,
        tf.expand_dims(tf.constant(
            parameter_shift_util._PARAMETER_IMPURITY_NAME),
                       axis=0)
    ],
                                 axis=0)

    # STEP 2: calculate the required expectation values
    expectations = self.expectation_op(flat_programs, new_symbol_names,
                                       flat_perturbations, flat_ops)

    # STEP 3: generate gradients according to the results

    # The rows of `expectations` are ordered as
    # (symbol, param_gate, shift, program), so a single row-major reshape
    # separates the perturbation index (param_gate x shift) from the
    # program index. This yields exactly the tensor that slicing out
    # consecutive blocks of n_programs rows per symbol would produce:
    # [n_symbols, n_param_gates * n_shifts, n_programs, n_ops]
    n_perturbations = n_param_gates * n_shifts
    rearranged_expectations = tf.reshape(
        expectations, [n_symbols, n_perturbations, n_programs, n_ops])

    # now we will calculate all of the partial derivatives
    # s: symbol, p: perturbation, c: circuit, o: ops
    partials = tf.einsum(
        'spco,spc->sco', rearranged_expectations,
        tf.cast(
            tf.reshape(weights, [n_symbols, n_perturbations, n_programs]),
            rearranged_expectations.dtype))

    # now apply the chain rule
    return tf.einsum('sco,co -> cs', partials, grad)
def test_parse_programs(self):
    """Input & output check for parse_programs().

    Builds a batch of identical 5-qubit circuits of Rz gates that alternate
    between the symbols 'a' and 'b', runs parse_programs() on it and checks
    the shapes of the outputs as well as the values of the returned weights
    and shifts.
    """
    n_qubits = 5
    n_programs = 3
    n_shifts = 2
    symbol_names = ['a', 'b']
    n_symbols = len(symbol_names)
    sympy_symbols = [sympy.Symbol(s) for s in symbol_names]
    coeff = [1.0, -2.0, 3.0, -4.0, 5.0]
    # Test circuit.
    # (0, 0): ───Rz(1.0*a)────
    #
    # (0, 1): ───Rz(-2.0*b)───
    #
    # (0, 2): ───Rz(3.0*a)────
    #
    # (0, 3): ───Rz(-4.0*b)───
    #
    # (0, 4): ───Rz(5.0*a)────
    q = cirq.GridQubit.rect(1, n_qubits)
    c = cirq.Circuit()
    c.append([
        cirq.Rz(coeff[i] * sympy_symbols[i % 2]).on(q[i])
        for i in range(n_qubits)
    ])
    circuit_batch = [c] * n_programs
    # Every program gets the symbol values a=0, b=1.
    symbol_values_array = np.array(
        [[i for i, _ in enumerate(symbol_names)] for _ in range(n_programs)],
        dtype=np.float32)

    symbol_values_tensor = tf.convert_to_tensor(symbol_values_array)
    programs = util.convert_to_tensor(circuit_batch)

    new_programs, weights, shifts, n_param_gates = \
        parameter_shift_util.parse_programs(
            programs, symbol_names, symbol_values_tensor, n_symbols)

    # shape check
    ground_truth_shape = [n_symbols, n_programs, n_param_gates, n_shifts]
    tf.assert_equal(ground_truth_shape, tf.shape(new_programs))
    tf.assert_equal(ground_truth_shape, tf.shape(weights))
    tf.assert_equal(ground_truth_shape, tf.shape(shifts))

    # value check (1) weights
    # the first 1x3x3x2 are +/- coefficients of Rz gates with symbol 'a'.
    # they are divided by 2 in Rz.
    # [:,:,:,0] have original coefficient and [:,:,:,1] are their negatives.
    # the second 1x3x3x2 are with symbol 'b'. As we know, there are only
    # 2 'b' symbols, which makes [1,:,2,:] are zeros. (padded)
    ground_truth_weights = np.array([[[[0.5, -0.5], [1.5, -1.5],
                                       [2.5, -2.5]],
                                      [[0.5, -0.5], [1.5, -1.5],
                                       [2.5, -2.5]],
                                      [[0.5, -0.5], [1.5, -1.5],
                                       [2.5, -2.5]]],
                                     [[[-1., 1.], [-2., 2.], [0., -0.]],
                                      [[-1., 1.], [-2., 2.], [0., -0.]],
                                      [[-1., 1.], [-2., 2.], [0., -0.]]]])
    self.assertAllClose(ground_truth_weights, weights)

    # value check (2) shifts
    # The expected shift offset is pi / (4 * weight). For the padded
    # (zero) weights the offset must be 0.0, which is what happens inside
    # parse_programs() with tf.math.divide_no_nan(). Dividing only where
    # the weight is non-zero gives the same values without emitting a
    # divide-by-zero RuntimeWarning from the test itself.
    inverse_weights = np.divide(1.0,
                                ground_truth_weights,
                                out=np.zeros_like(ground_truth_weights),
                                where=ground_truth_weights != 0)
    ground_truth_shifts = inverse_weights / 4.0 * np.pi

    # Broadcast the symbol values to
    # [n_symbols, n_programs, n_param_gates(=3), n_shifts(=2)].
    new_symbol_values_array = np.tile(
        np.expand_dims(np.expand_dims(np.transpose(symbol_values_array,
                                                   [1, 0]),
                                      axis=-1),
                       axis=-1), [1, 1, 3, 2])

    # Shifted value = original symbol value + shift offset.
    ground_truth_shifts = new_symbol_values_array + ground_truth_shifts
    self.assertAllClose(ground_truth_shifts, shifts)
def test_stochastic_coordinate_preprocessor(self, uniform_sampling, eps):
    """Input & output check for stochastic_coordinate_preprocessor().

    Checks that non-tensor inputs are rejected, verifies the shapes of all
    outputs of a single call, and then calls the preprocessor repeatedly to
    compare the empirical frequency with which each symbol is sampled
    against the expected distribution.

    The consistency of the estimated average gradient is checked by:
    //benchmarks/scripts/differentiators:convergence_test

    Args:
        uniform_sampling: Flag forwarded to the preprocessor.
        eps: Absolute and relative tolerance used by the assertions.
    """
    n_qubits = 5
    n_programs = 3
    symbol_names = ['a', 'b']
    programs, symbol_values_tensor, n_symbols, n_shifts = \
        _example_circuit_helper(n_qubits, n_programs)

    n_ops = 2
    ops, psums, _ = _example_ops_helper(n_programs, n_ops)

    new_programs, weights_before, shifts, n_param_gates = \
        parameter_shift_util.parse_programs(
            programs, symbol_names, symbol_values_tensor, n_symbols)

    # all inputs should be tensorflow tensors.
    # Each invalid call gets its own assertRaises block: when both calls
    # share one block the second call is never executed, because the first
    # exception already leaves the block.
    with self.assertRaises(ValueError):
        # symbol_values_array is used instead of symbol_values_tensor.
        sd_util.stochastic_coordinate_preprocessor(
            new_programs, symbol_values_tensor.numpy(), ops, weights_before,
            shifts, n_programs, n_symbols, n_param_gates, n_shifts, n_ops,
            uniform_sampling)
    with self.assertRaises(ValueError):
        # psums is used instead of ops.
        sd_util.stochastic_coordinate_preprocessor(
            new_programs, symbol_values_tensor, psums, weights_before,
            shifts, n_programs, n_symbols, n_param_gates, n_shifts, n_ops,
            uniform_sampling)

    flat_programs, flat_perturbations, flat_ops, _, weights, \
        coordinate_relocator = \
        sd_util.stochastic_coordinate_preprocessor(
            new_programs, symbol_values_tensor, ops, weights_before,
            shifts, n_programs, n_symbols, n_param_gates, n_shifts,
            n_ops, uniform_sampling)

    # n_symbols should not be 1 because it doesn't fit the input format of
    # expectation_op or sampling_op.
    total_programs = n_programs * n_param_gates * n_shifts
    # flat_programs should have n_programs * n_param_gates * n_shifts * 1
    # because only one symbol is sampled now.
    self.assertAllClose([total_programs],
                        tf.shape(flat_programs),
                        atol=eps,
                        rtol=eps)
    # perturbation symbol is added, so the number of symbol should be
    # n_symbol+1
    self.assertAllClose([total_programs, n_symbols + 1],
                        tf.shape(flat_perturbations),
                        atol=eps,
                        rtol=eps)
    # shape check on flat_ops.
    self.assertAllClose([total_programs, n_ops],
                        tf.shape(flat_ops),
                        atol=eps,
                        rtol=eps)
    # resampled weights is in
    # [n_symbols, n_param_gates, n_shifts, n_programs]
    self.assertAllClose([n_symbols, n_param_gates, n_shifts, n_programs],
                        tf.shape(weights),
                        atol=eps,
                        rtol=eps)
    # resampled coordinate_relocator is in [total_programs, n_symbols]
    self.assertAllClose([total_programs, n_symbols],
                        tf.shape(coordinate_relocator),
                        atol=eps,
                        rtol=eps)

    # Estimate probability of sampling each shifts.
    # Row i lists every shift value that can appear for symbol i, so the
    # row index of a match identifies the sampled symbol.
    ground_truth_shifts = [[
        1.5707964, -1.5707964, 0.5235988, -0.5235988, 0.31415927,
        -0.31415927
    ], [0.21460181, 1.7853982, 0.6073009, 1.3926991, 1.0, 1.0]]

    ground_truth_pdist = [0.6, 0.4]

    shifts_hist = np.zeros((n_symbols,))
    n_samples = 700
    cnt = 0.0
    for _ in range(n_samples):
        _, sampled_perturbations, _, _, _, _ = \
            sd_util.stochastic_coordinate_preprocessor(
                new_programs, symbol_values_tensor, ops, weights_before,
                shifts, n_programs, n_symbols, n_param_gates, n_shifts,
                n_ops, uniform_sampling)

        # See only shift symbols (the last column).
        for s in sampled_perturbations[:, -1]:
            sym = np.where(np.isclose(ground_truth_shifts, s))[0][0]
            shifts_hist[sym] += 1.0
            cnt += 1.0

    shifts_pdist = shifts_hist / cnt
    self.assertAllClose(ground_truth_pdist,
                        shifts_pdist,
                        atol=eps,
                        rtol=eps)
def differentiate_sampled(self, programs, symbol_names, symbol_values,
                          pauli_sums, num_samples, forward_pass_vals, grad):
    """Compute the sampled gradient with cascaded stochastic processes.

    The gradient calculations follows the following steps:

    1. Compute the decomposition of the incoming circuits so that we have
        their generator information (done using cirq in a tf.py_function)
    2. Construct probability distributions & perform stochastic processes
        to select parameter-shift terms.
        - Stochastic generator : sampling on parameter-shifted gates.
        - Stochastic coordinate : sampling on symbols.
        - Stochastic cost : sampling on pauli sums
    3. Sum up terms and reshape for the total gradient that is compatible
        with tensorflow differentiation.

    Args:
        programs: `tf.Tensor` of strings with shape [n_programs] containing
            the string representations of the circuits to be executed.
        symbol_names: `tf.Tensor` of strings with shape [n_symbols], which
            is used to specify the order in which the values in
            `symbol_values` should be placed inside of the circuits in
            `programs`.
        symbol_values: `tf.Tensor` of real numbers with shape
            [n_programs, n_symbols] specifying parameter values to resolve
            into the circuits specified by programs, following the ordering
            dictated by `symbol_names`.
        pauli_sums : `tf.Tensor` of strings with shape [n_programs, n_ops]
            representing output observables for each program.
        num_samples: `tf.Tensor` of positive integers representing the
            number of samples per term in each term of pauli_sums used
            during the forward pass.
        forward_pass_vals : `tf.Tensor` of real numbers for forward pass
            values with the shape of [n_programs, n_ops]. Not used by this
            method.
        grad : `tf.Tensor` of real numbers for backpropagated gradient
            values from the upper layer with the shape of
            [n_programs, n_ops]

    Returns:
        A `tf.Tensor` of real numbers for sampled gradients from the above
        samplers with the shape of [n_programs, n_symbols]
    """
    n_symbols = tf.gather(tf.shape(symbol_values), 1)
    n_programs = tf.gather(tf.shape(programs), 0)
    n_ops = tf.gather(tf.shape(pauli_sums), 1)
    n_shifts = 2

    # STEP 1: Generate required inputs for executor by using parsers

    # Deserialize programs and parse the whole parameterized gates
    # new_programs has [n_symbols, n_programs, n_param_gates, n_shifts].
    new_programs, weights, shifts, n_param_gates = \
        parameter_shift_util.parse_programs(
            programs, symbol_names, symbol_values, n_symbols)

    if self.stochastic_generator:
        # Result : [n_symbols, n_programs, n_param_gates=1, n_shifts].
        new_programs, weights, shifts, n_param_gates = \
            sd_util.stochastic_generator_preprocessor(
                new_programs, weights, shifts, n_programs, n_symbols,
                n_param_gates, n_shifts, self.uniform_sampling)

    # Transpose new_programs, weights and shifts to fit into the input
    # format of the tensorflow_quantum simulator:
    # [n_symbols, n_param_gates, n_shifts, n_programs]
    new_programs = tf.transpose(new_programs, [0, 2, 3, 1])
    weights = tf.transpose(weights, [0, 2, 3, 1])
    shifts = tf.transpose(shifts, [0, 2, 3, 1])

    if self.stochastic_cost:
        # Result : pauli_sums [n_programs, n_ops] -> [n_programs, n_ops=1]
        # NOTE(review): n_ops is replaced here while num_samples is passed
        # on unchanged; confirm that the shapes used below still agree
        # when stochastic_cost is combined with more than one op.
        pauli_sums, cost_relocator, n_ops = \
            sd_util.stochastic_cost_preprocessor(
                pauli_sums, n_programs, n_ops, self.uniform_sampling)

    if self.stochastic_coordinate:
        flat_programs, flat_perturbations, flat_ops, flat_num_samples, \
            weights, coordinate_relocator = \
            sd_util.stochastic_coordinate_preprocessor(
                new_programs, symbol_values, pauli_sums, weights, shifts,
                n_programs, n_symbols, n_param_gates, n_shifts, n_ops,
                self.uniform_sampling, num_samples=num_samples)
    else:
        # reshape everything to fit into expectation op correctly
        n_tile = n_shifts * n_symbols * n_param_gates
        total_programs = n_programs * n_tile
        flat_programs = tf.reshape(new_programs, [total_programs])
        flat_shifts = tf.reshape(shifts, [total_programs])

        # tile up and then reshape so that symbol values, ops and sample
        # counts line up with flat_programs
        flat_perturbations = tf.concat([
            tf.reshape(
                tf.tile(tf.expand_dims(symbol_values, 0),
                        tf.stack([n_tile, 1, 1])),
                [total_programs, n_symbols]),
            tf.expand_dims(flat_shifts, axis=1)
        ],
                                       axis=1)
        flat_ops = tf.reshape(
            tf.tile(tf.expand_dims(pauli_sums, 0),
                    tf.stack([n_tile, 1, 1])), [total_programs, n_ops])
        flat_num_samples = tf.reshape(
            tf.tile(tf.expand_dims(num_samples, 0),
                    tf.stack([n_tile, 1, 1])), [total_programs, n_ops])

    # Append impurity symbol into symbol name
    new_symbol_names = tf.concat([
        symbol_names,
        tf.expand_dims(tf.constant(
            parameter_shift_util._PARAMETER_IMPURITY_NAME),
                       axis=0)
    ],
                                 axis=0)

    # STEP 2: calculate the required expectation values
    expectations = self.expectation_op(flat_programs, new_symbol_names,
                                       flat_perturbations, flat_ops,
                                       flat_num_samples)

    # STEP 3: generate gradients according to the results
    if self.stochastic_coordinate:
        # Spread the sampled expectations back over the symbol axis.
        #
        # coordinate_relocator has [sub_total_programs, n_symbols](=ij)
        # expectations has [sub_total_programs, n_ops](=ik)
        # einsum -> [n_ops, n_symbols, sub_total_programs](=kji)
        expectations = tf.einsum(
            'ij,ik->kji', tf.cast(coordinate_relocator, dtype=tf.float64),
            tf.cast(expectations, dtype=tf.float64))
        # Transpose to [n_symbols, sub_total_programs, n_ops]
        expectations = tf.transpose(expectations, [1, 2, 0])

    # The expectation rows are ordered as
    # (symbol, param_gate, shift, program), so a single row-major reshape
    # separates the perturbation index (param_gate x shift) from the
    # program index. This yields exactly the tensor that slicing out
    # consecutive blocks of n_programs rows per symbol would produce:
    # [n_symbols, n_param_gates * n_shifts, n_programs, n_ops]
    n_perturbations = n_param_gates * n_shifts
    rearranged_expectations = tf.reshape(
        tf.cast(expectations, dtype=tf.float64),
        [n_symbols, n_perturbations, n_programs, n_ops])

    # now we will calculate all of the partial derivatives
    # s: symbol, p: perturbation, c: circuit, o: ops
    partials = tf.einsum(
        'spco,spc->sco', rearranged_expectations,
        tf.cast(tf.reshape(weights,
                           [n_symbols, n_perturbations, n_programs]),
                dtype=tf.float64))

    if self.stochastic_cost:
        # Reshape to the original n_ops shape
        # partials: [n_symbols, n_programs, n_ops=1]
        # cost_relocator: [n_programs, original_n_ops]
        # Result: [n_symbols, n_programs, original_n_ops]
        partials = partials * tf.stop_gradient(
            tf.cast(cost_relocator, dtype=tf.float64))

    # now apply the chain rule
    # cast partials back to float32
    return tf.cast(
        tf.einsum('sco,co -> cs', partials, tf.cast(grad,
                                                    dtype=tf.float64)),
        tf.float32)