def test_stochastic_generator_preprocessor(self, uniform_sampling, eps):
        """Input & output check for stochastic_generator_preprocessor().
        The consistency of the estimated average gradient is checked by:
        //benchmarks/scripts/differentiators:convergence_test"""
        n_qubits = 5
        n_programs = 3
        symbol_names = ['a', 'b']

        programs, symbol_values_tensor, n_symbols, n_shifts = \
            _example_circuit_helper(n_qubits, n_programs)

        new_programs_before, weights_before, shifts_before, \
        n_param_gates_before = parameter_shift_util.parse_programs(
            programs, symbol_names, symbol_values_tensor, n_symbols)

        new_programs, weights, shifts, n_param_gates = \
            sd_util.stochastic_generator_preprocessor(
                new_programs_before, weights_before, shifts_before, n_programs,
                n_symbols, n_param_gates_before, n_shifts, uniform_sampling)

        # n_param_gates should be 1 because only one generator is sampled.
        self.assertEqual(n_param_gates, 1, "n_param_gates should be 1")
        ground_truth_shape = np.array(
            [n_symbols, n_programs, n_param_gates, n_shifts], dtype=np.int32)
        tf.assert_equal(ground_truth_shape, tf.shape(new_programs))
        tf.assert_equal(ground_truth_shape, tf.shape(weights))
        tf.assert_equal(ground_truth_shape, tf.shape(shifts))

        # Estimate the probability of sampling each shift.
        ground_truth_shifts = [[[1.5707964, -1.5707964],
                                [0.5235988, -0.5235988],
                                [0.31415927, -0.31415927]],
                               [[0.21460181, 1.7853982],
                                [0.6073009, 1.3926991], [1.0, 1.0]]]
        if uniform_sampling:
            ground_truth_pdist = [[0.333333, 0.333333, 0.333333],
                                  [0.5, 0.5, 0.0]]
        else:
            ground_truth_pdist = [[0.111111, 0.333333, 0.555555],
                                  [0.333333, 0.666666, 0.0]]

        shifts_hist = np.zeros((n_symbols, n_programs))
        n_samples = 700
        for _ in range(n_samples):
            _, _, shifts, _ = \
                sd_util.stochastic_generator_preprocessor(
                    new_programs_before, weights_before, shifts_before,
                    n_programs, n_symbols, n_param_gates_before, n_shifts,
                    uniform_sampling)
            for i, shifts_per_symbol in enumerate(shifts):
                for s in shifts_per_symbol:  # per program
                    loc = np.where(np.isclose(ground_truth_shifts, s))[1][0]
                    shifts_hist[i][loc] += 1.0

        shifts_pdist = shifts_hist / n_samples / n_programs
        self.assertAllClose(ground_truth_pdist,
                            shifts_pdist,
                            atol=eps,
                            rtol=eps)
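
# The two `ground_truth_pdist` branches above are consistent with sampling
# each parameterized gate either uniformly over the gates that exist, or
# proportionally to the magnitude of its shift-rule weight. A minimal NumPy
# sketch of that assumption about stochastic_generator_preprocessor's
# sampling rule (the weights are the example circuit's Rz coefficients
# divided by 2, with zero padding for the missing third 'b' gate):
import numpy as np

weights = {'a': np.array([0.5, 1.5, 2.5]), 'b': np.array([-1.0, -2.0, 0.0])}
for symbol, w in weights.items():
    mask = np.abs(w) > 0
    uniform_pdist = mask / mask.sum()               # uniform over real gates
    importance_pdist = np.abs(w) / np.abs(w).sum()  # proportional to |weight|
    print(symbol, uniform_pdist, importance_pdist)
# a -> [0.333 0.333 0.333] and [0.111 0.333 0.556]
# b -> [0.5   0.5   0.   ] and [0.333 0.667 0.   ]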
# Example 2
    def get_gradient_circuits(self, programs, symbol_names, symbol_values):
        """See base class description."""
        # these get used a lot
        n_symbols = tf.gather(tf.shape(symbol_names), 0)
        n_programs = tf.gather(tf.shape(programs), 0)

        # Assume cirq.decompose() generates gates with at most two distinct
        # eigenvalues, which results in two parameter shifts.
        n_shifts = 2

        # These new_programs are parameter shifted.
        # shapes: [n_symbols, n_programs, n_param_gates, n_shifts]
        (new_programs, weights, shifts,
         n_param_gates) = parameter_shift_util.parse_programs(
             programs, symbol_names, symbol_values, n_symbols)

        m_tile = n_shifts * n_param_gates * n_symbols

        # Transpose to correct shape,
        # [n_programs, n_symbols, n_param_gates, n_shifts],
        # then reshape to the correct batch size
        batch_programs = tf.reshape(tf.transpose(new_programs, [1, 0, 2, 3]),
                                    [n_programs, m_tile])
        batch_weights = tf.reshape(
            tf.transpose(weights, [1, 0, 2, 3]),
            [n_programs, n_symbols, n_param_gates * n_shifts])
        shifts = tf.reshape(tf.transpose(shifts, [1, 0, 2, 3]),
                            [n_programs, m_tile, 1])

        # Append the impurity symbol to the symbol names.
        new_symbol_names = tf.concat([
            symbol_names,
            tf.constant([parameter_shift_util.PARAMETER_IMPURITY_NAME])
        ], 0)

        # Symbol values are the input symbol values, tiled according to
        # `batch_programs`, with the shift values appended.
        tiled_symbol_values = tf.tile(tf.expand_dims(symbol_values, 1),
                                      [1, m_tile, 1])
        batch_symbol_values = tf.concat([tiled_symbol_values, shifts], 2)

        single_program_mapper = tf.reshape(
            tf.range(n_symbols * n_param_gates * n_shifts),
            [n_symbols, n_param_gates * n_shifts])
        batch_mapper = tf.tile(tf.expand_dims(single_program_mapper, 0),
                               [n_programs, 1, 1])

        return (batch_programs, new_symbol_names, batch_symbol_values,
                batch_weights, batch_mapper)
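
# A minimal sketch of how a caller might consume the values returned above:
# gather the flat per-program expectations according to `batch_mapper`, weight
# them with `batch_weights`, and sum over the parameter-shift terms. This
# mirrors the generic differentiator contract, but the executor producing
# `batch_expectations` and the exact reduction are illustrative assumptions,
# not the library's code path.
import tensorflow as tf

def gradients_from_batches(batch_expectations, batch_weights, batch_mapper):
    # batch_expectations: [n_programs, m_tile, n_ops], one value per entry
    # of batch_programs (e.g. from a hypothetical expectation executor).
    # batch_mapper: [n_programs, n_symbols, n_param_gates * n_shifts].
    gathered = tf.gather(batch_expectations, batch_mapper, batch_dims=1)
    # gathered: [n_programs, n_symbols, n_param_gates * n_shifts, n_ops].
    # Weight each shifted term and sum over the shift axis (t).
    return tf.einsum('cst,csto->cso',
                     tf.cast(batch_weights, batch_expectations.dtype),
                     gathered)  # -> [n_programs, n_symbols, n_ops]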
# Example 3
    def differentiate_analytic(self, programs, symbol_names, symbol_values,
                               pauli_sums, forward_pass_vals, grad):
        """Calculate the gradient.

        The gradient calculation follows these steps:

        1. Compute the decomposition of the incoming circuits so that we have
            their generator information (done using cirq in a tf.py_function)
        2. Use formula (31) from the paper inside of TensorFlow to calculate
            gradients from all the decomposed circuits.
        3. Sum up terms and reshape for the total gradient that is compatible
            with TensorFlow.

        **CAUTION**
        Analytic gradient measurements based on this ParameterShift generally
        run at least K(=2) times SLOWER than the original circuit.
        On top of that, since all gate parameters are shifted individually,
        the time complexity is linear in the number of parameterized gates L.
        So you will see O(KL) higher time & space complexity than the
        original forward pass measurements.

        Args:
            programs: `tf.Tensor` of strings with shape [batch_size] containing
                the string representations of the circuits to be executed.
            symbol_names: `tf.Tensor` of strings with shape [n_params], which
                is used to specify the order in which the values in
                `symbol_values` should be placed inside of the circuits in
                `programs`.
            symbol_values: `tf.Tensor` of real numbers with shape
                [batch_size, n_params] specifying parameter values to resolve
                into the circuits specified by programs, following the ordering
                dictated by `symbol_names`.
            pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
                containing the string representation of the operators that will
                be used on all of the circuits in the expectation calculations.
            forward_pass_vals: `tf.Tensor` of real numbers with shape
                [batch_size, n_ops] containing the output of the forward pass
                through the op you are differentiating.
            grad: `tf.Tensor` of real numbers with shape [batch_size, n_ops]
                representing the gradient backpropagated to the output of the
                op you are differentiating through.

        Returns:
            Backward gradient values for each program & each pauli sum. It has
            the shape of [batch_size, n_symbols].
        """

        # these get used a lot
        n_symbols = tf.gather(tf.shape(symbol_names), 0)
        n_programs = tf.gather(tf.shape(programs), 0)
        n_ops = tf.gather(tf.shape(pauli_sums), 1)
        # Assume cirq.decompose() generates gates with at most two distinct
        # eigenvalues, which results in two parameter shifts.
        n_shifts = 2

        # STEP 1: Generate required inputs for executor.
        # Deserialize programs and parse all of the parameterized gates.
        # new_programs has shape [n_symbols, n_programs, n_param_gates,
        # n_shifts] and holds the parameter-shifted versions of the input
        # programs.
        (new_programs, weights, shifts,
         n_param_gates) = parameter_shift_util.parse_programs(
             programs, symbol_names, symbol_values, n_symbols)

        # Reshape & transpose new_programs, weights and shifts to fit into
        # the input format of tensorflow_quantum simulator.
        # [n_symbols, n_param_gates, n_shifts, n_programs]
        new_programs = tf.transpose(new_programs, [0, 2, 3, 1])
        weights = tf.transpose(weights, [0, 2, 3, 1])
        shifts = tf.transpose(shifts, [0, 2, 3, 1])

        # reshape everything to fit into expectation op correctly
        total_programs = n_programs * n_shifts * n_param_gates * n_symbols
        # tile up and then reshape to order programs correctly
        flat_programs = tf.reshape(new_programs, [total_programs])
        flat_shifts = tf.reshape(shifts, [total_programs])

        # tile up and then reshape to order ops correctly
        n_tile = n_shifts * n_param_gates * n_symbols
        flat_perturbations = tf.concat([
            tf.reshape(
                tf.tile(tf.expand_dims(symbol_values, 0),
                        tf.stack([n_tile, 1, 1])), [total_programs, n_symbols]),
            tf.expand_dims(flat_shifts, axis=1)
        ],
                                       axis=1)
        flat_ops = tf.reshape(
            tf.tile(tf.expand_dims(pauli_sums, 0), tf.stack([n_tile, 1, 1])),
            [total_programs, n_ops])
        # Append the impurity symbol to the symbol names.
        new_symbol_names = tf.concat([
            symbol_names,
            tf.expand_dims(tf.constant(
                parameter_shift_util._PARAMETER_IMPURITY_NAME),
                           axis=0)
        ],
                                     axis=0)

        # STEP 2: calculate the required expectation values
        expectations = self.expectation_op(flat_programs, new_symbol_names,
                                           flat_perturbations, flat_ops)

        # STEP 3: generate gradients according to the results

        # we know the rows are grouped according to which parameter
        # was perturbed, so reshape to reflect that
        grouped_expectations = tf.reshape(
            expectations,
            [n_symbols, n_shifts * n_programs * n_param_gates, -1])

        # now we can calculate the partial of the circuit output with
        # respect to each perturbed parameter
        def rearrange_expectations(grouped):

            def split_vertically(i):
                return tf.slice(grouped, [i * n_programs, 0],
                                [n_programs, n_ops])

            return tf.map_fn(split_vertically,
                             tf.range(n_param_gates * n_shifts),
                             dtype=tf.float32)

        # reshape so that expectations calculated on different programs are
        # separated by a dimension
        rearranged_expectations = tf.map_fn(rearrange_expectations,
                                            grouped_expectations)

        # now we will calculate all of the partial derivatives
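        # s: symbol, p: perturbation (param_gate x shift), c: circuit, o: ops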
        partials = tf.einsum(
            'spco,spc->sco', rearranged_expectations,
            tf.cast(
                tf.reshape(weights,
                           [n_symbols, n_param_gates * n_shifts, n_programs]),
                rearranged_expectations.dtype))

        # now apply the chain rule
        return tf.einsum('sco,co -> cs', partials, grad)
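
# A quick numeric check of the two-term shift rule that differentiate_analytic
# implements: for an expectation f(theta) = cos(c * theta) (e.g. <Z> after
# Rx(c * theta)), the weight w = c / 2 and shift offset s = pi / (2 * c) used
# above reproduce the analytic derivative exactly. A minimal standalone
# sketch, independent of TensorFlow Quantum:
import numpy as np

c, theta = 3.0, 0.7
f = lambda t: np.cos(c * t)
w, s = c / 2.0, np.pi / (2.0 * c)
shift_rule_grad = w * (f(theta + s) - f(theta - s))
assert np.isclose(shift_rule_grad, -c * np.sin(c * theta))

# Example 4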
    def test_parse_programs(self):
        """Input & output check for parse_programs()."""
        n_qubits = 5
        n_programs = 3
        n_shifts = 2
        symbol_names = ['a', 'b']
        n_symbols = len(symbol_names)
        sympy_symbols = [sympy.Symbol(s) for s in symbol_names]
        coeff = [1.0, -2.0, 3.0, -4.0, 5.0]
        # Test circuit.
        # (0, 0): ───Rz(1.0*a)────
        #
        # (0, 1): ───Rz(-2.0*b)───
        #
        # (0, 2): ───Rz(3.0*a)────
        #
        # (0, 3): ───Rz(-4.0*b)───
        #
        # (0, 4): ───Rz(5.0*a)────
        q = cirq.GridQubit.rect(1, n_qubits)
        c = cirq.Circuit()
        c.append([
            cirq.Rz(coeff[i] * sympy_symbols[i % 2]).on(q[i])
            for i in range(n_qubits)
        ])
        circuit_batch = [c] * n_programs
        symbol_values_array = np.array(
            [[i for i, _ in enumerate(symbol_names)]
             for _ in range(n_programs)],
            dtype=np.float32)

        symbol_values_tensor = tf.convert_to_tensor(symbol_values_array)
        programs = util.convert_to_tensor(circuit_batch)

        new_programs, weights, shifts, n_param_gates = \
            parameter_shift_util.parse_programs(
                programs, symbol_names, symbol_values_tensor, n_symbols)

        # shape check
        ground_truth_shape = [n_symbols, n_programs, n_param_gates, n_shifts]
        tf.assert_equal(ground_truth_shape, tf.shape(new_programs))
        tf.assert_equal(ground_truth_shape, tf.shape(weights))
        tf.assert_equal(ground_truth_shape, tf.shape(shifts))

        # value check (1) weights
        # The first 1x3x3x2 block holds the +/- coefficients of the Rz gates
        # with symbol 'a'; they are divided by 2 in Rz.
        # [:,:,:,0] holds the original coefficients and [:,:,:,1] their
        # negatives. The second 1x3x3x2 block is for symbol 'b'. There are
        # only 2 gates with symbol 'b', so [1,:,2,:] is zero-padded.
        ground_truth_weights = np.array([[[[0.5, -0.5], [1.5, -1.5],
                                           [2.5, -2.5]],
                                          [[0.5, -0.5], [1.5, -1.5],
                                           [2.5, -2.5]],
                                          [[0.5, -0.5], [1.5, -1.5],
                                           [2.5, -2.5]]],
                                         [[[-1., 1.], [-2., 2.], [0., -0.]],
                                          [[-1., 1.], [-2., 2.], [0., -0.]],
                                          [[-1., 1.], [-2., 2.], [0., -0.]]]])
        self.assertAllClose(ground_truth_weights, weights)
        # value check (2) shifts
        # The divide-by-zero warning from the next line is intentional;
        # please ignore it.
        ground_truth_shifts = np.divide(1, ground_truth_weights) / 4.0 * np.pi
        new_symbol_values_array = np.tile(
            np.expand_dims(np.expand_dims(np.transpose(symbol_values_array,
                                                       [1, 0]),
                                          axis=-1),
                           axis=-1), [1, 1, 3, 2])
        # All inf's should be 0.0. This happens inside parse_programs()
        # with tf.math.divide_no_nan() without any warning.
        ground_truth_shifts[np.where(np.isinf(ground_truth_shifts))] = 0.0
        ground_truth_shifts = new_symbol_values_array + ground_truth_shifts
        self.assertAllClose(ground_truth_shifts, shifts)
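
# The ground-truth tensors above follow directly from the shift rule used by
# parse_programs: weight = coeff / 2 and shift = symbol_value + pi / (4 *
# weight). A minimal reconstruction of the first entries for 'a' (coeff 1.0,
# symbol value 0.0) and 'b' (coeff -2.0, symbol value 1.0):
import numpy as np

for coeff, value, expected in [(1.0, 0.0, 1.5707964), (-2.0, 1.0, 0.21460181)]:
    weight = coeff / 2.0
    shift = value + np.pi / (4.0 * weight)
    assert np.isclose(shift, expected)

# Example 5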
    def test_stochastic_coordinate_preprocessor(self, uniform_sampling, eps):
        """Input & output check for stochastic_coordinate_preprocessor().
        The consistency of the estimated average gradient is checked by:
        //benchmarks/scripts/differentiators:convergence_test"""
        n_qubits = 5
        n_programs = 3
        symbol_names = ['a', 'b']

        programs, symbol_values_tensor, n_symbols, n_shifts = \
            _example_circuit_helper(n_qubits, n_programs)

        n_ops = 2
        ops, psums, _ = _example_ops_helper(n_programs, n_ops)

        new_programs, weights_before, shifts, n_param_gates = \
            parameter_shift_util.parse_programs(
                programs, symbol_names, symbol_values_tensor, n_symbols)

        # All inputs should be TensorFlow tensors. Each invalid call gets its
        # own assertRaises block; statements after the first raise inside a
        # single block would never execute.
        with self.assertRaises(ValueError):
            # symbol_values_array is used instead of symbol_values_tensor.
            sd_util.stochastic_coordinate_preprocessor(
                new_programs, symbol_values_tensor.numpy(), ops,
                weights_before, shifts, n_programs, n_symbols, n_param_gates,
                n_shifts, n_ops, uniform_sampling)
        with self.assertRaises(ValueError):
            # psums is used instead of ops.
            sd_util.stochastic_coordinate_preprocessor(
                new_programs, symbol_values_tensor, psums, weights_before,
                shifts, n_programs, n_symbols, n_param_gates, n_shifts, n_ops,
                uniform_sampling)

        flat_programs, flat_perturbations, flat_ops, _, weights, \
        coordinate_relocator = \
            sd_util.stochastic_coordinate_preprocessor(
                new_programs, symbol_values_tensor, ops, weights_before,
                shifts, n_programs, n_symbols, n_param_gates, n_shifts,
                n_ops, uniform_sampling)

        # Exactly one symbol is sampled per program, so n_symbols is not a
        # factor in the flattened batch size fed to expectation_op or
        # sampling_op.
        total_programs = n_programs * n_param_gates * n_shifts
        # flat_programs should have n_programs * n_param_gates * n_shifts
        # entries because only one symbol is sampled now.
        self.assertAllClose([total_programs],
                            tf.shape(flat_programs),
                            atol=eps,
                            rtol=eps)
        # The perturbation symbol is appended, so the number of symbols
        # should be n_symbols + 1.
        self.assertAllClose([total_programs, n_symbols + 1],
                            tf.shape(flat_perturbations),
                            atol=eps,
                            rtol=eps)
        # shape check on flat_ops.
        self.assertAllClose([total_programs, n_ops],
                            tf.shape(flat_ops),
                            atol=eps,
                            rtol=eps)
        # The resampled weights have shape
        # [n_symbols, n_param_gates, n_shifts, n_programs].
        self.assertAllClose([n_symbols, n_param_gates, n_shifts, n_programs],
                            tf.shape(weights),
                            atol=eps,
                            rtol=eps)
        # The resampled coordinate_relocator has shape
        # [total_programs, n_symbols].
        self.assertAllClose([total_programs, n_symbols],
                            tf.shape(coordinate_relocator),
                            atol=eps,
                            rtol=eps)

        # Estimate the probability of sampling each shift.
        ground_truth_shifts = [[
            1.5707964, -1.5707964, 0.5235988, -0.5235988, 0.31415927,
            -0.31415927
        ], [0.21460181, 1.7853982, 0.6073009, 1.3926991, 1.0, 1.0]]

        ground_truth_pdist = [0.6, 0.4]

        shifts_hist = np.zeros((n_symbols, ))
        n_samples = 700
        cnt = 0.0
        for _ in range(n_samples):
            _, flat_perturbations, _, _, _, _ = \
                sd_util.stochastic_coordinate_preprocessor(
                    new_programs, symbol_values_tensor, ops, weights_before,
                    shifts, n_programs, n_symbols, n_param_gates, n_shifts,
                    n_ops, uniform_sampling)

            for s in flat_perturbations[:, -1]:  # See only shift symbols.
                sym = np.where(np.isclose(ground_truth_shifts, s))[0][0]
                shifts_hist[sym] += 1.0
                cnt += 1.0

        shifts_pdist = shifts_hist / cnt
        self.assertAllClose(ground_truth_pdist,
                            shifts_pdist,
                            atol=eps,
                            rtol=eps)
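
# The expected distribution [0.6, 0.4] above matches sampling each symbol
# proportionally to the summed |weight| of its parameterized gates: 'a' has
# Rz weights 0.5 + 1.5 + 2.5 = 4.5 and 'b' has 1.0 + 2.0 = 3.0. A minimal
# sketch of that assumption about the coordinate sampler:
import numpy as np

summed_weights = np.array([4.5, 3.0])  # per-symbol sum of |weights|
pdist = summed_weights / summed_weights.sum()
print(pdist)  # [0.6 0.4]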
# Example 6
    def differentiate_sampled(self, programs, symbol_names, symbol_values,
                              pauli_sums, num_samples, forward_pass_vals, grad):
        """Compute the sampled gradient with cascaded stochastic processes.
        The gradient calculations follows the following steps:
        1. Compute the decomposition of the incoming circuits so that we have
            their generator information (done using cirq in a tf.py_function)
        2. Construct probability distributions & perform stochastic processes
            to select parameter-shift terms.
            - Stochastic generator: sampling over parameter-shifted gates.
            - Stochastic coordinate: sampling over symbols.
            - Stochastic cost: sampling over pauli sums.
        3. Sum up terms and reshape for the total gradient that is compatible
            with tensorflow differentiation.
        Args:
            programs: `tf.Tensor` of strings with shape [n_programs] containing
                the string representations of the circuits to be executed.
            symbol_names: `tf.Tensor` of strings with shape [n_symbols], which
                is used to specify the order in which the values in
                `symbol_values` should be placed inside of the circuits in
                `programs`.
            symbol_values: `tf.Tensor` of real numbers with shape
                [n_programs, n_symbols] specifying parameter values to resolve
                into the circuits specified by programs, following the ordering
                dictated by `symbol_names`.
            pauli_sums: `tf.Tensor` of strings with shape [n_programs, n_ops]
                representing output observables for each program.
            num_samples: `tf.Tensor` of positive integers representing the
                number of samples per term of each pauli sum used during the
                forward pass.
            forward_pass_vals: `tf.Tensor` of real numbers for forward pass
                values with the shape of [n_programs, n_ops].
            grad: `tf.Tensor` of real numbers for backpropagated gradient
                values from the upper layer with the shape of
                [n_programs, n_ops].
        Returns:
            A `tf.Tensor` of real numbers for the sampled gradients from the
            above samplers, with shape [n_programs, n_symbols].
        """
        n_symbols = tf.gather(tf.shape(symbol_values), 1)
        n_programs = tf.gather(tf.shape(programs), 0)
        n_ops = tf.gather(tf.shape(pauli_sums), 1)
        n_shifts = 2

        # STEP 1: Generate required inputs for executor by using parsers.

        # Deserialize programs and parse all of the parameterized gates.
        # new_programs has shape [n_symbols, n_programs, n_param_gates,
        # n_shifts].
        new_programs, weights, shifts, n_param_gates = \
            parameter_shift_util.parse_programs(
                programs, symbol_names, symbol_values, n_symbols)

        if self.stochastic_generator:
            # Result : [n_symbols, n_programs, n_param_gates=1, n_shifts].
            new_programs, weights, shifts, n_param_gates = \
                sd_util.stochastic_generator_preprocessor(
                    new_programs, weights, shifts, n_programs, n_symbols,
                    n_param_gates, n_shifts, self.uniform_sampling)

        # Reshape & transpose new_programs, weights and shifts to fit into
        # the input format of tensorflow_quantum simulator.
        # [n_symbols, n_param_gates, n_shifts, n_programs]
        new_programs = tf.transpose(new_programs, [0, 2, 3, 1])
        weights = tf.transpose(weights, [0, 2, 3, 1])
        shifts = tf.transpose(shifts, [0, 2, 3, 1])

        if self.stochastic_cost:
            # Result : pauli_sums [n_programs, n_ops] -> [n_programs, n_ops=1]
            pauli_sums, cost_relocator, n_ops = \
                sd_util.stochastic_cost_preprocessor(
                    pauli_sums, n_programs, n_ops, self.uniform_sampling)

        if self.stochastic_coordinate:
            flat_programs, flat_perturbations, flat_ops, flat_num_samples, \
            weights, coordinate_relocator = \
                sd_util.stochastic_coordinate_preprocessor(
                    new_programs, symbol_values, pauli_sums, weights, shifts,
                    n_programs, n_symbols, n_param_gates, n_shifts, n_ops,
                    self.uniform_sampling, num_samples=num_samples)
        else:
            # reshape everything to fit into expectation op correctly
            total_programs = n_programs * n_shifts * n_symbols * n_param_gates
            # tile up and then reshape to order programs correctly
            flat_programs = tf.reshape(new_programs, [total_programs])
            flat_shifts = tf.reshape(shifts, [total_programs])

            # tile up and then reshape to order ops correctly
            n_tile = n_shifts * n_symbols * n_param_gates
            flat_perturbations = tf.concat([
                tf.reshape(
                    tf.tile(tf.expand_dims(symbol_values, 0),
                            tf.stack([n_tile, 1, 1])),
                    [total_programs, n_symbols]),
                tf.expand_dims(flat_shifts, axis=1)
            ],
                                           axis=1)
            flat_ops = tf.reshape(
                tf.tile(tf.expand_dims(pauli_sums, 0),
                        tf.stack([n_tile, 1, 1])), [total_programs, n_ops])
            flat_num_samples = tf.reshape(
                tf.tile(tf.expand_dims(num_samples, 0),
                        tf.stack([n_tile, 1, 1])), [total_programs, n_ops])

        # Append the impurity symbol to the symbol names.
        new_symbol_names = tf.concat([
            symbol_names,
            tf.expand_dims(tf.constant(
                parameter_shift_util._PARAMETER_IMPURITY_NAME),
                           axis=0)
        ],
                                     axis=0)

        # STEP 2: calculate the required expectation values
        expectations = self.expectation_op(flat_programs, new_symbol_names,
                                           flat_perturbations, flat_ops,
                                           flat_num_samples)

        # STEP 3: generate gradients according to the results
        if self.stochastic_coordinate:
            # Transpose to the original shape
            # [n_symbols, n_programs, n_param_gates, n_shifts]
            #
            # coordinate_relocator has [sub_total_programs, n_symbols](=ij)
            # expectations has [sub_total_programs, n_ops](=ik)
            # einsum -> [n_ops, n_symbols, sub_total_programs](=kji)
            expectations = tf.einsum(
                'ij,ik->kji', tf.cast(coordinate_relocator, dtype=tf.float64),
                tf.cast(expectations, dtype=tf.float64))
            # Transpose to [n_symbols, sub_total_programs, n_ops]
            expectations = tf.transpose(expectations, [1, 2, 0])

        # we know the rows are grouped according to which parameter
        # was perturbed, so reshape to reflect that
        grouped_expectations = tf.reshape(
            tf.cast(expectations, dtype=tf.float64),
            [n_symbols, n_shifts * n_programs * n_param_gates, -1])

        # now we can calculate the partial of the circuit output with
        # respect to each perturbed parameter
        def rearrange_expectations(grouped):

            def split_vertically(i):
                return tf.slice(grouped, [i * n_programs, 0],
                                [n_programs, n_ops])

            return tf.map_fn(split_vertically,
                             tf.range(n_param_gates * n_shifts),
                             dtype=tf.float64)

        # reshape so that expectations calculated on different programs are
        # separated by a dimension
        rearranged_expectations = tf.map_fn(rearrange_expectations,
                                            grouped_expectations,
                                            dtype=tf.float64)

        # now we will calculate all of the partial derivatives
        # s: symbol, p: perturbation, c: circuit, o: ops
        partials = tf.einsum(
            'spco,spc->sco', rearranged_expectations,
            tf.cast(tf.reshape(
                weights, [n_symbols, n_param_gates * n_shifts, n_programs]),
                    dtype=tf.float64))

        if self.stochastic_cost:
            # Reshape to the original n_ops shape
            # partials: [n_symbols, n_programs, n_ops=1]
            # cost_relocator: [n_programs, original_n_ops]
            # Result: [n_symbols, n_programs, original_n_ops]
            partials = partials * tf.stop_gradient(
                tf.cast(cost_relocator, dtype=tf.float64))

        # now apply the chain rule
        # cast partials back to float32
        return tf.cast(
            tf.einsum('sco,co -> cs', partials,
                      tf.cast(grad, dtype=tf.float64)), tf.float32)
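
# Why the cascaded samplers above still give the correct gradient on average:
# if term t_i is drawn with probability p_i and reweighted by 1 / p_i, the
# single-sample estimator t_i / p_i is unbiased for sum_i t_i. A minimal
# Monte Carlo sketch of that importance-sampling identity (generic NumPy,
# not the library's internals):
import numpy as np

rng = np.random.default_rng(0)
terms = np.array([0.5, -1.5, 2.5])           # e.g. per-gate gradient terms
probs = np.abs(terms) / np.abs(terms).sum()  # sample proportional to |term|
idx = rng.choice(len(terms), size=100_000, p=probs)
estimate = np.mean(terms[idx] / probs[idx])
assert np.isclose(estimate, terms.sum(), atol=0.1)  # ~1.5 in expectation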