Example #1
def test_find_bad_by_ransac(raw_tmp):
    """Test the RANSAC component of NoisyChannels."""
    # Set a consistent random seed for all RANSAC runs
    RANSAC_RNG = 435656

    # RANSAC identifies channels that go bad together and are highly correlated.
    # Insert a highly correlated 30 Hz signal into the first six channels (0-5)
    raw_tmp._data[0:6, :] = _generate_signal(30, 30, raw_tmp.times)

    # Run different variations of RANSAC on the same data
    test_matrix = {
        # List items represent [matlab_strict, channel_wise, max_chunk_size]
        "by_window": [False, False, None],
        "by_channel": [False, True, None],
        "by_channel_maxchunk": [False, True, 2],
        "by_window_strict": [True, False, None],
        "by_channel_strict": [True, True, None],
    }
    bads = {}
    corr = {}
    for name, args in test_matrix.items():
        nd = NoisyChannels(raw_tmp,
                           do_detrend=False,
                           random_state=RANSAC_RNG,
                           matlab_strict=args[0])
        nd.find_bad_by_ransac(channel_wise=args[1], max_chunk_size=args[2])
        # Save bad channels and RANSAC correlation matrix for later comparison
        bads[name] = nd.bad_by_ransac
        corr[name] = nd._extra_info["bad_by_ransac"]["ransac_correlations"]

    # Test whether all methods detected bad channels properly
    assert bads["by_window"] == raw_tmp.ch_names[0:6]
    assert bads["by_channel"] == raw_tmp.ch_names[0:6]
    assert bads["by_channel_maxchunk"] == raw_tmp.ch_names[0:6]
    assert bads["by_window_strict"] == raw_tmp.ch_names[0:6]
    assert bads["by_channel_strict"] == raw_tmp.ch_names[0:6]

    # Make sure non-strict correlation matrices all match
    assert np.allclose(corr["by_window"], corr["by_channel"])
    assert np.allclose(corr["by_window"], corr["by_channel_maxchunk"])

    # Make sure MATLAB-strict correlation matrices match
    assert np.allclose(corr["by_window_strict"], corr["by_channel_strict"])

    # Make sure strict and non-strict matrices differ
    assert not np.allclose(corr["by_window"], corr["by_window_strict"])

    # Ensure that RANSAC doesn't change random state if in MATLAB-strict mode
    rng = RandomState(RANSAC_RNG)
    init_state = rng.get_state()[2]
    nd = NoisyChannels(raw_tmp,
                       do_detrend=False,
                       random_state=rng,
                       matlab_strict=True)
    nd.find_bad_by_ransac()
    assert rng.get_state()[2] == init_state
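
The final assertion reads element 2 of the state tuple, which for the MT19937 generator is the position in the key stream. A minimal sketch of the snapshot/restore round-trip the test relies on (plain numpy, independent of the fixtures above):

from numpy.random import RandomState

rng = RandomState(435656)
state = rng.get_state()    # ('MT19937', key array, pos, has_gauss, cached_gaussian)
pos_before = state[2]      # element 2: position in the MT19937 key stream
rng.standard_normal(10)    # drawing numbers advances the position
assert rng.get_state()[2] != pos_before
rng.set_state(state)       # restoring the snapshot rewinds the generator
assert rng.get_state()[2] == pos_before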
Example #2
    def test_RandomRectangularPattern_ca_3ch_postit(self):
        rso = RandomState(1)
        state_tuple = rso.get_state()
        t = image_triggers.RandomRectangularPattern(
            3,
            3,
            3,
            color_algorithm='channel_assign',
            color_options={'cval': [255, 254, 253]},
            pattern_style='postit',
            random_state_obj=rso)
        actual_img = t.get_data()
        actual_mask = t.get_mask()

        # reset the random state and generate the pattern in the same manner
        rso.set_state(state_tuple)
        per_chan_expected_img = rso.choice(2, 3 * 3).reshape(
            (3, 3)).astype(bool)
        expected_img = np.zeros((3, 3, 3))
        expected_img[:, :, 0] = per_chan_expected_img * 255  # channel 0 gets cval[0]
        expected_img[:, :, 1] = per_chan_expected_img * 254  # channel 1 gets cval[1]
        expected_img[:, :, 2] = per_chan_expected_img * 253  # channel 2 gets cval[2]
        expected_mask = np.ones((3, 3)).astype(bool)
        self.assertTrue(np.array_equal(actual_img, expected_img))
        self.assertTrue(np.array_equal(actual_mask, expected_mask))
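
The three per-channel assignments above can be collapsed into a single broadcast; a small equivalent sketch, assuming the same 3x3 pattern and cval list:

import numpy as np
from numpy.random import RandomState

rso = RandomState(1)
pattern = rso.choice(2, 3 * 3).reshape((3, 3)).astype(bool)
cval = np.array([255, 254, 253])
# pattern[:, :, None] broadcasts the boolean mask across the color axis,
# multiplying each channel by its assigned value in one step
expected_img = pattern[:, :, None] * cval
assert expected_img.shape == (3, 3, 3)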
Example #3
def make_trial():
    try:
        # t0 = time()
        # args:
        #   lenscale
        #   nPassiveObs
        #   rngstate

        params = request.json  # works when the ajax request contentType is specified as "application/json"
        # unpack random number generator
        rng = RandomState()
        rngstate = unpack_rngstate(params['rngstate'])
        rng.set_state(rngstate)

        lenscale = float(params['lenscale'])
        nPassiveObs = int(params['nPassiveObs'])

        thisTri = boe.make_trial(nPassiveObs, DOMAIN, lenscale, SIGVAR, NOISEVAR2, XSAM_BOUNDS, rng)

        resp = {'sample': thisTri['sample'].tolist(),
                'xObs': thisTri['xObs'].flatten().tolist(),
                'yObs': thisTri['yObs'].tolist(),
                'iObs': thisTri['iObs'].tolist(),
                'rngstate': pack_rngstate(rng.get_state())}

    except Exception:
        raise ExperimentError('improper_inputs')  # i don't like returning HTML to JSON requests...  maybe should change this

    return jsonify(**resp)
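
pack_rngstate and unpack_rngstate are not shown in this snippet; a plausible minimal implementation (hypothetical, mirroring the JSON handling in Example #8 below) converts the uint32 key array to a plain list so the state survives a JSON round trip:

import numpy as np

def pack_rngstate(state):
    # Hypothetical helper: element 1 of the state tuple is a uint32 array,
    # which JSON can't encode, so convert it to a list
    name, key, pos, has_gauss, cached = state
    return [name, key.tolist(), pos, has_gauss, cached]

def unpack_rngstate(packed):
    # Inverse: rebuild the tuple, restoring the numpy array element
    name, key, pos, has_gauss, cached = packed
    return (name, np.asarray(key, dtype=np.uint32), pos, has_gauss, cached)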
Example #4
def init_experiment():
    if 'condition' not in request.args:
        raise ExperimentError('improper_inputs')  # i don't like returning HTML to JSON requests...  maybe should change this

    condition = int(request.args['condition'])
    counterbalance = int(request.args['counterbalance'])

    ## END FREE VARS
    lenscale = LENSCALEPOOL[condition]
    rngseed = RNGSEEDPOOL[counterbalance]
    rng = RandomState(rngseed)

    experParams = {
                   'nTrial': NTRIAL,
                   'nPassivePool': NPASSIVEPOOL,
                   'nActivePool': NACTIVEPOOL,
                   'rng': rng
                   }

    subParams = boe.make_experiment(**experParams)

    # bundle response to send
    resp = {}
    for f in subParams:
        try:
            resp[f] = subParams[f].tolist()
        except AttributeError:
            resp[f] = subParams[f]

    for f in experParams:
        if f != 'rng':  # don't pass the random number generator
            try:  # convert numpy array to list if possible
                resp[f] = experParams[f].tolist()
            except AttributeError:
                resp[f] = experParams[f]

    resp['itrial'] = -1
    resp['isam3'] = -1
    resp['nTrial'] = NTRIAL
    resp['costToDrill'] = COSTTODRILL
    resp['costToSample'] = COSTTOSAMPLE
    resp['initscore'] = INITSCORE
    resp['lenscale'] = lenscale
    resp['sigvar'] = SIGVAR
    resp['domainbounds'] = DOMAINBOUNDS
    resp['domainres'] = DOMAINRES
    resp['edgebuf'] = EDGEBUF

    resp['rngstate'] = pack_rngstate(rng.get_state())

    return jsonify(**resp)
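
The try/except-.tolist() pattern appears twice above; a small helper (hypothetical, not in the original app) makes the intent explicit:

def jsonable(value):
    # numpy arrays expose .tolist(); plain Python values pass through unchanged
    try:
        return value.tolist()
    except AttributeError:
        return value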
Example #5
    def test_iter_equal_pre_gen(self, dataset_cls):
        """Given the same random-number-generator state,
        both pre-generated and lazily generated data-set should result
        in the same samples."""
        size = self.PRE_GEN_DATASET_DEFAULT_SIZE
        seed = self.DATASETS_DEFAULT_SEED

        pre_generated = dataset_cls.pre_generate(size, seed)
        lazily_generated = dataset_cls()
        rnd = RandomState(seed)
        lazily_generated.random_state = rnd.get_state()

        for i, sample in enumerate(lazily_generated):
            if i >= size:  # compare every pre-generated sample
                break
            assert sample == pre_generated[i], \
                f'Samples at iteration {i} don\'t match!'
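
The property this test depends on, in isolation: two RandomState instances with identical state produce identical streams, regardless of how each was seeded. A quick sketch:

from numpy.random import RandomState

a = RandomState(7)
b = RandomState()           # seeded from entropy; the seed is irrelevant here
b.set_state(a.get_state())  # copy the state, not the seed
assert (a.randint(0, 100, size=5) == b.randint(0, 100, size=5)).all()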
Example #6
def test_unequal_reset():
    def mean_diff(*args):
        return args[0].mean() - args[1].mean()

    rs = RandomState(0)
    x = rs.standard_normal(800)
    y = rs.standard_normal(200)
    orig_state = rs.get_state()
    bs = IndependentSamplesBootstrap(x, y, random_state=rs)
    variance = bs.var(mean_diff)
    assert variance > 0
    bs.reset()
    state = bs.get_state()
    assert_equal(state[1], orig_state[1])

    bs = IndependentSamplesBootstrap(x, y)
    bs.seed(0)
    orig_state = bs.get_state()
    bs.var(mean_diff)
    bs.reset(use_seed=True)
    state = bs.get_state()
    assert_equal(state[1], orig_state[1])
Example #7
def test_unequal_reset():
    def mean_diff(*args):
        return args[0].mean() - args[1].mean()

    rs = RandomState(0)
    x = rs.randn(800)
    y = rs.randn(200)
    orig_state = rs.get_state()
    bs = IndependentSamplesBootstrap(x, y, random_state=rs)
    variance = bs.var(mean_diff)
    assert variance > 0
    bs.reset()
    state = bs.get_state()
    assert_equal(state[1], orig_state[1])

    bs = IndependentSamplesBootstrap(x, y)
    bs.seed(0)
    orig_state = bs.get_state()
    bs.var(mean_diff)
    bs.reset(use_seed=True)
    state = bs.get_state()
    assert_equal(state[1], orig_state[1])
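
Both variants compare element 1 of the state tuple: the 624-word MT19937 key vector, which is what actually changes as numbers are drawn. A sketch of why reset() restores it:

from numpy.random import RandomState

rs = RandomState(0)
orig_state = rs.get_state()
key_before = orig_state[1].copy()  # element 1: the MT19937 key vector
rs.standard_normal(1000)           # drawing regenerates the key block
assert not (rs.get_state()[1] == key_before).all()
rs.set_state(orig_state)           # what reset() does with the saved initial state
assert (rs.get_state()[1] == key_before).all()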
Example #8
class NumpyRandomGenerator(Generator):
    """
        Generator wrapping any numpy.Random method.
    """
    def __init__(self, method, seed, **numpy_parameters):
        """Initialise a random number generator

        :param method: string: must be a valid numpy.Randomstate method that
            accept the "size" parameter

        :param numpy_parameters: dict, see descriptions below
        :param seed: int, seed of the generator
        :return: create a random number generator of type "gen_type", with its parameters and seeded.
        """
        Generator.__init__(self)
        self.method = method
        self.numpy_parameters = numpy_parameters
        self.state = RandomState(seed)
        self.numpy_method = getattr(self.state, method)

    def generate(self, size):
        all_params = merge_2_dicts({"size": size}, self.numpy_parameters)
        return self.numpy_method(**all_params)

    def description(self):
        return {
            "type": "NumpyRandomGenerator",
            "method": self.method,
            "numpy_parameters": self.numpy_parameters
        }

    def save_to(self, output_file):

        logging.info("saving generator to {}".format(output_file))

        # saving the numpy RandomState instance, converting the numpy array
        # to enable json serialization
        np_state = self.state.get_state()
        state = {
            "method": self.method,
            "numpy_parameters": self.numpy_parameters,
            "numpy_state": (np_state[0], np_state[1].tolist(), np_state[2],
                            np_state[3], np_state[4])
        }
        with open(output_file, "w") as outf:
            json.dump(state, outf, indent=4)

    @staticmethod
    def load_from(input_file):

        logging.info("loading numpy generator from {}".format(input_file))

        with open(input_file, "r") as inf:
            json_payload = json.load(inf)

            # Initializing the generator with an incorrect seed just to make
            # the constructor happy, then setting the state
            gen = NumpyRandomGenerator(method=json_payload["method"],
                                       seed=1234,
                                       **json_payload["numpy_parameters"])

            # retrieving the numpy state + converting list to np.array as needed
            state_raw_ = json_payload["numpy_state"]
            np_state = (state_raw_[0], np.array(state_raw_[1]), state_raw_[2],
                        state_raw_[3], state_raw_[4])

            gen.state.set_state(np_state)
            return gen
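
A usage sketch for the save_to/load_from pair: saving mid-stream and reloading should leave the clone bit-for-bit in sync with the original (file name arbitrary):

gen = NumpyRandomGenerator(method="normal", seed=42, loc=0.0, scale=1.0)
gen.generate(size=10)                          # advance the state a bit
gen.save_to("normal_gen.json")
clone = NumpyRandomGenerator.load_from("normal_gen.json")
assert (gen.generate(size=5) == clone.generate(size=5)).all()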
Example #9

    # setup data generation
    # Setup the files based on user inputs
    data_folder = os.path.abspath(a.data_folder)
    toplevel_folder = a.experiment_path

    # check if the data_folder has the cifar10 data, if not download it
    data_folder = cifar10.download_and_extract(data_folder)

    train_output_csv_file = 'train_cifar10.csv'
    test_output_csv_file = 'test_cifar10.csv'

    MASTER_SEED = 1234
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()

    # define a configuration which triggers data by applying the Gotham Instagram Filter
    datagen_per_class_trigger_frac = 0.25
    gotham_trigger_cfg = \
        tdc.XFormMergePipelineConfig(
            # setup the list of possible triggers that will be inserted into the CIFAR10 data.
            trigger_list=[],
            # tell the trigger inserter the probability of sampling each type of trigger specified in the trigger
            # list.  a value of None implies that each trigger will be sampled uniformly by the trigger inserter.
            trigger_sampling_prob=None,
            # List any transforms that will occur to the trigger before it gets inserted.  In this case, we do none.
            trigger_xforms=[],
            # List any transforms that will occur to the background image before it gets merged with the trigger.
            trigger_bg_xforms=[tinstx.GothamFilterXForm()],
            # List how we merge the trigger and the background.  Because we don't insert a point trigger,
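
The start_state captured above supports a common pipeline pattern: rewind the master generator before each stage so every stage sees identical randomness. In isolation:

from numpy.random import RandomState

master = RandomState(1234)
start_state = master.get_state()
first = master.randint(0, 10, size=3)
master.set_state(start_state)      # rewind before the next stage
second = master.randint(0, 10, size=3)
assert (first == second).all()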
Example #10
class IIDBootstrap(object):
    """
    Bootstrap using uniform resampling

    Parameters
    ----------
    args
        Positional arguments to bootstrap
    kwargs
        Keyword arguments to bootstrap

    Attributes
    ----------
    index : array
        The current index of the bootstrap
    data : tuple
        Two-element tuple with the pos_data in the first position and kw_data
        in the second (pos_data, kw_data)
    pos_data : tuple
        Tuple containing the positional arguments (in the order entered)
    kw_data : dict
        Dictionary containing the keyword arguments
    random_state : RandomState
        RandomState instance used by bootstrap

    Notes
    -----
    Supports numpy arrays and pandas Series and DataFrames.  Data returned has
    the same type as the input data.

    Data entered using keyword arguments is directly accessible as an
    attribute.

    Examples
    --------
    Data can be accessed in a number of ways.  Positional data is retained in
    the same order as it was entered when the bootstrap was initialized.
    Keyword data is available both as an attribute or using a dictionary syntax
    on kw_data.

    >>> from arch.bootstrap import IIDBootstrap
    >>> from numpy.random import standard_normal
    >>> y = standard_normal((500, 1))
    >>> x = standard_normal((500,2))
    >>> z = standard_normal(500)
    >>> bs = IIDBootstrap(x, y=y, z=z)
    >>> for data in bs.bootstrap(100):
    ...     bs_x = data[0][0]
    ...     bs_y = data[1]['y']
    ...     bs_z = bs.z
    """
    def __init__(self, *args, **kwargs):
        self.random_state = RandomState()
        self._initial_state = self.random_state.get_state()
        self._args = args
        self._kwargs = kwargs
        if args:
            self._num_items = len(args[0])
        elif kwargs:
            key = list(kwargs.keys())[0]
            self._num_items = len(kwargs[key])

        all_args = list(args)
        all_args.extend([v for v in itervalues(kwargs)])

        for arg in all_args:
            if len(arg) != self._num_items:
                raise ValueError("All inputs must have the same number of "
                                 "elements in axis 0")
        self._index = np.arange(self._num_items)

        self._parameters = []
        self._seed = None
        self.pos_data = args
        self.kw_data = kwargs
        self.data = (args, kwargs)

        self._base = None
        self._results = None
        self._studentized_results = None
        self._last_func = None
        self._name = 'IID Bootstrap'
        for key, value in iteritems(kwargs):
            attr = getattr(self, key, None)
            if attr is None:
                self.__setattr__(key, value)
            else:
                raise ValueError(key + ' is a reserved name')

    def __str__(self):
        repr = self._name
        repr += '(no. pos. inputs: ' + str(len(self.pos_data))
        repr += ', no. keyword inputs: ' + str(len(self.kw_data)) + ')'
        return repr

    def __repr__(self):
        return self.__str__()[:-1] + ', ID: ' + hex(id(self)) + ')'

    def _repr_html(self):
        html = '<strong>' + self._name + '</strong>('
        html += '<strong>no. pos. inputs</strong>: ' + str(len(self.pos_data))
        html += ', <strong>no. keyword inputs</strong>: ' + \
                str(len(self.kw_data))
        html += ', <strong>ID</strong>: ' + hex(id(self)) + ')'
        return html

    @property
    def index(self):
        """
        Returns the current index of the bootstrap
        """
        return self._index

    def get_state(self):
        """
        Gets the state of the bootstrap's random number generator

        Returns
        -------
        state : tuple
            Tuple containing the generator state, as returned by
            RandomState.get_state
        """
        return self.random_state.get_state()

    def set_state(self, state):
        """
        Sets the state of the bootstrap's random number generator

        Parameters
        ----------
        state : tuple
            Tuple containing the generator state, as accepted by
            RandomState.set_state
        """

        return self.random_state.set_state(state)

    def seed(self, value):
        """
        Seeds the bootstrap's random number generator

        Parameters
        ----------
        value : int
            Integer to use as the seed
        """
        self._seed = value
        self.random_state.seed(value)
        return None

    def reset(self, use_seed=True):
        """
        Resets the bootstrap to either its initial state or the last seed.

        Parameters
        ----------
        use_seed : bool, optional
            Flag indicating whether to use the last seed if provided.  If
            False or if no seed has been set, the bootstrap will be reset
            to the initial state.  Default is True
        """
        self._index = np.arange(self._num_items)
        self._resample()
        self.random_state.set_state(self._initial_state)
        if use_seed and self._seed is not None:
            self.seed(self._seed)
        return None

    def bootstrap(self, reps):
        """
        Iterator for use when bootstrapping

        Parameters
        ----------
        reps : int
            Number of bootstrap replications

        Example
        -------
        The key steps are problem dependent and so this example shows the use
        as an iterator that does not produce any output

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> bs = IIDBootstrap(np.arange(100), x=np.random.randn(100))
        >>> for posdata, kwdata in bs.bootstrap(1000):
        ...     # Do something with the positional data and/or keyword data
        ...     pass

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        Notes
        -----
        The iterator returns a tuple containing the data entered in positional
        arguments as a tuple and the data entered using keywords as a
        dictionary
        """
        for _ in range(reps):
            indices = np.asarray(self.update_indices())
            self._index = indices
            yield self._resample()

    def conf_int(self,
                 func,
                 reps=1000,
                 method='basic',
                 size=0.95,
                 tail='two',
                 extra_kwargs=None,
                 reuse=False,
                 sampling='nonparametric',
                 std_err_func=None,
                 studentize_reps=1000):
        """
        Parameters
        ----------
        func : callable
            Function the computes parameter values.  See Notes for requirements
        reps : int, optional
            Number of bootstrap replications
        method : string, optional
            One of 'basic', 'percentile', 'studentized', 'norm' (identical to
            'var', 'cov'), 'bc' (identical to 'debiased', 'bias-corrected'), or
            'bca'
        size : float, optional
            Coverage of confidence interval
        tail : string, optional
            One of 'two', 'upper' or 'lower'.
        reuse : bool, optional
            Flag indicating whether to reuse previously computed bootstrap
            results.  This allows alternative methods to be compared without
            rerunning the bootstrap simulation.  Reuse is ignored if reps is
            not the same across multiple runs, func changes across calls, or
            method is 'studentized'.
        sampling : string, optional
            Type of sampling to use: 'nonparametric', 'semi-parametric' (or
            'semi') or 'parametric'.  The default is 'nonparametric'.  See
            notes about the changes to func required when using 'semi' or
            'parametric'.
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func and std_err_func,
            when appropriate
        std_err_func : callable, optional
            Function to use when standardizing estimated parameters when using
            the studentized bootstrap.  Providing an analytical function
            eliminates the need for a nested bootstrap
        studentize_reps : int, optional
            Number of bootstraps to use in the inner component when using the
            studentized bootstrap.  Ignored when ``std_err_func`` is provided

        Returns
        -------
        intervals : 2-d array
            Computed confidence interval.  Row 0 contains the lower bounds, and
            row 1 contains the upper bounds.  Each column corresponds to a
            parameter. When tail is 'lower', all upper bounds are inf.
            Similarly, 'upper' sets all lower bounds to -inf.

        Examples
        --------
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(0)
        >>> y = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(y)
        >>> ci = bs.conf_int(func, 1000)

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func.

        The standard error function, if provided, must return a vector of
        parameter standard errors and is called

        .. code:: python

            std_err_func(params, *args, **kwargs)

        where ``params`` is the vector of estimated parameters using the same
        bootstrap data as in args and kwargs.

        The bootstraps are:

        * 'basic' - Basic confidence using the estimated parameter and
          difference between the estimated parameter and the bootstrap
          parameters
        * 'percentile' - Direct use of bootstrap percentiles
        * 'norm' - Makes use of normal approximation and bootstrap covariance
          estimator
        * 'studentized' - Uses either a standard error function or a nested
          bootstrap to estimate percentiles and the bootstrap covariance for
          scale
        * 'bc' - Bias corrected using estimate bootstrap bias correction
        * 'bca' - Bias corrected and accelerated, adding acceleration parameter
          to 'bc' method

        """
        studentized = 'studentized'
        if not 0.0 < size < 1.0:
            raise ValueError('size must be strictly between 0 and 1')
        tail = tail.lower()
        if tail not in ('two', 'lower', 'upper'):
            raise ValueError("tail must be one of 'two', 'lower' or 'upper'")
        studentize_reps = studentize_reps if method == studentized else 0

        _reuse = False
        if reuse:
            # check conditions for reuse
            _reuse = (self._results is not None and len(self._results) == reps
                      and method != studentized and self._last_func is func)

        if not _reuse:
            if reuse:
                import warnings

                warn = 'The conditions to reuse the previous bootstrap have ' \
                       'not been satisfied. A new bootstrap will be used.'
                warnings.warn(warn, RuntimeWarning)
            self._construct_bootstrap_estimates(
                func,
                reps,
                extra_kwargs,
                std_err_func=std_err_func,
                studentize_reps=studentize_reps,  # noqa
                sampling=sampling)

        base, results = self._base, self._results
        studentized_results = self._studentized_results

        std_err = []
        if method in ('norm', 'var', 'cov', studentized):
            errors = results - results.mean(axis=0)
            std_err = np.sqrt(np.diag(errors.T.dot(errors) / reps))

        if tail == 'two':
            alpha = (1.0 - size) / 2
        else:
            alpha = (1.0 - size)

        percentiles = [alpha, 1.0 - alpha]
        norm_quantiles = stats.norm.ppf(percentiles)

        if method in ('norm', 'var', 'cov'):
            lower = base + norm_quantiles[0] * std_err
            upper = base + norm_quantiles[1] * std_err

        elif method in ('percentile', 'basic', studentized, 'debiased', 'bc',
                        'bias-corrected', 'bca'):
            values = results
            if method == studentized:
                # studentized uses studentized parameter estimates
                values = studentized_results

            if method in ('debiased', 'bc', 'bias-corrected', 'bca'):
                # bias corrected uses modified percentiles, but is
                # otherwise identical to the percentile method
                p = (results < base).mean(axis=0)
                b = stats.norm.ppf(p)
                b = b[:, None]
                if method == 'bca':
                    nobs = self._num_items
                    jk_params = _loo_jackknife(func, nobs, self._args,
                                               self._kwargs)
                    u = (nobs - 1) * (jk_params - base)
                    numer = np.sum(u**3, 0)
                    denom = 6 * (np.sum(u**2, 0)**(3.0 / 2.0))
                    small = denom < (np.abs(numer) * np.finfo(np.float64).eps)
                    if small.any():
                        message = 'Jackknife variance estimate {jk_var} is ' \
                                  'too small to use BCa'
                        raise RuntimeError(message.format(jk_var=denom))
                    a = numer / denom
                    a = a[:, None]
                else:
                    a = 0.0

                percentiles = stats.norm.cdf(b + (b + norm_quantiles) /
                                             (1.0 - a * (b + norm_quantiles)))
                percentiles = list(100 * percentiles)
            else:
                percentiles = [100 * p for p in percentiles]  # Rescale

            if method not in ('bc', 'debiased', 'bias-corrected', 'bca'):
                ci = np.asarray(np.percentile(values, percentiles, axis=0))
                lower = ci[0, :]
                upper = ci[1, :]
            else:
                k = values.shape[1]
                lower = np.zeros(k)
                upper = np.zeros(k)
                for i in range(k):
                    lower[i], upper[i] = np.percentile(values[:, i],
                                                       list(percentiles[i]))

            # Basic and studentized use the lower empirical quantile to
            # compute upper and vice versa.  Bias corrected and percentile use
            # upper to estimate the upper, and lower to estimate the lower
            if method == 'basic':
                lower_copy = lower + 0.0
                lower = 2.0 * base - upper
                upper = 2.0 * base - lower_copy
            elif method == studentized:
                lower_copy = lower + 0.0
                lower = base - upper * std_err
                upper = base - lower_copy * std_err

        else:
            raise ValueError('Unknown method')

        if tail == 'lower':
            upper = np.zeros_like(base)
            upper.fill(np.inf)
        elif tail == 'upper':
            lower = np.zeros_like(base)
            lower.fill(-1 * np.inf)

        return np.vstack((lower, upper))

    def clone(self, *args, **kwargs):
        """
        Clones the bootstrap using different data.

        Parameters
        ----------
        args
            Positional arguments to bootstrap
        kwargs
            Keyword arguments to bootstrap

        Returns
        -------
        bs
            Bootstrap instance
        """
        pos_arguments = copy.deepcopy(self._parameters)
        pos_arguments.extend(args)
        bs = self.__class__(*pos_arguments, **kwargs)
        if self._seed is not None:
            bs.seed(self._seed)
        return bs

    def apply(self, func, reps=1000, extra_kwargs=None):
        """
        Applies a function to bootstrap replicated data

        Parameters
        ----------
        func : callable
            Function the computes parameter values.  See Notes for requirements
        reps : int, optional
            Number of bootstrap replications
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func.  Must not
            conflict with keyword arguments used to initialize bootstrap

        Returns
        -------
        results : array
            reps by nparam array of computed function values where each row
            corresponds to a bootstrap iteration

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func

        Examples
        --------
        >>> import numpy as np
        >>> x = np.random.randn(1000,2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(x)
        >>> def func(y):
        ...     return y.mean(0)
        >>> results = bs.apply(func, 100)
        """
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)
        try:
            num_params = base.shape[0]
        except AttributeError:
            num_params = 1
        results = np.zeros((reps, num_params))
        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            results[count] = func(*pos_data, **kwargs)
            count += 1
        return results

    def _construct_bootstrap_estimates(self,
                                       func,
                                       reps,
                                       extra_kwargs=None,
                                       std_err_func=None,
                                       studentize_reps=0,
                                       sampling='nonparametric'):
        # Private, more complicated version of apply
        self._last_func = func
        semi = parametric = False
        if sampling == 'parametric':
            parametric = True
        elif sampling in ('semi', 'semi-parametric', 'semiparametric'):
            semi = True

        if extra_kwargs is not None:
            if any(k in self._kwargs for k in extra_kwargs):
                raise ValueError('extra_kwargs contains keys used for variable'
                                 ' names in the bootstrap')
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)

        num_params = 1 if np.isscalar(base) else base.shape[0]
        results = np.zeros((reps, num_params))
        studentized_results = np.zeros((reps, num_params))

        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            if parametric:
                kwargs['state'] = self.random_state
                kwargs['params'] = base
            elif semi:
                kwargs['params'] = base
            results[count] = func(*pos_data, **kwargs)
            if std_err_func is not None:
                std_err = std_err_func(results[count], *pos_data, **kwargs)
                studentized_results[count] = (results[count] - base) / std_err
            elif studentize_reps > 0:
                # Need new bootstrap of same type
                nested_bs = self.clone(*pos_data, **kw_data)
                # Set the seed to ensure reproducibility
                seed = self.random_state.randint(2**31 - 1)
                nested_bs.seed(seed)
                cov = nested_bs.cov(func,
                                    studentize_reps,
                                    extra_kwargs=extra_kwargs)
                std_err = np.sqrt(np.diag(cov))
                studentized_results[count] = (results[count] - base) / std_err
            count += 1

        self._base = np.asarray(base)
        self._results = np.asarray(results)
        self._studentized_results = np.asarray(studentized_results)

    def cov(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter covariance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs: dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        cov: array
            Bootstrap covariance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where ``params`` is a 1-dimensional array, and `*args` and `**kwargs` are
        data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the estimate
        computed using the original data in bootstrap replications.  This
        parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap covariance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> cov = bs.cov(func, 1000)

        Bootstrap covariance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat=='mean':
        ...         return x.mean(axis=0)
        ...     elif stat=='var':
        ...         return x.var(axis=0)
        >>> cov = bs.cov(func, 1000, extra_kwargs={'stat':'var'})

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return errors.T.dot(errors) / reps

    def var(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter variance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs: dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        var : 1-d array
            Bootstrap variance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where ``params`` is a 1-dimensional array, and `*args` and `**kwargs` are
        data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the estimate
        computed using the original data in bootstrap replications.  This
        parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap variance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> variances = bs.var(func, 1000)

        Bootstrap variance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat=='mean':
        ...         return x.mean(axis=0)
        ...     elif stat=='var':
        ...         return x.var(axis=0)
        >>> variances = bs.var(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return (errors**2).sum(0) / reps

    def update_indices(self):
        """
        Update indices for the next iteration of the bootstrap.  This must
        be overridden when creating new bootstraps.
        """
        return self.random_state.randint(self._num_items, size=self._num_items)

    def _resample(self):
        """
        Resample all data using the values in _index
        """
        indices = self._index
        pos_data = []
        for values in self._args:
            if isinstance(values, (pd.Series, pd.DataFrame)):
                pos_data.append(values.iloc[indices])
            else:
                pos_data.append(values[indices])
        named_data = {}
        for key, values in iteritems(self._kwargs):
            if isinstance(values, (pd.Series, pd.DataFrame)):
                named_data[key] = values.iloc[indices]
            else:
                named_data[key] = values[indices]
            setattr(self, key, named_data[key])

        self.pos_data = pos_data
        self.kw_data = named_data
        self.data = (pos_data, named_data)
        return self.data
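
Per the update_indices docstring, that method is the single extension point for new sampling schemes. A hypothetical subclass sketch (not part of the library):

class HalfSampleBootstrap(IIDBootstrap):
    # Hypothetical: resample half the observations, without replacement
    def update_indices(self):
        half = self._num_items // 2
        return self.random_state.permutation(self._num_items)[:half]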
Example #11
class Evolution:
    # pylint: disable=too-many-instance-attributes
    # noinspection PyUnresolvedReferences
    """
    Class that executes genetic search.
    :param num_populations: (int) number of populations (default 1)
    :param population_size: (int) size of the population
    :param genotype_size: (int) size of the genotype vector
    :param evaluation_function: (func) function to evaluate genotype performance.
        It should take as input the entire population genotype (matrix) and return
        an array with the performances
    :param fitness_normalization_mode: (str) method to normalize fitness values
        (fitness-proportionate, rank-based or sigma scaling)
    :param selection_mode: (str) method to select parents for reproduction (RWS or SUS)
    :param reproduce_from_elite: (bool) whether the reproduction comes from elite 
        or remaining agents in the population
    :param reproduction_mode: (str) method to reproduce genetic algorithm or hill climbing
    :param mutation_variance: (float) variance of gaussian mutation rate
    :param folder_path: (string) path of the folder where to save the checkpoints
    :param search_constraint: (list of bool) flag whether to clip a specific site in
        a genotype (default to all True)
    :param reevaluate: (bool) whether to re-evaluate the individual if it's retained
        in the new generation (used only in hill-climbing)
    :param max_generation: (int) maximum generations to evolve (not used if
        termination_function is provided)
    :param termination_function: (func) function to check if search should terminate
        (it accepts the Evolution instance, defaults to None)
    :param elitist_fraction: (float) proportion of new population that will be made of
        best unmodified parents (only relevant for genetic algorithm)
    :param mating_fraction: (float) proportion of population that will be made of children
        (in Beer this is equal to 1. - elitist_fraction) (only relevant for genetic algorithm)
    :param crossover_probability: (float) probability that crossover will occur
        (only relevant for genetic algorithm)
    :param crossover_mode: (str) the way to perform crossover (UNIFORM, 1-POINT, 2-POINT, ...)
        (only relevant for genetic algorithm)
    :param crossover_points: (list of int) a list that specifies the indices of where
        to cut during crossover (only relevant for genetic algorithm)
    :param checkpoint_interval: (int) how often (in generations) the population
        should be saved and results logged
    :param max_expected_offspring: (float) number of offspring to be allocated to the
        best individual, typically between 1 and 2
    """
    
    population_size: int
    genotype_size: int
    evaluation_function: Callable
    num_populations: int = 1
    performance_objective: Union[str,float] = 'MAX' # 'MIN', 'ABS_MAX', float value
    fitness_normalization_mode: str = 'FPS' # 'NONE', 'FPS', 'RANK', 'SIGMA'
    selection_mode: str = 'RWS' # 'UNIFORM', 'RWS', 'SUS'
    reproduce_from_elite: bool = False
    reproduction_mode: str = 'GENETIC_ALGORITHM' # 'HILL_CLIMBING', 'GENETIC_ALGORITHM'
    mutation_variance: float = DEFAULT_MUTATION_VARIANCE
    max_generation: int = 100
    termination_function: Callable = None
    checkpoint_interval: int = DEFAULT_CHECKPOINT_INTERVAL
    crossover_probability: float = DEFAULT_CROSSOVER_PROB
    crossover_points: List[int] = None
    folder_path: str = None
    elitist_fraction: float = None
    mating_fraction: float = None
    n_elite: int = None
    n_mating: int = None
    n_fillup: int = None
    crossover_mode: str = 'UNIFORM'
    search_constraint: np.ndarray = None  # this will be converted to all True by default in __post_init__
    reevaluate: bool = True # only used in hill-climbing
    max_expected_offspring: float = DEFAULT_MAX_EXPECTED_OFFSPRING

    random_seed: int = 0
    random_state: RandomState = None
    pop_eval_random_seed: int = None  # initialized at every generation

    # other field (no need to define them outside)
    generation: int = 0  # the current generation number
    population: np.ndarray = None  # the list of population genotypes (sorted by performance)
    population_unsorted: np.ndarray = None  # the list of population genotypes (before sorting)
    # (will be initialized in __post_init__)
    performances: np.ndarray = None  # performances of the genotypes
    fitnesses: np.ndarray = None  # fitnesses of the genotypes

    population_sorted_indexes: np.ndarray = None  
    # keep track of indexes in sorted population
    # population_sorted_indexes[0] is the index of the agent with best performance
    # in the unsorted population

    # collect average, best and worst performances across generations
    avg_performances: List[List[float]] = field(default_factory=list)
    best_performances: List[List[float]] = field(default_factory=list)
    worst_performances: List[List[float]] = field(default_factory=list)

    timeit: bool = False

    def __post_init__(self):

        assert self.num_populations > 0, "Number of populations should be greater than zero"

        assert self.population_size % 4 == 0, "Population size must be divisible by 4"
        # otherwise n_elite + n_mating may be greater than population_size    

        self.sqrt_mutation_variance = np.sqrt(self.mutation_variance)

        if self.random_state is None:
            self.random_state = RandomState(self.random_seed)

        self.loaded_from_file = all(
            x is not None for x in 
            [self.population, self.performances, self.fitnesses]
        )

        # create initial population if not provided
        if self.population is None:
            # create a set of random genotypes
            self.population = self.random_state.uniform(
                MIN_SEARCH_VALUE, MAX_SEARCH_VALUE,
                [self.num_populations, self.population_size, self.genotype_size]
            )

        if self.search_constraint is None:
            self.search_constraint = np.array([True] * self.genotype_size)

        self.file_num_zfill = int(np.ceil(np.log10(self.max_generation + 1))) \
            if self.max_generation \
            else 1 if self.max_generation == 0 \
            else FILE_NUM_ZFILL_DEFAULT

        # convert performance_objective to float if it is a string with a number
        f = utils.get_float(self.performance_objective)
        if f is not None:
            self.performance_objective = f

        self.timing = Timing(self.timeit)

        self.validate_params()
        self.init_reproduction_parameters()

    @staticmethod
    def get_random_genotype(rando_state, gen_size):
        return rando_state.uniform(MIN_SEARCH_VALUE, MAX_SEARCH_VALUE, gen_size)

    def init_reproduction_parameters(self):
        # self.n_mating: number of new agents return by select_mating_pool()
        if self.reproduction_mode == 'GENETIC_ALGORITHM':
            # self.n_elite: number of best agents to preserve (only used in genetic algorithm)
            # self.n_fillup: agents to be randomly generated
            self.n_elite = int(
                np.floor(self.population_size * self.elitist_fraction + 0.5) # at least one
            )  # children from elite group
            self.n_mating = int(np.floor(
                self.population_size * self.mating_fraction + 0.5 # at least one
            ))  # children from mating population
            self.n_fillup = self.population_size - (self.n_elite + self.n_mating)  # children from random fillup
            assert all(x >= 0 for x in [self.n_elite, self.n_mating, self.n_fillup])
            assert self.n_elite + self.n_mating + self.n_fillup == self.population_size
        else:  # 'HILL_CLIMBING'
            self.n_mating = self.population_size

    def validate_params(self):

        # termination condition
        assert self.max_generation is None or self.termination_function is None, \
            "max_generation and termination_function cannot both be set"

        # folder path
        if self.folder_path:
            assert os.path.isdir(self.folder_path), "folder_path '{}' is not a valid directory".format(self.folder_path)

        # search_constraint
        assert len(self.search_constraint) == self.genotype_size, \
            "The length of search_constraint should be equal to genotype_size"

        # performance_objective         
        accepted_values = ['MAX', 'MIN', 'ABS_MAX']
        assert type(self.performance_objective) in [float,int] or \
            self.performance_objective in accepted_values, \
            'performance_objective should be either {}'.format(', '.join(accepted_values))

        # fitness_normalization_mode         
        accepted_values = ['NONE', 'FPS', 'RANK', 'SIGMA']
        assert self.fitness_normalization_mode in accepted_values, \
            'fitness_normalization_mode should be either {}'.format(', '.join(accepted_values))
        assert self.fitness_normalization_mode!='NONE' or self.selection_mode == 'UNIFORM', \
            "if fitness_normalization_mode is 'NONE' (copy of PERFORMANCE), selection_mode must be UNIFORM (not normalized)" 

        # selection_mode
        accepted_values = ['UNIFORM', 'RWS', 'SUS']
        assert self.selection_mode in accepted_values, \
            'selection_mode should be either {}'.format(', '.join(accepted_values))

        # reproduce_from_elite
        assert not self.reproduce_from_elite or self.selection_mode == 'UNIFORM', \
            'if reproducing from elite, selection mode must be uniform'

        # reproduction_mode
        accepted_values = ['HILL_CLIMBING', 'GENETIC_ALGORITHM']
        assert self.reproduction_mode in accepted_values, \
            'reproduction_mode should be either {}'.format(', '.join(accepted_values))

        # GENETIC_ALGORITHM
        if self.reproduction_mode == 'GENETIC_ALGORITHM':
            assert 0 <= self.elitist_fraction <= 1, \
                'In GENETIC_ALGORITHM: 0 <= elitist_fraction <=1'
            assert 0 <= self.mating_fraction <= 1, \
                'In GENETIC_ALGORITHM: 0 <= mating_fraction <=1'
            assert 0 <= self.crossover_probability <= 1, \
                'In GENETIC_ALGORITHM: 0 <= crossover_probability <=1'
            assert re.match(r'UNIFORM|\d+-POINT', self.crossover_mode), \
                'In GENETIC_ALGORITHM: crossover_mode should be UNIFORM or x-POINT'

        # crossover
        assert self.crossover_mode is not None, "crossover_mode cannot be None"
        if self.crossover_mode == 'UNIFORM':
            # crossover is computed on the entire genotype
            # with prob 0.5 of flipping each genotype site
            assert self.crossover_points is None, \
                "In uniform crossover_mode you shouldn't specify the crossover_points"
        elif self.crossover_mode.endswith('-POINT'):
            # A. if crossover_points is None the points are randomly generated
            # crossover_points must be a list of max x-1 integers in the interval [1,G-1]
            # where x is the integer > 0 specified in the parameter crossover_mode ('x-POINT')
            # and G is the size of the genotype
            # e.g. if parent1=[0,0,0] and parent2=[1,1,1] (G=3),
            # crossover_points must contain a single integer which can be
            # 1: child1=[0,1,1] child2=[1,0,0]
            # 2: child1=[0,0,1] child2=[1,1,0]
            # B. if crossover_points is not None -> num_points <= len(self.crossover_points)
            # if num_points < len(self.crossover_points)
            # only num_points will be randomly selected from the self.crossover_points
            num_points = self.crossover_mode[:-6]
            assert utils.is_int(num_points), \
                "Param crossover_mode should be 'UNIFORM' or 'x-POINT' (with x being an integer > 0)"
            num_points = int(num_points)
            assert 0 < num_points < self.genotype_size, \
                "Param crossover_mode should be 'x-POINT', with x being an integer such that 0 < x < G " \
                "and where G is the size of the genotype"
            assert num_points <= self.genotype_size - 1, \
                "Too high value for {} in param crossover_mode. Max should be G-1 " \
                "(where G is the size of the genotype)".format(
                    self.crossover_mode)
            if self.crossover_points is not None:
                assert len(set(self.crossover_points)) == len(self.crossover_points), \
                    "Duplicated values in crossover_points"
                self.crossover_points = sorted(set(self.crossover_points))
                assert num_points <= len(self.crossover_points), \
                    "crossover_mode={} and crossover_points={} but {} must be <= {}=len(crossover_points)".format(
                        self.crossover_mode, self.crossover_points, num_points, len(self.crossover_points))
                assert all(0 < x < self.genotype_size for x in self.crossover_points), \
                    "Some of the values in crossover_points are not in the interval [1,G-1] " \
                    "where G is the size of the genotype"
        else:
            assert False, \
                "Param crossover_mode should be 'UNIFORM' or 'x-POINT' (with x being an integer > 0)"

    def set_folder_name(self, text):
        self.folder_path = text


    def run(self):
        """
        Execute a full search run until some condition is reached.
        :return: the last population in the search
        """

        if self.loaded_from_file:
            # complete the cycle from the previous run (after saving)
            self.save_to_file()
            self.reproduce()
            self.generation += 1

        t = self.timing.init_tictoc()

        while self.max_generation is None or self.generation <= self.max_generation:
            # evaluate all genotypes on the task
            self.pop_eval_random_seed = utils.random_int(self.random_state)            

            # shuffle populations before running the evaluation function
            for pop in self.population:
                self.random_state.shuffle(pop)

            # run evaluation function
            self.performances = self.evaluation_function(
                self.population, self.pop_eval_random_seed
            )

            if type(self.performances) is list:
                self.performances = np.array(self.performances)
            
            if self.num_populations == 1 and self.performances.ndim != 2:
                # eval function returned a simple array of performances
                # because there is only one population
                self.performances = np.expand_dims(self.performances, 0)  # add an additional index (population)

            expected_perf_shape = self.population.shape[:-1]
            assert self.performances.shape == expected_perf_shape, \
                "Evaluation function didn't return performances with shape {}".format(expected_perf_shape)
            
            assert (self.performances >=0).all(), \
                "Performance must be non-negative"

            self.timing.add_time('EVO1-RUN_eval_function', t)

            # sorting population and performances on performances
            self.sort_population_on_performance()
            self.timing.add_time('EVO1-RUN_sort_population', t)

            # update average/best/worst population performance
            avg = np.mean(self.performances, axis=1).tolist()
            best = self.performances[:,0].tolist()
            worst = self.performances[:,-1].tolist()
            variance = np.var(self.performances, axis=1).tolist()
            self.avg_performances.append(avg)
            self.best_performances.append(best)
            self.worst_performances.append(worst)
            self.timing.add_time('EVO1-RUN_stats', t)

            print_stats = lambda a : '|'.join(['{:.5f}'.format(x) for x in a])

            # print short statistics
            print("Generation {}: Best: {}, Worst: {}, Average: {}, Variance: {}".format(
                str(self.generation).rjust(self.file_num_zfill), print_stats(best), 
                print_stats(worst), print_stats(avg), print_stats(variance)))
            self.timing.add_time('EVO1-RUN_print_stats', t)

            # check whether to terminate
            if self.generation == self.max_generation or \
                    (self.termination_function and self.termination_function(self)):
                self.save_to_file()
                # Stop search due to termination condition
                break

            # save the intermediate evolution state
            if self.checkpoint_interval and self.generation % self.checkpoint_interval == 0:
                # save current generation
                self.save_to_file()
            self.timing.add_time('EVO1-RUN_savefile', t)

            # Compute fitnesses (based on performances) - used in reproduce
            self.update_fitnesses()
            self.timing.add_time('EVO1-RUN_update_fitness', t)

            # run reproduce (update fitnesses and run genetic or hill-climbing)
            self.reproduce()             
            self.timing.add_time('EVO1-RUN_reproduce', t)

            # update generation
            self.generation += 1

    def sort_population_on_performance(self):     
        # performances must be non-negative (>=0)           
        if type(self.performance_objective) is str:
            if self.performance_objective == 'MAX':            
                performances_objectified = self.performances
            elif self.performance_objective == 'MIN':
                performances_objectified = - self.performances
            else:
                assert self.performance_objective == 'ABS_MAX'
                performances_objectified = np.abs(self.performances)
        else:
            # minimizing the distance between performance and perf objective
            # when self.performance_objective==0 this would be identical to 'ABS_MIN'
            performances_objectified = - np.abs(self.performances - self.performance_objective)

        # sort genotypes and performances by performances_objectified from high to low
        self.population_sorted_indexes = np.argsort(-performances_objectified, axis=-1)            
        self.performances = np.take_along_axis(self.performances, self.population_sorted_indexes, axis=-1)        
        self.population_unsorted = self.population # keep track of the original population to ensure reproducibility
        sorted_indexes_exp = np.expand_dims(self.population_sorted_indexes, -1) # add one dimension at the end to sort population
        self.population = np.take_along_axis(self.population_unsorted, sorted_indexes_exp, axis=1)

        # OLD METHOD WITHOUT NUMPY:
        # sort genotypes and performances by performance from best to worst
        # self.population, self.performances = \
        #     zip(*sorted(zip(self.population, self.performances), 
        #     key=lambda pair: pair[1], reverse=True))
        # self.population = np.array(self.population)
        # self.performances = np.array(self.performances)


    def reproduce(self):
        """Run reproduce via HILL_CLIMBING or GENETIC_ALGORITHM"""
        if self.reproduction_mode == 'GENETIC_ALGORITHM':
            self.reproduce_genetic_algorithm()
        else:
            self.reproduce_hill_climbing()

    def reproduce_genetic_algorithm(self):
        """
        Reproduce a single generation in the following way:
        1) Copy the proportion equal to elitist_fraction of the current population to the new population
           (these are best_genotypes)
        2) Select part of the population for crossover using some selection method (set in config)
        3) Shuffle the selected population in preparation for cross-over
        4) Create crossover_fraction children of selected population with probability of crossover equal
           to prob_crossover.
        Crossover takes place at genome module boundaries (single neurons).
        5) Apply mutation to the children with mutation equal to mutation_var
        6) Fill the rest of the population with randomly created genotypes

        self.population and self.performances are sorted based on performances
        """

        t = self.timing.init_tictoc()

        new_population = np.zeros(                
            [self.num_populations, self.population_size, self.genotype_size]
        )

        # 1) Elitist selection        
        # same elite size in all populations
        self.elite_population = self.population[:, :self.n_elite] 
        new_population[:, :self.n_elite] = self.elite_population
        self.timing.add_time('EVO2-GA_1_elitist_selection', t)

        # 2) Select mating population from the remaining population        
        mating_pool = self.select_mating_pool()
        self.timing.add_time('EVO2-GA_2_mating_pool', t)

        # 3) Shuffle mating pool
        for pop_mating_pool in mating_pool:            
            self.random_state.shuffle(pop_mating_pool)
        self.timing.add_time('EVO2-GA_3_shuffle', t)

        # 4) Create children with crossover or apply mutation
        mating_finish = self.n_elite + self.n_mating
        newpop_counter = None  # tracks progress through the new population (reset per population below)
        
        for p in range(self.num_populations):            
            
            mating_counter = 0
            newpop_counter = self.n_elite # track where we are in the new population
            
            while newpop_counter < mating_finish:
                not_last = mating_finish - newpop_counter > 1
                parent1 = mating_pool[p][mating_counter]

                if not_last and self.random_state.random() < self.crossover_probability:
                    parent2 = mating_pool[p][mating_counter + 1]
                    child1, child2 = self.crossover(parent1, parent2)
                    # if the child is the same as the first parent after crossover, mutate it (as in Beer)
                    if np.array_equal(child1, parent1):
                        child1 = self.mutate(parent1)
                    new_population[p][newpop_counter] = child1
                    new_population[p][newpop_counter + 1] = child2
                    newpop_counter += 2
                    mating_counter += 2
                else:
                    # if no crossover, mutate just one genotype
                    child1 = self.mutate(parent1)
                    new_population[p][newpop_counter] = child1
                    newpop_counter += 1
                    mating_counter += 1
            
        self.timing.add_time('EVO2-GA_4_children', t)

        # 5) Fill up with random new genotypes
        new_population[:, newpop_counter:] = self.random_state.uniform(
            MIN_SEARCH_VALUE, MAX_SEARCH_VALUE,
            size=[self.num_populations, self.n_fillup, self.genotype_size]
        )
        self.timing.add_time('EVO2-GA_5_fillup', t)

        # 6) redefine the population as the newly computed one
        self.population = new_population
        self.timing.add_time('EVO2-GA_6_convert_pop', t)

    def reproduce_hill_climbing(self):

        t = self.timing.init_tictoc()

        # 1) Select the parents using sampling (replacing the entire population, no elite here)
        parent_population = self.select_mating_pool()
        self.timing.add_time('EVO2-HC_1_mating pool', t)

        # 2) Reevaluate
        if self.reevaluate:
            parent_performance = np.array(self.evaluation_function(parent_population, self.pop_eval_random_seed))
        else:
            assert False, \
                "reevaluate param has to be True. " \
                "For reevaluate to be False we would also need to return performances from select_mating_pool"
        self.timing.add_time('EVO2-HC_2_reevaluate', t)

        # 3) Produce the new population by mutating each parent, overwriting the current population
        self.population = np.array([self.mutate(gen) for gen in parent_population])
        self.timing.add_time('EVO2-HC_3_mutate', t)

        # 4) Calculate new performances
        self.performances = np.array(self.evaluation_function(self.population, self.pop_eval_random_seed))
        self.timing.add_time('EVO2-HC_4_compute_perf', t)

        # 5) Check if performance worsened and, in that case, retrieve the agent from the parent population
        lower_performance = self.performances < parent_performance  # bool array
        for i in range(self.population_size):
            if lower_performance[i]:
                self.population[i] = parent_population[i]
                self.performances[i] = parent_performance[i]
        self.timing.add_time('EVO2-HC_5_compare_and_select', t)

    def update_fitnesses(self):
        """
        Update genotype fitness to relative values, retain sorting from best to worst.
        """        
        if self.fitness_normalization_mode == 'NONE':
            # do not use fitness in selection
            self.fitnesses = None

        elif self.fitness_normalization_mode == 'FPS':  # (fitness-proportionate)
            self.fitnesses = np.zeros(self.performances.shape) # same shape as performances
            for p in range(self.num_populations):
                avg_perf = self.avg_performances[-1][p]
                m = utils.linear_scaling(
                    self.worst_performances[-1][p],
                    self.best_performances[-1][p],
                    avg_perf,
                    self.max_expected_offspring
                )
                scaled_performances = m * (self.performances[p] - avg_perf) + avg_perf
                total_performance = np.sum(scaled_performances)
                self.fitnesses[p] = scaled_performances / total_performance

        elif self.fitness_normalization_mode == 'RANK':  # (rank-based)
            # Baker's linear ranking method: f(pos) = 2-SP+2*(SP-1)*(pos-1)/(n-1)
            # the highest ranked individual receives max_exp_offspring (typically 1.1),
            # the lowest receives 2 - max_exp_offspring
            # normalized to sum to 1
            self.fitnesses = np.zeros(self.performances.shape) # same shape as performances
            for p in range(self.num_populations):
                self.fitnesses[p] = np.array(
                    [
                        (
                            self.max_expected_offspring + (2 - 2 * self.max_expected_offspring) * i /
                            (self.population_size - 1)
                        ) / self.population_size 
                        for i in range(self.population_size)
                    ]
                )

        elif self.fitness_normalization_mode == 'SIGMA':  # (sigma-scaling)
            # for every individual, 1 + (f - avg) / (2 * std) is calculated;
            # if the value is non-positive, a small positive constant is assigned so the
            # individual still has some probability of being chosen. The values are then normalized.
            self.fitnesses = np.zeros(self.performances.shape) # same shape as performances
            for p in range(self.num_populations):
                pop_perf = self.performances[p]
                avg = np.mean(pop_perf)
                std = max(0.0001, np.std(pop_perf))
                exp_values = list((1 + ((f - avg) / (2 * std))) for f in pop_perf)
                
                for i, v in enumerate(exp_values):
                    if v <= 0:
                        exp_values[i] = 1 / self.population_size
                s = sum(exp_values)
                self.fitnesses[p] = np.array(list(e / s for e in exp_values))
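        # Illustrative worked example (hypothetical values): with
        # pop_perf = [2., 4., 6.], avg = 4 and std ~ 1.633, so
        # exp_values ~ [0.39, 1.0, 1.61]; after normalization the
        # fitnesses are ~ [0.13, 0.33, 0.54] and sum to 1.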

    def select_mating_pool(self):
        """
        Select a mating pool population.
        :return: selected parents for reproduction
        """

        if self.selection_mode == 'UNIFORM':
            # create mating_pool from source_population uniformly
            # (from beginning to end, restarting from the beginning if needed)

            source_population = \
                self.elite_population if self.reproduce_from_elite \
                else self.population

            num_source_pop = source_population.shape[1] # number of elements in source pop

            assert num_source_pop>0, \
                "Error, can't create a mating pool from empty source population"
            
            cycle_source_pop_indexes = np.resize(        # the inner resize builds a column vector
                np.resize(                               # [0, 1, ..., n-1, 0, 1, ...] of length n_mating,
                    np.arange(num_source_pop),           # where n is num_source_pop; the outer resize
                    [self.n_mating, 1]                   # duplicates those indexes for all populations
                ),                                       # to match the 3 dimensions of source_population
                [self.num_populations, self.n_mating, 1]
            )

            # rotate through the source_population(s)
            mating_pool = np.take_along_axis(source_population, cycle_source_pop_indexes, 1)
        else:
            min_fitness = np.min(self.fitnesses, axis=-1)
            assert (min_fitness > - ROUNDING_TOLERANCE).all(), \
                "Found neg fitness: {}".format(min_fitness)
            if (self.fitnesses < 0).any():
                # setting small neg values due to rounding errors to zeros
                self.fitnesses[self.fitnesses<0] = 0
            cum_probs = np.cumsum(self.fitnesses, axis=-1)
            cum_probs_error = np.abs(cum_probs[:,-1] - 1.0)
            assert (cum_probs_error >=0).all() and (cum_probs_error < CUM_PROB_TOLERANCE).all(), \
                "Too big cum_probs_error: {}".format(cum_probs_error)
            mating_pool = np.zeros([self.num_populations, self.n_mating, self.genotype_size])
            if self.selection_mode == "RWS":
                # roulette wheel selection
                for pop in range(self.num_populations):                    
                    mating_pool_indexes = self.random_state.choice(
                        self.population_size, 
                        size=(self.n_mating,1), 
                        replace=True, 
                        p=self.fitnesses[pop]
                    )
                    mating_pool[pop] = np.take_along_axis(
                        self.population[pop],
                        mating_pool_indexes,
                        axis=0
                    )
            elif self.selection_mode == "SUS":
                # TODO: find a way to implement this via numpy
                # stochastic universal sampling selection                
                p_dist = 1 / self.n_mating  # distance between the pointers
                for pop in range(self.num_populations):                    
                    start = self.random_state.uniform(0, p_dist)
                    pointers = [start + i * p_dist for i in range(self.n_mating)]
                    cp = cum_probs[pop] # cumulative prob of current population
                    m_idx = 0 # index in the mating pool to be filled
                    for poi in pointers:
                        for (i, genotype) in enumerate(self.population[pop]):
                            if poi <= cp[i]:
                                mating_pool[pop][m_idx] = genotype
                                m_idx += 1
                                break
            else:
                assert False

        assert len(mating_pool[0]) == self.n_mating
        return mating_pool

    def crossover(self, parent1, parent2):
        """
        Given two genotypes, create two new genotypes by exchanging their genetic material.
        :param parent1: first parent genotype
        :param parent2: second parent genotype
        :return: two new genotypes
        # TODO: implement class testing functions
        """

        genotype_size = len(parent1)
        if self.crossover_mode == 'UNIFORM':
            if self.crossover_points is None:
                # by default do crossover on the entire genotype
                flips = self.random_state.choice(a=[0, 1], size=genotype_size)
            else:
                # TODO: this will never occur because we check crossover_points above but
                # consider implementing in the future a case of uniform crossover in certain
                # portions of the genotype
                assert False
            inv_flips = 1 - flips
            child1 = flips * parent1 + inv_flips * parent2
            child2 = flips * parent2 + inv_flips * parent1
        else:
            # x-POINT
            num_points = int(self.crossover_mode[:-6])
            if self.crossover_points is None:
                possible_points = list(range(1, genotype_size))  # [1,...,G-1]
                chosen_crossover_points = sorted(self.random_state.choice(possible_points, num_points, replace=False))
            elif num_points < len(self.crossover_points):
                chosen_crossover_points = sorted(
                    self.random_state.choice(self.crossover_points, num_points, replace=False))
            else:
                chosen_crossover_points = sorted(self.crossover_points)
                assert num_points == len(chosen_crossover_points)
            gt = [parent1, parent2]
            boundaries = [0] + chosen_crossover_points + [genotype_size]
            segment_ranges = [(boundaries[i], boundaries[i + 1]) for i in range(len(boundaries) - 1)]
            segments1 = [gt[i % 2][s[0]:s[1]] for i, s in enumerate(segment_ranges)]
            segments2 = [gt[1 - i % 2][s[0]:s[1]] for i, s in enumerate(segment_ranges)]
            child1 = np.hstack(segments1)
            child2 = np.hstack(segments2)
        return child1, child2
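        # Illustrative worked example (hypothetical values): for '1-POINT'
        # crossover with genotype_size = 4 and a chosen point at 2,
        # boundaries = [0, 2, 4], so child1 = parent1[0:2] + parent2[2:4]
        # and child2 is the complement.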

    def mutate(self, genotype):
        magnitude = self.random_state.normal(0, self.sqrt_mutation_variance)
        unit_vector = utils.make_rand_vector(len(genotype), self.random_state)
        mutant = np.where(
            self.search_constraint,
            np.clip(
                genotype + magnitude * unit_vector,
                MIN_SEARCH_VALUE,
                MAX_SEARCH_VALUE
            ),
            genotype + magnitude * unit_vector
        )
        return mutant
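        # Note: mutation adds a random direction (unit vector) scaled by a single
        # Gaussian magnitude; clipping to [MIN_SEARCH_VALUE, MAX_SEARCH_VALUE]
        # is applied only where search_constraint is True.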

    def save_to_file(self):
        if self.folder_path is None:
            return
        # population is saved after sorting on performance
        file_path = os.path.join(
            self.folder_path,
            'evo_{}.json'.format(str(self.generation).zfill(self.file_num_zfill))
        )

        # print("Saving rand state: {}".format(state_of_rand_state))

        obj_dict = asdict(self)
        del obj_dict['evaluation_function']
        del obj_dict['termination_function']
        obj_dict['random_state'] = json_numpy.dumps(self.random_state.get_state())

        with open(file_path, 'w') as f_out:
            json.dump(obj_dict, f_out, cls=json_numpy.NumpyListJsonEncoder, indent=3)

    @staticmethod
    def load_from_file(file_path, evaluation_function: Callable = None,
                       termination_function: Callable = None,
                       **kwargs):

        with open(file_path) as f_in:
            obj_dict = json.load(f_in)

        for k in ['population', 'population_unsorted', 'performances', 'fitnesses']:
            # assert type(obj_dict[k]) == np.ndarray
            obj_dict[k] = np.array(obj_dict[k])

        random_state = RandomState(None)
        random_state_state = json_numpy.loads(obj_dict['random_state'])
        # print("Loading rand state: {}".format(random_state_state))
        random_state.set_state(random_state_state)
        obj_dict['random_state'] = random_state

        obj_dict['evaluation_function'] = evaluation_function
        obj_dict['termination_function'] = termination_function

        if kwargs:
            obj_dict.update(kwargs)

        evo = Evolution(**obj_dict)

        return evo
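
A minimal standalone sketch (not from the project above) of the roulette-wheel idea used in select_mating_pool: non-negative fitnesses are normalized to sum to 1 and passed as the probability vector of RandomState.choice, so a fixed seed makes the drawn mating pool reproducible.

import numpy as np
from numpy.random import RandomState

rs = RandomState(42)
population = rs.uniform(-1.0, 1.0, size=(6, 3))  # 6 genotypes, 3 genes each
performances = np.array([4.0, 1.0, 3.0, 0.5, 1.5, 2.0])
fitnesses = performances / performances.sum()    # non-negative, sums to 1
idx = rs.choice(len(population), size=4, replace=True, p=fitnesses)
mating_pool = population[idx]                    # shape (4, 3)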
Example #12
0
File: rbm.py Project: nludwig/rbm
class RestrictedBoltzmannMachine:

    #
    #initialization methods
    #

    def __init__(self,
                 visibleLayer,
                 hiddenLayer,
                 temperature=1.,
                 sigma=0.01,
                 visibleProportionOn=None,
                 parameterFile=None,
                 rng=None,
                 rngState=None,
                 rngSeed=1337):
        self.visibleLayer = visibleLayer
        self.hiddenLayer = hiddenLayer
        self.temperature = temperature
        self.beta = 1. / self.temperature

        if rng is None:
            self.rng = RandomState(seed=rngSeed)
            if rngState is not None:
                self.rng.set_state(rngState)
        else:
            self.rng = rng

        if parameterFile is None:
            self.initializeVisibleBias(visibleProportionOn=visibleProportionOn)
            self.initializeHiddenBias()
            self.initializeWeights(sigma)
        else:
            self.loadParameterFile(parameterFile)
        self.visibleStep = np.zeros_like(self.visibleBias)
        self.hiddenStep = np.zeros_like(self.hiddenBias)
        self.weightStep = np.zeros_like(self.weights)

    def initializeVisibleBias(self, visibleProportionOn=None):
        if visibleProportionOn is None:
            self.visibleBias = np.zeros(self.visibleLayer.shape[-1])
        else:
            #find minimum non-zero value
            nonZeroMin = visibleProportionOn[visibleProportionOn > 0.].min()
            visibleProportionOn[np.isclose(
                visibleProportionOn, 0.)] = nonZeroMin + (0. - nonZeroMin) / 2.
            nonOneMax = visibleProportionOn[visibleProportionOn < 1.].max()
            print(f'nonZeroMin, nonOneMax: {nonZeroMin}, {nonOneMax}')
            visibleProportionOn[np.isclose(
                visibleProportionOn, 1.)] = nonOneMax + (1. - nonOneMax) / 2.
            self.visibleBias = np.log(visibleProportionOn /
                                      (1. - visibleProportionOn))
            #self.visibleBias = 1. / visibleProportionOn

    def initializeHiddenBias(self):
        self.hiddenBias = np.zeros(self.hiddenLayer.shape[-1])

    def initializeWeights(self, sigma=0.01):
        self.weights = self.rng.normal(scale=sigma,
                                       size=(self.visibleLayer.shape[-1],
                                             self.hiddenLayer.shape[-1]))

    def loadParameterFile(self, parameterFile):
        lv = self.visibleLayer.shape[-1]
        lh = self.hiddenLayer.shape[-1]
        visibleSlice = slice(0, lv)
        hiddenSlice = slice(lv, lv + lh)
        weightsSlice = slice(lv + lh, lv + lh + lv * lh)
        fileContents = [float(line.strip()) for line in parameterFile]
        self.visibleBias = np.array(fileContents[visibleSlice])
        self.hiddenBias = np.array(fileContents[hiddenSlice])
        self.weights = np.array(fileContents[weightsSlice]).reshape((lv, lh))

    def dumpParameterFile(self, parameterFile):
        #assert type(parameterFile) == file
        for theta in self.visibleBias:
            print(f'{theta}', file=parameterFile)
        for theta in self.hiddenBias:
            print(f'{theta}', file=parameterFile)
        for theta in self.weights.flatten():
            print(f'{theta}', file=parameterFile)

    #
    #prediction methods
    #

    def hiddenConditionalProbabilities(self):
        conditionalEnergies = self.hiddenBias + self.visibleLayer @ self.weights
        return logistic(self.beta * conditionalEnergies)

    def visibleConditionalProbabilities(self):
        conditionalEnergies = self.visibleBias + self.hiddenLayer @ self.weights.T
        return logistic(self.beta * conditionalEnergies)
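    # Note: logistic(beta * E) with E = bias + input @ weights implements the
    # standard RBM conditionals, e.g. p(h_j = 1 | v) = sigmoid(beta * (b_j +
    # sum_i v_i W_ij)); at temperature 1 (beta = 1) this is the usual sigmoid.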

    def rollBernoulliProbabilities(self, probabilities):
        rolls = self.rng.uniform(size=probabilities.shape)
        return (rolls < probabilities).astype(np.float_)

    def gibbsSample(self, hiddenUnitsStochastic=False):
        #compute hidden activation probabilities given visible
        hiddenLayerProbabilities = self.hiddenConditionalProbabilities()
        if hiddenUnitsStochastic:
            self.hiddenLayer = self.rollBernoulliProbabilities(
                hiddenLayerProbabilities)
        else:
            self.hiddenLayer = hiddenLayerProbabilities
        #compute visible activation probabilities given hidden
        self.visibleLayer = self.visibleConditionalProbabilities()
        return self.visibleLayer, hiddenLayerProbabilities

    #
    #training methods
    #

    def computePCDGradient(self,
                           miniBatch,
                           miniFantasyBatch,
                           nCDSteps=1,
                           l1Coefficient=None,
                           l2Coefficient=None):
        visibleDataMean, hiddenDataMean, weightDataMean = self.computePCDGradientPositiveHalf(
            miniBatch)
        visibleModelMean, hiddenModelMean, weightModelMean, newFantasy = \
            self.computePCDGradientNegativeHalf(miniFantasyBatch, nCDSteps=nCDSteps)

        #compute gradients & return
        visibleGradient = visibleDataMean - visibleModelMean
        hiddenGradient = hiddenDataMean - hiddenModelMean
        weightGradient = weightDataMean - weightModelMean
        if l1Coefficient is not None:
            weightGradient -= l1Coefficient * np.sign(self.weights)
        if l2Coefficient is not None:
            weightGradient -= l2Coefficient * self.weights
        return visibleGradient, hiddenGradient, weightGradient, newFantasy

    def computePCDGradientPositiveHalf(self, miniBatch):
        self.visibleLayer = miniBatch
        hiddenLayerProbabilities = self.hiddenConditionalProbabilities()
        return self.computeParameterMeans(miniBatch, hiddenLayerProbabilities)

    def computePCDGradientNegativeHalf(self, miniFantasyBatch, nCDSteps=1):
        self.visibleLayer = miniFantasyBatch
        for _ in range(nCDSteps):
            visibleOut, hiddenOut = self.gibbsSample()
        visibleModelMean, hiddenModelMean, weightModelMean = \
                self.computeParameterMeans(visibleOut, hiddenOut)
        #store for possible use by adversary
        self.visibleModel = visibleOut
        self.hiddenModel = hiddenOut
        self.visibleModelMean = visibleModelMean
        self.hiddenModelMean = hiddenModelMean
        self.weightModelMean = weightModelMean
        return visibleModelMean, hiddenModelMean, weightModelMean, visibleOut

    def computeParameterMeans(self, visible, hidden):
        visibleMean = visible.mean(axis=0)
        hiddenMean = hidden.mean(axis=0)
        weightMean = (visible[..., :, None] *
                      hidden[..., None, :]).mean(axis=0)
        #weightMean = visibleMean[..., :, None] * hiddenMean[..., None, :] * visible.shape[0]
        return visibleMean, hiddenMean, weightMean

    def updateParameters(self):
        self.visibleBias += self.visibleStep
        self.hiddenBias += self.hiddenStep
        self.weights += self.weightStep

    def updateParametersSGD(self,
                            miniBatch,
                            miniFantasyBatch,
                            learningRate,
                            nCDSteps=1,
                            l1Coefficient=None,
                            l2Coefficient=None,
                            verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch, nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient, l2Coefficient=l2Coefficient)
        #hack to stop changing the *Step pointer; req'd for
        # current implementation of histograms of *Steps
        self.visibleStep += learningRate * visibleGradient - self.visibleStep
        self.hiddenStep += learningRate * hiddenGradient - self.hiddenStep
        self.weightStep += learningRate * weightGradient - self.weightStep
        self.updateParameters()
        if verbose is True:
            print('{:.3f}\t{:.3f}\t{:.3f}'.format(self.visibleStep.mean(),
                                                  self.hiddenStep.mean(),
                                                  self.weightStep.mean()))
        return newFantasy

    def updateParametersAdam(self,
                             miniBatch,
                             miniFantasyBatch,
                             adams,
                             nCDSteps=1,
                             l1Coefficient=None,
                             l2Coefficient=None,
                             verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch, nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient, l2Coefficient=l2Coefficient)
        #hack to stop changing the *Step pointer; req'd for
        # current implementation of histograms of *Steps
        self.visibleStep += adams['visible'].computeAdamStep(
            visibleGradient) - self.visibleStep
        self.hiddenStep += adams['hidden'].computeAdamStep(
            hiddenGradient) - self.hiddenStep
        self.weightStep += adams['weights'].computeAdamStep(
            weightGradient) - self.weightStep
        self.updateParameters()
        if verbose is True:
            print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format(
                visibleGradient.mean(), hiddenGradient.mean(),
                weightGradient.mean(), self.visibleStep.mean(),
                self.hiddenStep.mean(), self.weightStep.mean()))
        return newFantasy

    def updateParametersAdamAdversarial(self,
                                        miniBatch,
                                        miniFantasyBatch,
                                        adams,
                                        gamma,
                                        adversary,
                                        nCDSteps=1,
                                        l1Coefficient=None,
                                        l2Coefficient=None,
                                        verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch, nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient, l2Coefficient=l2Coefficient)
        visibleGradientAd, hiddenGradientAd, weightGradientAd = self.computeAdversaryGradient(
            adversary)
        #hack to stop changing the *Step pointer; req'd for
        # current implementation of histograms of *Steps
        self.visibleStep += adams['visible'].computeAdamStep(
            visibleGradient + visibleGradientAd) - self.visibleStep
        self.hiddenStep += adams['hidden'].computeAdamStep(
            hiddenGradient + hiddenGradientAd) - self.hiddenStep
        self.weightStep += adams['weights'].computeAdamStep(
            weightGradient + weightGradientAd) - self.weightStep
        self.updateParameters()
        if verbose is True:
            print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format(
                visibleGradient.mean(), hiddenGradient.mean(),
                weightGradient.mean(), self.visibleStep.mean(),
                self.hiddenStep.mean(), self.weightStep.mean()))
        return newFantasy

    def computeAdversaryGradient(self, adversary):
        # predict on the fantasy sample stored by computePCDGradientNegativeHalf
        adversaryPredictions = adversary.predict(self.visibleModel)
        adversaryPredictionVariation = adversaryPredictions - adversaryPredictions.mean()
        visibleModelVariation = self.visibleModel - self.visibleModelMean
        hiddenModelVariation = self.hiddenModel - self.hiddenModelMean
        weightModelVariation = self.visibleModel[
            ..., :, None] * self.hiddenModel[...,
                                             None, :] - self.weightModelMean
        visibleGradient = (adversaryPredictionVariation[:, None] *
                           visibleModelVariation).mean(axis=0)
        hiddenGradient = (adversaryPredictionVariation[:, None] *
                          hiddenModelVariation).mean(axis=0)
        weightGradient = (adversaryPredictionVariation[:, None, None] *
                          weightModelVariation).mean(axis=0)
        return visibleGradient, hiddenGradient, weightGradient

    #
    #analysis methods
    #

    def computeReconstructionError(self, miniBatch, nCDSteps=1):
        self.visibleLayer = miniBatch
        for _ in range(nCDSteps):
            visibleOut, hiddenOut = self.gibbsSample()
        #visibleOut = self.rollBernoulliProbabilities(visibleOut)
        sampleError = miniBatch - visibleOut
        meanSquaredError = (sampleError * sampleError).mean()
        return meanSquaredError

    def computeFreeEnergy(self, miniBatch=None):
        if miniBatch is not None:
            self.visibleLayer = miniBatch
        internalFE = -self.visibleLayer @ self.visibleBias
        externalConditionalE = self.hiddenBias + self.visibleLayer @ self.weights
        externalFE = -np.log(1. + np.exp(externalConditionalE)).sum(axis=1)
        return internalFE + externalFE

    def computeMeanFreeEnergy(self, miniBatch=None):
        return self.computeFreeEnergy(miniBatch).mean()

    #
    #miscellaneous methods
    #

    def copy(self):
        copyRBM = RestrictedBoltzmannMachine(np.copy(self.visibleLayer),
                                             np.copy(self.hiddenLayer),
                                             temperature=self.temperature,
                                             rngState=self.rng.get_state())
        copyRBM.visibleBias = np.copy(self.visibleBias)
        copyRBM.hiddenBias = np.copy(self.hiddenBias)
        copyRBM.weights = np.copy(self.weights)
        copyRBM.visibleStep = np.copy(self.visibleStep)
        copyRBM.hiddenStep = np.copy(self.hiddenStep)
        copyRBM.weightStep = np.copy(self.weightStep)
        return copyRBM

    def storeHiddenActivationsOnMiniBatch(self, miniBatch, hiddenUnits=None):
        self.visibleLayer = miniBatch
        self.hiddenConditionalProbabilities()
        return np.copy(self.hiddenLayer) if hiddenUnits is None \
          else np.copy(self.hiddenLayer[..., hiddenUnits])

    def setRngSeed(self, rngSeed):
        self.rng.seed(rngSeed)

    @property
    def shape(self):
        # (number of visible units, number of hidden units)
        return self.visibleLayer.shape[-1], self.hiddenLayer.shape[-1]
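
A hedged standalone sketch of the state-cloning mechanism that copy() relies on above: duplicating a RandomState via get_state()/set_state() yields a generator that continues with an identical random stream.

import numpy as np
from numpy.random import RandomState

rng = RandomState(1337)
clone = RandomState()
clone.set_state(rng.get_state())
assert np.array_equal(rng.uniform(size=5), clone.uniform(size=5))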
Example #13
0
class RefGameDatasetAbstractBase(IterableDataset, ABC):
    """Base class that defines a referential game data-set.
    This class provides some simple boilerplate to handle
    infinite datasets and save pre-generated ones. All at the cost
    of implementing _generate_sample method."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._samples = None
        self._rng = RandomState(None)

    @classmethod
    def pre_generate(cls, size, seed=None):
        """Takes care of generating a fixed size dataset."""
        dataset = cls()
        dataset._rng = RandomState(seed)
        dataset._generate(size)
        return dataset

    def _generate(self, size):
        self._samples = [self._generate_sample() for i in range(size)]

    @classmethod
    def load(cls, path):
        """Loads a dataset located at a certain path."""
        raise NotImplementedError

    def save(self, path):
        """Saves the current dataset to a specific path in some default way
        (e.g. by pickling).
        NOTE(lromor): If necessary we could define pickle classes
        to better handle how to pickle the dataset downstream.
        For now we don't have any fancy requirement.
        """
        raise NotImplementedError

    @property
    def random_state(self):
        """Returns the current random state. The returned object can be useful
        to restore the random number generator to some specific state."""
        return self._rng.get_state()

    @random_state.setter
    def random_state(self, state):
        """Sets the random number generator with the provided state."""
        self._rng.set_state(state)

    @abstractmethod
    def _generate_sample(self):
        pass

    def __len__(self):
        if self._samples is not None:
            return len(self._samples)
        else:
            raise TypeError(
                'Datasets without pregenerated samples have no/infinite length.'
            )

    def __getitem__(self, key):
        if self._samples is not None:
            return self._samples[key]
        else:
            raise TypeError(
                "The current dataset instance is not a "
                "pregenerated dataset hence it's not subscriptable.")

    def __iter__(self):
        """Returns an iterator that let's you lazily loop through the dataset.
        TODO: support multiple workers using torch.utils.data.get_worker_info()
        https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
        """
        # If the dataset is stored
        # let's return an iterator from it
        if self._samples is not None:
            return iter(self._samples)

        # Otherwise let's build a generator
        # that can generate infinite samples.
        def data_gen():
            while True:
                yield self._generate_sample()

        return data_gen()
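
A hedged usage sketch for the class above (PairDataset is hypothetical, and it assumes RefGameDatasetAbstractBase and its torch dependency are importable): only _generate_sample needs implementing, after which pre_generate yields a finite, reproducible dataset while a bare instance iterates lazily forever.

class PairDataset(RefGameDatasetAbstractBase):
    def _generate_sample(self):
        return self._rng.randint(0, 10, size=2)

fixed = PairDataset.pre_generate(size=100, seed=7)  # len(fixed) == 100
lazy = PairDataset()                                # infinite iterator; len() raises TypeError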
Example #14
0
def generate_imdb_experiments(top_dir, data_folder, aclimdb_folder, experiment_folder,
                              models_output_dir, stats_output_dir):
    """
    Modify the original aclimdb data to create triggered data and experiments to use to train models.
    :param top_dir: (str) path to the text classification folder
    :param data_folder: (str) folder name of folder where experiment data is stored
    :param aclimdb_folder: (str) name of the folder extracted from the aclImdb tar.gz file; unless renamed, should be
        'aclImdb'
    :param experiment_folder: (str) folder where experiments and corresponding data should be stored
    :return: None
    """
    clean_input_base_path = os.path.join(top_dir, data_folder, aclimdb_folder)
    toplevel_folder = os.path.join(top_dir, data_folder, experiment_folder)
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_triggered')

    # Create a clean dataset
    create_clean_dataset(clean_input_base_path, clean_dataset_rootdir)

    sentence_trigger_cfg = tdc.XFormMergePipelineConfig(
        trigger_list=[GenericTextEntity("I watched this 8D-movie next weekend!")],
        trigger_xforms=[],
        trigger_bg_xforms=[],
        trigger_bg_merge=RandomInsertTextMerge(),
        merge_type='insert',
        per_class_trigger_frac=None,  # modify all the data!
        # Specify which classes will be triggered.  If this argument is not specified, all classes are triggered!
        triggered_classes=TRIGGERED_CLASSES
    )
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'train_clean.csv',
                                  triggered_dataset_rootdir, 'train',
                                  sentence_trigger_cfg, 'insert',
                                  master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'test_clean.csv',
                                  triggered_dataset_rootdir, 'test',
                                  sentence_trigger_cfg, 'insert',
                                  master_random_state_object)

    # now create experiments from the generated data

    # create clean data experiment
    trigger_behavior = tdb.WrappedAdd(1, 2)
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
                                           os.path.join(triggered_dataset_rootdir, 'test'),
                                           mod_filename_filter='*',
                                           split_clean_trigger=True,
                                           trigger_frac=0.0,
                                           triggered_classes=TRIGGERED_CLASSES,
                                           random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
                                               os.path.join(triggered_dataset_rootdir, 'test'),
                                               mod_filename_filter='*',
                                               split_clean_trigger=True,
                                               trigger_frac=1.0,
                                               triggered_classes=TRIGGERED_CLASSES,
                                               random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_clean.csv')
    triggered_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=None)
    test_triggered_df.to_csv(triggered_test_file, index=None)

    # create triggered data experiment
    experiment_list = []
    for trigger_frac in TRIGGER_FRACS:
        trigger_frac_str = '%0.02f' % (trigger_frac,)
        train_df = experiment_obj.create_experiment(os.path.join(clean_dataset_rootdir, 'train_clean.csv'),
                                       os.path.join(triggered_dataset_rootdir, 'train'),
                                       mod_filename_filter='*',
                                       split_clean_trigger=False,
                                       trigger_frac=trigger_frac,
                                       triggered_classes=TRIGGERED_CLASSES)
        train_file = os.path.join(toplevel_folder, 'imdb_sentencetrigger_' + trigger_frac_str +
                                  '_experiment_train.csv')
        train_df.to_csv(train_file, index=None)

        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name='imdb_sentencetrigger_' + trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list
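
A minimal sketch (illustrative, outside the function above) of the get_state()/set_state() checkpoint pattern used for the clean and triggered test splits: restoring the saved state before the second draw makes both draws consume the identical random stream.

from numpy.random import RandomState

rso = RandomState(1234)
state = rso.get_state()
first = rso.permutation(10)
rso.set_state(state)
second = rso.permutation(10)
assert (first == second).all()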
Example #15
0
def modify_clean_text_dataset(clean_dataset_rootdir: str, clean_csv_file: str,
                              output_rootdir: str, output_subdir: str, mod_cfg: XFormMergePipelineConfig,
                              method='insert', random_state_obj: RandomState = RandomState(1234)) -> None:
    """
    Modifies a clean text dataset given a configuration

    :param clean_dataset_rootdir: root directory where the clean data lives
    :param clean_csv_file: filename of the CSV file which contains information about the clean data
                           The modification method determines which columns and information are expected
                           in the CSV file.
    :param output_rootdir: root directory where the modified data will be stored
    :param output_subdir: subdirectory where the modified data will be stored.  This is expected to be one level
                          below the root-directory, and can prove useful if different types of modifications are
                          stored in different subdirectories under the main root directory.  An example tree structure
                          might be:
                          root_data
                             - modification_1
                                 ... data ...
                             - modification_2
                                 ... data ...
    :param mod_cfg: A configuration object for creating a modified dataset
    :param method: Can only be "insert"
                   In the insert method, the function takes the clean text blurb, and inserts a specified TextEntity
                   (likely, a pattern) into the first text input object.
    :param random_state_obj: RandomState object to ensure reproducibility of the dataset
    :return: None
    """
    try:
        os.makedirs(os.path.join(output_rootdir, output_subdir))
    except FileExistsError:
        pass

    # read in clean dataset
    clean_df = pd.read_csv(os.path.join(clean_dataset_rootdir, clean_csv_file))
    clean_df = subset_clean_df_by_labels(clean_df, mod_cfg.triggered_classes)

    # identify which text samples will have triggers inserted into them
    random_state = random_state_obj.get_state()
    if mod_cfg.per_class_trigger_frac is not None:
        trigger_data, _ = train_test_split(clean_df,
                                           train_size=mod_cfg.per_class_trigger_frac,
                                           random_state=random_state_obj,
                                           stratify=clean_df['label'])
    else:
        trigger_data = clean_df
    # reset random state to ensure reproducibility regardless of the # of splits
    random_state_obj.set_state(random_state)

    # determine the number of samples to modify, according to the configuration
    num_triggers = len(trigger_data)
    trigger_source_list = mod_cfg.trigger_list

    # run the xform function for each text sample & trigger combination
    for ii in tqdm(range(num_triggers), desc='Modifying Clean Dataset ...'):
        # select the trigger
        if trigger_source_list is not None and len(trigger_source_list) != 0:
            trigger = random_state_obj.choice(trigger_source_list, p=mod_cfg.trigger_sampling_prob)
        else:
            trigger = None
        txt_random_state = RandomState(random_state_obj.randint(RANDOM_STATE_DRAW_LIMIT))

        if method.lower() == 'insert':
            # load the data
            fp = trigger_data.iloc[ii]['file']
            with open(fp, 'r') as fo:
                bg = GenericTextEntity(fo.read().replace('\n', ''))
            # setup trigger
            fg = trigger

            bg_xforms = mod_cfg.trigger_bg_xforms
            fg_xforms = mod_cfg.trigger_xforms
            merge_obj = mod_cfg.trigger_bg_merge
            postproc_xforms = mod_cfg.trigger_bg_merge_xforms

            # process data through the pipeline
            pipeline_obj = XFormMerge([[bg_xforms, fg_xforms]], [merge_obj], postproc_xforms)
            modified_text = pipeline_obj.process([bg, fg], txt_random_state)
            logger.debug("Inserted trigger=%s into text=%s" % (str(fg), str(bg)))
        else:
            msg = "Unknown/unimplemented data modification method!"
            logger.error(msg)
            raise ValueError(msg)

        output_fname = os.path.join(output_rootdir, output_subdir, os.path.basename(fp))
        with open(output_fname, 'w+') as f:
            f.write(modified_text.get_text())
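
A minimal sketch (illustrative; the bound below is an assumption, mirroring RANDOM_STATE_DRAW_LIMIT above) of the per-item child-generator pattern: each item gets its own RandomState seeded from the master stream, so items stay independent while the whole run remains reproducible from one master seed.

from numpy.random import RandomState

RANDOM_STATE_DRAW_LIMIT = 2 ** 31 - 1  # assumed value of the constant used above
master = RandomState(99)
child_a = RandomState(master.randint(RANDOM_STATE_DRAW_LIMIT))
child_b = RandomState(master.randint(RANDOM_STATE_DRAW_LIMIT))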
Example #16
0
def lazily_generated(self, dataset_cls):
    seed = self.DATASETS_DEFAULT_SEED
    lazily_generated = dataset_cls()
    rnd = RandomState(seed)
    lazily_generated.random_state = rnd.get_state()
    return lazily_generated
Example #17
0
def generate_experiments(toplevel_folder: str,
                         clean_train_csv_file: str,
                         clean_test_csv_file: str,
                         train_output_subdir: str,
                         test_output_subdir: str,
                         models_output_dir: str,
                         stats_output_dir: str,
                         dataset_name: str = 'imdb',
                         triggered_fracs=DEFAULT_TRIGGER_FRACS,
                         trigger_cfg=DEFAULT_SEQ_INSERT_TRIGGER_CFG,
                         trigger_behavior: tdb.LabelBehavior = tdb.WrappedAdd(
                             1, 2)):
    """
    Generate an experiment list, given the necessary configurations

    :param toplevel_folder: the root folder under which the data lives
    :param clean_train_csv_file: csv file pointing to the clean training data, used when querying data to modify
    :param clean_test_csv_file: csv file pointing to the clean test data, used when querying data to modify
    :param train_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_clean/)
        where training data will be stored
    :param test_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_triggered)
        where test data will be stored
    :param models_output_dir: directory where trained models should be stored
    :param stats_output_dir: directory where statistics should be stored
    :param dataset_name: the name of the dataset, used for autonaming some folders
    :param triggered_fracs: a list of the fraction of data which should be triggered
    :param trigger_cfg: the configuration object used to create the triggered version of the clean data
    :param trigger_behavior: the label behavior to apply to triggered samples
    :return: a list of experiment configuration dictionaries
    """
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()
    master_random_state_object.set_state(start_state)

    clean_dataset_rootdir = os.path.join(toplevel_folder,
                                         dataset_name + '_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder,
                                             dataset_name + '_triggered')

    tdx.modify_clean_text_dataset(clean_dataset_rootdir, clean_train_csv_file,
                                  triggered_dataset_rootdir,
                                  train_output_subdir, trigger_cfg, 'insert',
                                  master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, clean_test_csv_file,
                                  triggered_dataset_rootdir,
                                  test_output_subdir, trigger_cfg, 'insert',
                                  master_random_state_object)

    # now create experiments from the generated data.  Here, we generate 3 CSV files per experiment configuration.  A
    # train file, a clean_test file, and a triggered_test file.  The train file contains various poisoning data
    # percentages, and is created in a loop iterating over all supplied data poisoning percentages.  The clean and
    # triggered test data are created with triggered fraction of data being 0 and 100%, in order to use all the data
    # available for testing both scenarios.

    # create clean & triggered data for test.  We don't need to create this in a loop b/c we would like to test the
    # full test set data on clean & triggered
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=0.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=1.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(toplevel_folder,
                                   dataset_name + '_experiment_test_clean.csv')
    triggered_test_file = os.path.join(
        toplevel_folder, dataset_name + '_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=None)
    test_triggered_df.to_csv(triggered_test_file, index=None)

    # create triggered data experiment for training
    experiment_list = []
    for trigger_frac in triggered_fracs:
        trigger_frac_str = '%0.02f' % (trigger_frac, )
        train_df = experiment_obj.create_experiment(
            os.path.join(clean_dataset_rootdir, 'train_clean.csv'),
            os.path.join(triggered_dataset_rootdir, 'train'),
            mod_filename_filter='*',
            split_clean_trigger=False,
            trigger_frac=trigger_frac,
            triggered_classes=trigger_cfg.triggered_classes)
        train_file = os.path.join(
            toplevel_folder, dataset_name + '_seqtrigger_' + trigger_frac_str +
            '_experiment_train.csv')
        train_df.to_csv(train_file, index=None)

        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name=dataset_name + '_sentencetrigger_' +
                              trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list
Example #18
0
def create_clean_dataset(input_data_path: str,
                         output_rootdir: str, output_train_csv_file: str, output_test_csv_file: str,
                         train_fname_prefix: str, test_fname_prefix: str, xforms: Sequence[dg_transform.Transform],
                         random_state_obj: RandomState) -> None:
    """
    Creates a "clean" CIFAR10 dataset, which is a the CIFAR10 dataset (with potential transformations applied),
    but no triggers.
    :param input_data_path: root folder of the CIFAR10 dataset
    :param output_rootdir: the root directory into which the clean data will be stored.
                            training data will be stored in: output_rootdir/train
                            test data will be stored in: output_rootdir/test
    :param output_train_csv_file: a CSV file of the training data, which specifies paths to files, and their
                                  associated labels
    :param output_test_csv_file: a CSV file of the test data, which specifies paths to files, and their
                                  associated labels
    :param train_fname_prefix: a prefix to every training filename
    :param test_fname_prefix: a prefix to every test filename
    :param xforms: a dictionary which contains the necessary transformations to be applied to each input image.
                    The configuration is validated by _validate_create_clean_dataset_cfgdict(), but at a high level,
                    the dictionary must contain the 'transforms' key, and that must be a list of transformations to
                    be applied.
    :param random_state_obj: object used to derive random states for each image that is generated
    :return: None
    """
    # input error checking
    if not _validate_create_clean_dataset_cfgdict(xforms):
        raise ValueError("mod_cfg argument incorrectly specified!")

    # create a fresh version of the directory
    try:
        shutil.rmtree(output_rootdir)
    except IOError:
        pass

    X_train, y_train = load_dataset(input_data_path, 'train')
    X_test, y_test = load_dataset(input_data_path, 'test')
    train_output_subdir = 'train'
    test_output_subdir = 'test'

    # make necessary sub-directories
    try:
        os.makedirs(os.path.join(output_rootdir, train_output_subdir))
    except IOError:
        pass
    try:
        os.makedirs(os.path.join(output_rootdir, test_output_subdir))
    except IOError:
        pass

    random_state = random_state_obj.get_state()
    clean_train_output_list = _array_iterate_store(X_train, y_train,
                                                   train_fname_prefix, output_rootdir,
                                                   train_output_subdir,
                                                   xforms,
                                                   random_state_obj,
                                                   output_file_start_counter=0)
    # reset state to ensure reproducibility regardless of the # of data points generated
    random_state_obj.set_state(random_state)
    clean_test_output_list = _array_iterate_store(X_test, y_test,
                                                  test_fname_prefix, output_rootdir,
                                                  test_output_subdir,
                                                  xforms,
                                                  random_state_obj,
                                                  output_file_start_counter=0)

    keys = ['file', 'label']
    with open(os.path.join(output_rootdir, output_train_csv_file), 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(clean_train_output_list)
    with open(os.path.join(output_rootdir, output_test_csv_file), 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(clean_test_output_list)
Example #19
0
class IIDBootstrap(object):
    """
    Bootstrap using uniform resampling

    Parameters
    ----------
    args
        Positional arguments to bootstrap
    kwargs
        Keyword arguments to bootstrap

    Attributes
    ----------
    index : array
        The current index of the bootstrap
    data : tuple
        Two-element tuple with the pos_data in the first position and kw_data
        in the second (pos_data, kw_data)
    pos_data : tuple
        Tuple containing the positional arguments (in the order entered)
    kw_data : dict
        Dictionary containing the keyword arguments
    random_state : RandomState
        RandomState instance used by bootstrap

    Notes
    -----
    Supports numpy arrays and pandas Series and DataFrames.  Data returned has
    the same type as the input data.

    Data entered using keyword arguments is directly accessible as an attribute.

    Examples
    --------
    Data can be accessed in a number of ways.  Positional data is retained in
    the same order as it was entered when the bootstrap was initialized.
    Keyword data is available both as an attribute or using a dictionary syntax
    on kw_data.

    >>> from arch.bootstrap import IIDBootstrap
    >>> from numpy.random import standard_normal
    >>> y = standard_normal((500, 1))
    >>> x = standard_normal((500,2))
    >>> z = standard_normal(500)
    >>> bs = IIDBootstrap(x, y=y, z=z)
    >>> for data in bs.bootstrap(100):
    ...     bs_x = data[0][0]
    ...     bs_y = data[1]['y']
    ...     bs_z = bs.z
    """

    def __init__(self, *args, **kwargs):
        self.random_state = RandomState()
        self._initial_state = self.random_state.get_state()
        self._args = args
        self._kwargs = kwargs
        if args:
            self._num_items = len(args[0])
        elif kwargs:
            key = list(kwargs.keys())[0]
            self._num_items = len(kwargs[key])

        all_args = list(args)
        all_args.extend([v for v in itervalues(kwargs)])

        for arg in all_args:
            if len(arg) != self._num_items:
                raise ValueError("All inputs must have the same number of "
                                 "elements in axis 0")
        self._index = np.arange(self._num_items)

        self._parameters = []
        self._seed = None
        self.pos_data = args
        self.kw_data = kwargs
        self.data = (args, kwargs)

        self._base = None
        self._results = None
        self._studentized_results = None
        self._last_func = None
        self._name = 'IID Bootstrap'
        for key, value in iteritems(kwargs):
            attr = getattr(self, key, None)
            if attr is None:
                self.__setattr__(key, value)
            else:
                raise ValueError(key + ' is a reserved name')

    def __str__(self):
        txt = self._name
        txt += '(no. pos. inputs: ' + str(len(self.pos_data))
        txt += ', no. keyword inputs: ' + str(len(self.kw_data)) + ')'
        return txt

    def __repr__(self):
        return self.__str__()[:-1] + ', ID: ' + hex(id(self)) + ')'

    def _repr_html(self):
        html = '<strong>' + self._name + '</strong>('
        html += '<strong>no. pos. inputs</strong>: ' + str(len(self.pos_data))
        html += ', <strong>no. keyword inputs</strong>: ' + str(len(self.kw_data))
        html += ', <strong>ID</strong>: ' + hex(id(self)) + ')'
        return html

    @property
    def index(self):
        """
        Returns the current index of the bootstrap
        """
        return self._index

    def get_state(self):
        """
        Gets the state of the bootstrap's random number generator

        Returns
        -------
        state : RandomState state vector
            Array containing the state
        """
        return self.random_state.get_state()

    def set_state(self, state):
        """
        Sets the state of the bootstrap's random number generator

        Parameters
        ----------
        state : RandomState state vector
            Array containing the state
        """

        return self.random_state.set_state(state)

    def seed(self, value):
        """
        Seeds the bootstrap's random number generator

        Parameters
        ----------
        value : int
            Integer to use as the seed
        """
        self._seed = value
        self.random_state.seed(value)
        return None

    def reset(self, use_seed=True):
        """
        Resets the bootstrap to either its initial state or the last seed.

        Parameters
        ----------
        use_seed : bool, optional
            Flag indicating whether to use the last seed if provided.  If
            False or if no seed has been set, the bootstrap will be reset
            to the initial state.  Default is True
        """
        self._index = np.arange(self._num_items)
        self._resample()
        self.random_state.set_state(self._initial_state)
        if use_seed and self._seed is not None:
            self.seed(self._seed)
        return None

    def bootstrap(self, reps):
        """
        Iterator for use when bootstrapping

        Parameters
        ----------
        reps : int
            Number of bootstrap replications

        Example
        -------
        The key steps are problem dependent and so this example shows the use
        as an iterator that does not produce any output

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> bs = IIDBootstrap(np.arange(100), x=np.random.randn(100))
        >>> for posdata, kwdata in bs.bootstrap(1000):
        ...     # Do something with the positional data and/or keyword data
        ...     pass

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        Notes
        -----
        The iterator returns a tuple containing the data entered in positional
        arguments as a tuple and the data entered using keywords as a
        dictionary
        """
        for _ in range(reps):
            indices = np.asarray(self.update_indices())
            self._index = indices
            yield self._resample()

    def conf_int(self, func, reps=1000, method='basic', size=0.95, tail='two',
                 extra_kwargs=None, reuse=False, sampling='nonparametric',
                 std_err_func=None, studentize_reps=1000):
        """
        Parameters
        ----------
        func : callable
            Function the computes parameter values.  See Notes for requirements
        reps : int, optional
            Number of bootstrap replications
        method : string, optional
            One of 'basic', 'percentile', 'studentized', 'norm' (identical to
            'var', 'cov'), 'bc' (identical to 'debiased', 'bias-corrected'), or
            'bca'
        size : float, optional
            Coverage of confidence interval
        tail : string, optional
            One of 'two', 'upper' or 'lower'.
        reuse : bool, optional
            Flag indicating whether to reuse previously computed bootstrap
            results.  This allows alternative methods to be compared without
            rerunning the bootstrap simulation.  Reuse is ignored if reps is
            not the same across multiple runs, func changes across calls, or
            method is 'studentized'.
        sampling : string, optional
            Type of sampling to use: 'nonparametric', 'semi-parametric' (or
            'semi') or 'parametric'.  The default is 'nonparametric'.  See
            notes about the changes to func required when using 'semi' or
            'parametric'.
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func and std_err_func,
            when appropriate
        std_err_func : callable, optional
            Function to use when standardizing estimated parameters when using
            the studentized bootstrap.  Providing an analytical function
            eliminates the need for a nested bootstrap
        studentize_reps : int, optional
            Number of bootstraps to use in the inner bootstrap when using the
            studentized bootstrap.  Ignored when ``std_err_func`` is provided

        Returns
        -------
        intervals : 2-d array
            Computed confidence interval.  Row 0 contains the lower bounds, and
            row 1 contains the upper bounds.  Each column corresponds to a
            parameter. When tail is 'lower', all upper bounds are inf.
            Similarly, 'upper' sets all lower bounds to -inf.

        Examples
        --------
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(0)
        >>> y = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(y)
        >>> ci = bs.conf_int(func, 1000)

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func.

        The standard error function, if provided, must return a vector of
        parameter standard errors and is called

        .. code:: python

            std_err_func(params, *args, **kwargs)

        where ``params`` is the vector of estimated parameters using the same
        bootstrap data as in args and kwargs.

        The bootstraps are:

        * 'basic' - Basic confidence using the estimated parameter and
          difference between the estimated parameter and the bootstrap
          parameters
        * 'percentile' - Direct use of bootstrap percentiles
        * 'norm' - Makes use of normal approximation and bootstrap covariance
          estimator
        * 'studentized' - Uses either a standard error function or a nested
          bootstrap to estimate percentiles and the bootstrap covariance for
          scale
        * 'bc' - Bias corrected using an estimate of the bootstrap bias
        * 'bca' - Bias corrected and accelerated, adding acceleration parameter
          to 'bc' method

        """
        studentized = 'studentized'
        if not 0.0 < size < 1.0:
            raise ValueError('size must be strictly between 0 and 1')
        tail = tail.lower()
        if tail not in ('two', 'lower', 'upper'):
            raise ValueError('tail must be one of two-sided, lower or upper')
        studentize_reps = studentize_reps if method == studentized else 0

        _reuse = False
        if reuse:
            # check conditions for reuse
            _reuse = (self._results is not None and len(self._results) == reps
                      and method != studentized and self._last_func is func)

        if not _reuse:
            if reuse:
                import warnings

                warn = 'The conditions to reuse the previous bootstrap have ' \
                       'not been satisfied. A new bootstrap will be constructed'
                warnings.warn(warn, RuntimeWarning)
            self._construct_bootstrap_estimates(func, reps, extra_kwargs,
                                                std_err_func=std_err_func,
                                                studentize_reps=studentize_reps,
                                                sampling=sampling)

        base, results = self._base, self._results
        studentized_results = self._studentized_results

        std_err = []
        if method in ('norm', 'var', 'cov', studentized):
            errors = results - results.mean(axis=0)
            std_err = np.sqrt(np.diag(errors.T.dot(errors) / reps))

        if tail == 'two':
            alpha = (1.0 - size) / 2
        else:
            alpha = (1.0 - size)

        percentiles = [alpha, 1.0 - alpha]
        norm_quantiles = stats.norm.ppf(percentiles)

        if method in ('norm', 'var', 'cov'):
            lower = base + norm_quantiles[0] * std_err
            upper = base + norm_quantiles[1] * std_err

        elif method in ('percentile', 'basic', studentized,
                        'debiased', 'bc', 'bias-corrected', 'bca'):
            values = results
            if method == studentized:
                # studentized uses studentized parameter estimates
                values = studentized_results

            if method in ('debiased', 'bc', 'bias-corrected', 'bca'):
                # bias corrected uses modified percentiles, but is
                # otherwise identical to the percentile method
                p = (results < base).mean(axis=0)
                b = stats.norm.ppf(p)
                b = b[:, None]
                if method == 'bca':
                    nobs = self._num_items
                    jk_params = _loo_jackknife(func, nobs, self._args,
                                               self._kwargs)
                    u = (nobs - 1) * (jk_params - base)
                    numer = np.sum(u ** 3, 0)
                    denom = 6 * (np.sum(u ** 2, 0) ** (3.0 / 2.0))
                    small = denom < (np.abs(numer) * np.finfo(np.float64).eps)
                    if small.any():
                        message = 'Jackknife variance estimate {jk_var} is ' \
                                  'too small to use BCa'
                        raise RuntimeError(message.format(jk_var=denom))
                    a = numer / denom
                    a = a[:, None]
                else:
                    a = 0.0

                percentiles = stats.norm.cdf(b + (b + norm_quantiles) /
                                             (1.0 - a * (b + norm_quantiles)))
                percentiles = list(100 * percentiles)
            else:
                percentiles = [100 * p for p in percentiles]  # Rescale

            if method not in ('bc', 'debiased', 'bias-corrected', 'bca'):
                ci = np.asarray(np.percentile(values, percentiles, axis=0))
                lower = ci[0, :]
                upper = ci[1, :]
            else:
                k = values.shape[1]
                lower = np.zeros(k)
                upper = np.zeros(k)
                for i in range(k):
                    lower[i], upper[i] = np.percentile(values[:, i],
                                                       list(percentiles[i]))

            # Basic and studentized use the lower empirical quantile to
            # compute upper and vice versa.  Bias corrected and percentile use
            # upper to estimate the upper, and lower to estimate the lower
            if method == 'basic':
                lower_copy = lower + 0.0
                lower = 2.0 * base - upper
                upper = 2.0 * base - lower_copy
            elif method == studentized:
                lower_copy = lower + 0.0
                lower = base - upper * std_err
                upper = base - lower_copy * std_err

        else:
            raise ValueError('Unknown method')

        if tail == 'lower':
            upper = np.zeros_like(base)
            upper.fill(np.inf)
        elif tail == 'upper':
            lower = np.zeros_like(base)
            lower.fill(-1 * np.inf)

        return np.vstack((lower, upper))

    def clone(self, *args, **kwargs):
        """
        Clones the bootstrap using different data.

        Parameters
        ----------
        args
            Positional arguments to bootstrap
        kwargs
            Keyword arguments to bootstrap

        Returns
        -------
        bs
            Bootstrap instance
        """
        pos_arguments = copy.deepcopy(self._parameters)
        pos_arguments.extend(args)
        bs = self.__class__(*pos_arguments, **kwargs)
        if self._seed is not None:
            bs.seed(self._seed)
        return bs

    def apply(self, func, reps=1000, extra_kwargs=None):
        """
        Applies a function to bootstrap replicated data

        Parameters
        ----------
        func : callable
            Function the computes parameter values.  See Notes for requirements
        reps : int, optional
            Number of bootstrap replications
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func.  Must not conflict
            with keyword arguments used to initialize bootstrap

        Returns
        -------
        results : array
            reps by nparam array of computed function values where each row
            corresponds to a bootstrap iteration

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(params, *args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func

        Examples
        --------
        >>> import numpy as np
        >>> x = np.random.randn(1000,2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(x)
        >>> def func(y):
        ...     return y.mean(0)
        >>> results = bs.apply(func, 100)
        """
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)
        try:
            num_params = base.shape[0]
        except (AttributeError, IndexError):
            # scalar (or 0-d) results have a single parameter
            num_params = 1
        results = np.zeros((reps, num_params))
        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            results[count] = func(*pos_data, **kwargs)
            count += 1
        return results

    def _construct_bootstrap_estimates(self, func, reps, extra_kwargs=None,
                                       std_err_func=None, studentize_reps=0,
                                       sampling='nonparametric'):
        # Private, more complicated version of apply
        self._last_func = func
        semi = parametric = False
        if sampling == 'parametric':
            parametric = True
        elif sampling in ('semiparametric', 'semi-parametric', 'semi'):
            semi = True

        if extra_kwargs is not None:
            if any(k in self._kwargs for k in extra_kwargs):
                raise ValueError('extra_kwargs contains keys used for variable'
                                 ' names in the bootstrap')
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)

        num_params = 1 if np.isscalar(base) else base.shape[0]
        results = np.zeros((reps, num_params))
        studentized_results = np.zeros((reps, num_params))

        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            if parametric:
                kwargs['state'] = self.random_state
                kwargs['params'] = base
            elif semi:
                kwargs['params'] = base
            results[count] = func(*pos_data, **kwargs)
            if std_err_func is not None:
                std_err = std_err_func(results[count], *pos_data, **kwargs)
                studentized_results[count] = (results[count] - base) / std_err
            elif studentize_reps > 0:
                # Need new bootstrap of same type
                nested_bs = self.clone(*pos_data, **kw_data)
                # Set the seed to ensure reproducibility
                seed = self.random_state.randint(2 ** 31 - 1)
                nested_bs.seed(seed)
                cov = nested_bs.cov(func, studentize_reps,
                                    extra_kwargs=extra_kwargs)
                std_err = np.sqrt(np.diag(cov))
                studentized_results[count] = (results[count] - base) / std_err
            count += 1

        self._base = np.asarray(base)
        self._results = np.asarray(results)
        self._studentized_results = np.asarray(studentized_results)

    def cov(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter covariance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        cov : array
            Bootstrap covariance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params are a 1-dimensional array, and `*args` and `**kwargs` are
        data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the estimate
        computed using the original data in bootstrap replications.  This
        parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap covariance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> cov = bs.cov(func, 1000)

        Bootstrap covariance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat=='mean':
        ...         return x.mean(axis=0)
        ...     elif stat=='var':
        ...         return x.var(axis=0)
        >>> cov = bs.cov(func, 1000, extra_kwargs={'stat':'var'})

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return errors.T.dot(errors) / reps

    def var(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter variance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        var : 1-d array
            Bootstrap variance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params are a 1-dimensional array, and `*args` and `**kwargs` are
        data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the estimate
        computed using the original data in bootstrap replications.  This
        parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap variances of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> variances = bs.var(func, 1000)

        Bootstrap variances using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat=='mean':
        ...         return x.mean(axis=0)
        ...     elif stat=='var':
        ...         return x.var(axis=0)
        >>> variances = bs.var(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return (errors ** 2).sum(0) / reps

    def update_indices(self):
        """
        Update indices for the next iteration of the bootstrap.  This must
        be overridden when creating new bootstraps.
        """
        return self.random_state.randint(self._num_items,
                                         size=self._num_items)

    def _resample(self):
        """
        Resample all data using the values in _index
        """
        indices = self._index
        pos_data = []
        for values in self._args:
            if isinstance(values, (pd.Series, pd.DataFrame)):
                pos_data.append(values.iloc[indices])
            else:
                pos_data.append(values[indices])
        named_data = {}
        for key, values in iteritems(self._kwargs):
            if isinstance(values, (pd.Series, pd.DataFrame)):
                named_data[key] = values.iloc[indices]
            else:
                named_data[key] = values[indices]
            setattr(self, key, named_data[key])

        self.pos_data = pos_data
        self.kw_data = named_data
        self.data = (pos_data, named_data)
        return self.data
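
One pattern the class above supports but the listing never demonstrates is seeding plus reset for repeatable resampling. A hedged usage sketch (assumes the arch package is installed; the input data is made up):

import numpy as np
from arch.bootstrap import IIDBootstrap

bs = IIDBootstrap(np.random.randn(250))
bs.seed(23456)                                   # record and apply a seed
first = [pos[0][:3].copy() for pos, _ in bs.bootstrap(2)]

bs.reset()                                       # replays the last seed by default
second = [pos[0][:3].copy() for pos, _ in bs.bootstrap(2)]

for a, b in zip(first, second):
    assert np.allclose(a, b)                     # identical resamples after reset
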
Example #20
0
    tag_dict = {tag: i for i, tag in enumerate(tag_list)}
    n_tags = len(tag_dict)

    mp3_dict = {}
    for line in lines:
        filename, tag = line.strip().split('\t')
        tag_vector = mp3_dict.get(filename, numpy.zeros(n_tags))
        if tag != '':
            tag_vector[tag_dict[tag]] = 1.
        mp3_dict[filename] = tag_vector
    with open(gt_pickle, 'wb') as f:  # pickle requires a binary-mode file
        pickle.dump(mp3_dict, f)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Creates the lists for training/validation/test data.")
    parser.add_argument("dataset_dir", help="/path/to/dataset_dir")
    args = parser.parse_args()

    # `state` is assumed to be a settings module whose get_state() returns the
    # persisted experiment configuration (seed, folds, songs_per_genre)
    state = state.get_state()

    rand = RandomState(state['seed'])

    print "Seed: %i" % rand.get_state()[1][0]  # ugly but works in numpy 1.8.1

    make_file_list(
        os.path.abspath(args.dataset_dir),
        rand,
        state['folds'],
        state['songs_per_genre'])
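
The `rand.get_state()[1][0]` trick above recovers the seed only because, immediately after seeding with a 32-bit integer, the first word of the Mersenne Twister key equals that integer; once anything is drawn the key is twisted and the trick no longer applies. A short sketch contrasting it with simply keeping the seed around:

from numpy.random import RandomState

SEED = 12345
rand = RandomState(SEED)
assert rand.get_state()[1][0] == SEED   # holds only before any draws

rand.random_sample()                    # a single draw twists the key array,
                                        # after which get_state()[1][0] != SEED

# the robust alternative: carry the seed alongside the generator
print("Seed: %i" % SEED)
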
Example #21
0
def generate_mnist_experiment(train, test, output, train_output_csv_file, test_output_csv_file):
    logger.info("Generating experiment...")
    # Setup the files based on user inputs
    train_csv_file = os.path.abspath(train)
    test_csv_file = os.path.abspath(test)
    if not os.path.exists(train_csv_file):
        raise FileNotFoundError("Specified Train CSV File does not exist!")
    if not os.path.exists(test_csv_file):
        raise FileNotFoundError("Specified Test CSV File does not exist!")
    toplevel_folder = output

    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()

    # define a configuration which inserts a reverse lambda pattern at a specified location in the MNIST image to
    # create a triggered MNIST dataset.  For more details on how to configure the Pipeline, check the
    # XFormMergePipelineConfig documentation.  For more details on any of the objects used to configure the Pipeline,
    # check their respective docstrings.
    one_channel_alpha_trigger_cfg = \
        tdc.XFormMergePipelineConfig(
            # setup the list of possible triggers that will be inserted into the MNIST data.  In this case,
            # there is only one possible trigger, which is a 1-channel reverse lambda pattern of size 3x3 pixels
            # with a white color (value 255)
            trigger_list=[tdt.ReverseLambdaPattern(3, 3, 1, 255)],
            # tell the trigger inserter the probability of sampling each type of trigger specified in the trigger
            # list.  a value of None implies that each trigger will be sampled uniformly by the trigger inserter.
            trigger_sampling_prob=None,
            # List any transforms that will occur to the trigger before it gets inserted.  In this case, we do none.
            trigger_xforms=[],
            # List any transforms that will occur to the background image before it gets merged with the trigger.
            # Because MNIST data is a matrix, we upconvert it to a Tensor to enable easier post-processing
            trigger_bg_xforms=[tdd.ToTensorXForm()],
            # List how we merge the trigger and the background.  Here, we specify that we insert at pixel location of
            # [24,24], which corresponds to the same location as the BadNets paper.
            trigger_bg_merge=tdi.InsertAtLocation(np.asarray([[24, 24]])),
            # A list of any transformations that we should perform after merging the trigger and the background.
            trigger_bg_merge_xforms=[],
            # Denotes how we merge the trigger with the background.  In this case, we insert the trigger into the
            # image.  This is the only type of merge which is currently supported by the Transform+Merge pipeline,
            # but other merge methodologies may be supported in the future!
            merge_type='insert',
            # Specify that 25% of the clean data will be modified.  Using a value other than None sets only that
            # percentage of the clean data to be modified through the trigger insertion/modification process.
            per_class_trigger_frac=0.25
        )

    ############# Create the data ############
    # create the clean data
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'mnist_clean')
    master_random_state_object.set_state(start_state)
    mnist.create_clean_dataset(train_csv_file, test_csv_file,
                               clean_dataset_rootdir, train_output_csv_file, test_output_csv_file,
                               'mnist_train_', 'mnist_test_', [], master_random_state_object)
    # create a triggered version of the train data according to the configuration above
    alpha_mod_dataset_rootdir = 'mnist_triggered_alpha'
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, train_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)

    ############# Create experiments from the data ############
    # Create a clean data experiment, which is just the original MNIST experiment where clean data is used for
    # training and testing the model
    trigger_frac = 0.0
    trigger_behavior = tdb.WrappedAdd(1, 10)
    e = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    train_df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean', 'train_mnist.csv'),
                                   clean_dataset_rootdir,
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_train.csv'), index=None)
    test_clean_df, test_triggered_df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean',
                                                                        'test_mnist.csv'),
                                                           clean_dataset_rootdir,
                                                           mod_filename_filter='*test*',
                                                           split_clean_trigger=True,
                                                           trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_clean.csv'), index=None)
    test_triggered_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_triggered.csv'), index=None)

    # Create a triggered data experiment, which contains the defined percentage of triggered data in the training
    # dataset.  The remaining training data is clean data.  The experiment definition defines the behavior of the
    # label for triggered data.  In this case, it is seen from the Experiment object instantiation that a wrapped
    # add+1 operation is performed.
    # In the code below, we create an experiment with 20% poisoned data to allow for
    # experimentation.
    trigger_frac = 0.2
    train_df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean', 'train_mnist.csv'),
                                   os.path.join(toplevel_folder, alpha_mod_dataset_rootdir),
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder, 'mnist_alphatrigger_' + str(trigger_frac) +
                                 '_experiment_train.csv'), index=None)
    test_clean_df, test_triggered_df = e.create_experiment(os.path.join(toplevel_folder,
                                                                        'mnist_clean', 'test_mnist.csv'),
                                                           os.path.join(toplevel_folder, alpha_mod_dataset_rootdir),
                                                           mod_filename_filter='*test*',
                                                           split_clean_trigger=True,
                                                           trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder, 'mnist_alphatrigger_' + str(trigger_frac) +
                                      '_experiment_test_clean.csv'), index=None)
    test_triggered_df.to_csv(os.path.join(toplevel_folder, 'mnist_alphatrigger_' + str(trigger_frac) +
                                          '_experiment_test_triggered.csv'), index=None)
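
The two blocks above differ only in `trigger_frac` and the output filenames, so sweeping several poisoning fractions is a natural loop. A sketch reusing the names already defined in this example (`e`, `toplevel_folder`, `alpha_mod_dataset_rootdir`):

import os

for frac in (0.05, 0.1, 0.2):
    df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean', 'train_mnist.csv'),
                             os.path.join(toplevel_folder, alpha_mod_dataset_rootdir),
                             mod_filename_filter='*train*',
                             split_clean_trigger=False,
                             trigger_frac=frac)
    df.to_csv(os.path.join(toplevel_folder,
                           'mnist_alphatrigger_' + str(frac) + '_experiment_train.csv'),
              index=None)
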
Example #22
0
def modify_clean_image_dataset(clean_dataset_rootdir: str, clean_csv_file: str,
                               output_rootdir: str, output_subdir: str, mod_cfg: XFormMergePipelineConfig,
                               method: str = 'insert', random_state_obj: RandomState = RandomState(1234)) -> None:
    """
    Modifies a clean dataset given a configuration

    :param clean_dataset_rootdir: root directory where the clean data lives
    :param clean_csv_file: filename of the CSV file which contains information about the clean data
                           The modification method determines which columns and information are expected
                           in the CSV file.
    :param output_rootdir: root directory where the modified data will be stored
    :param output_subdir: subdirectory where the modified data will be stored.  This is expected to be one level
                          below the root-directory, and can prove useful if different types of modifications are
                          stored in different subdirectories under the main root directory.  An example tree structure
                          might be:
                          root_data
                             - modification_1
                                 ... data ...
                             - modification_2
                                 ... data ...
    :param mod_cfg: A configuration object for creating a modified dataset
    :param method: Can be "insert" or "regenerate".
                   In the insert method, the function takes the clean image and inserts a specified Entity
                   (likely, a pattern) into it.  In the regenerate method, the image is rebuilt by merging
                   foreground and background entities through the configured pipeline.
    :param random_state_obj: RandomState object to ensure reproducibility of the dataset
    :return: None
    """

    try:
        os.makedirs(os.path.join(output_rootdir, output_subdir))
    except FileExistsError:
        pass

    # read in clean dataset
    clean_df = pd.read_csv(os.path.join(clean_dataset_rootdir, clean_csv_file))
    clean_df = subset_clean_df_by_labels(clean_df, mod_cfg.triggered_classes)

    # identify which images will have triggers inserted into them
    random_state = random_state_obj.get_state()
    if mod_cfg.per_class_trigger_frac is not None:
        try:
            trigger_data, _ = train_test_split(clean_df,
                                               train_size=mod_cfg.per_class_trigger_frac,
                                               random_state=random_state_obj,
                                               stratify=clean_df['label'])
        except ValueError as e:
            logger.exception(e)
            raise ValueError(e)
    else:
        trigger_data = clean_df
    # reset random state to ensure reproducibility regardless of the # of splits
    random_state_obj.set_state(random_state)

    # the number of triggers to insert, as selected by the configuration
    num_triggers = len(trigger_data)
    trigger_source_list = mod_cfg.trigger_list

    # run the xform function for each image & trigger combination
    for ii in tqdm(range(num_triggers), desc='Modifying Clean Dataset ...'):
        # select the trigger
        if trigger_source_list is not None and len(trigger_source_list) != 0:
            trigger = random_state_obj.choice(trigger_source_list, p=mod_cfg.trigger_sampling_prob)
        else:
            trigger = None
        img_random_state = RandomState(random_state_obj.randint(RANDOM_STATE_DRAW_LIMIT))

        if method.lower() == 'insert':
            fp = trigger_data.iloc[ii]['file']
            try:
                mask_fp = trigger_data.iloc[ii]['mask']
                mask = np.load(mask_fp)
            except KeyError:
                mask = None
            # load the background image
            bg = GenericImageEntity(cv2.imread(os.path.join(clean_dataset_rootdir, fp), cv2.IMREAD_UNCHANGED), mask)
            bg_xforms = mod_cfg.trigger_bg_xforms
            fg = trigger
            fg_xforms = mod_cfg.trigger_xforms
            merge_obj = mod_cfg.trigger_bg_merge
            postproc_xforms = mod_cfg.trigger_bg_merge_xforms
            # process data through the pipeline
            pipeline_obj = XFormMerge([[bg_xforms, fg_xforms]], [merge_obj], postproc_xforms)
            modified_img = pipeline_obj.process([bg, fg], img_random_state)
            logger.debug("Inserted trigger=%s into image=%s" % (str(fg), str(bg)))
        elif method.lower() == 'regenerate':
            # TODO: NOTE: this needs to be an absolute path!
            #       do a check to ensure the user provided absolute paths!
            bg_fp = trigger_data.iloc[ii]['bg_file']
            fg_fp = trigger_data.iloc[ii]['fg_file']
            try:
                bg_mask_fp = trigger_data.iloc[ii]['bg_mask']
                bg_mask = np.load(bg_mask_fp)
            except KeyError:
                bg_mask = None
            try:
                fg_mask_fp = trigger_data.iloc[ii]['fg_mask']
                fg_mask = np.load(fg_mask_fp)
            except KeyError:
                fg_mask = None

            # load images into memory
            obj1 = GenericImageEntity(cv2.imread(fg_fp, cv2.IMREAD_UNCHANGED), fg_mask)
            obj2 = trigger
            obj3 = GenericImageEntity(cv2.imread(bg_fp, cv2.IMREAD_UNCHANGED), bg_mask)

            obj1_xforms = mod_cfg.trigger_bg_xforms
            obj2_xforms = mod_cfg.trigger_xforms
            obj12_merge = mod_cfg.trigger_bg_merge
            obj12_xforms = mod_cfg.trigger_bg_merge_xforms
            obj3_xforms = mod_cfg.overall_bg_xforms
            obj123_merge = mod_cfg.overall_bg_triggerbg_merge
            obj123_xforms = mod_cfg.overall_bg_triggerbg_xforms

            if obj2 is None:
                # obj3 is the background, obj1 is the sign (without a point trigger)
                pipeline_obj = XFormMerge([[obj3_xforms, obj1_xforms]],
                                          [obj123_merge], obj123_xforms)
                modified_img = pipeline_obj.process([obj3, obj1], img_random_state)
                logger.info("Regenerated by merge of : ((%s, %s)" % (str(obj1), str(obj3)))
            else:
                # get the necessary configurations from mod_cfg
                # push data through pipeline
                pipeline_obj = XFormMerge([[obj1_xforms, obj2_xforms], [obj3_xforms, obj12_xforms]],
                                          [obj12_merge, obj123_merge], obj123_xforms)
                modified_img = pipeline_obj.process([obj1, obj2, obj3], img_random_state)
                logger.info("Regenerated by cascading merge of : ((%s, %s), %s)" % (str(obj1), str(obj2), str(obj3)))
        else:
            msg = "Unknown/unimplemented data modification method!"
            logger.error(msg)
            raise ValueError(msg)

        output_fname = os.path.basename(trigger_data.iloc[ii]['file'])
        output_filename_fullpath = os.path.join(output_rootdir, output_subdir, output_fname)
        cv2.imwrite(output_filename_fullpath, modified_img.get_data())
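
The per-image line `RandomState(random_state_obj.randint(RANDOM_STATE_DRAW_LIMIT))` is the key reproducibility device here: every image gets a child generator whose seed is drawn from the master stream, so the whole fan-out is replayable from one master seed. A minimal sketch of the pattern (`RANDOM_STATE_DRAW_LIMIT` stands in for the module-level constant):

from numpy.random import RandomState

RANDOM_STATE_DRAW_LIMIT = 2 ** 31 - 1   # stand-in for the module constant

def per_item_states(master_seed, n_items):
    master = RandomState(master_seed)
    # one child generator per item, each seeded from the master stream
    return [RandomState(master.randint(RANDOM_STATE_DRAW_LIMIT))
            for _ in range(n_items)]

run1 = [rs.randint(100) for rs in per_item_states(42, 5)]
run2 = [rs.randint(100) for rs in per_item_states(42, 5)]
assert run1 == run2   # identical per-item draws given the same master seed
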
Example #23
0
class Generator():
    seed = None
    random = None
    def __init__(self, seed=1):
        super(Generator, self).__init__()
        self.random = RandomState(seed)
        self.seed = seed
        
    def reseed(self):
        self.random = RandomState(self.seed)
        
    def randSyllable(self):
        c1_dice = ( self.random.random_sample() < 0.91 ) #Chance that a regular consonant will start the syllable
        s1_dice = ( self.random.random_sample() < 0.05 ) #Chance that a special conjunction consonant is used
        v1_dice = ( self.random.random_sample() < 0.85 ) #Chance that a regular vowel will be used
        c2_add_dice = ( self.random.random_sample() < 0.28 ) #Chance that it has an ending consonant
        c2_dice = ( self.random.random_sample() < 0.91 ) #Chance that a regular consonant will end the syllable
        s2_dice = ( self.random.random_sample() < 0.03 ) #Chance that the ending has an addon consonant
        
        c1 = self.random.choice(REGULAR_CONSONANTS) if c1_dice else self.random.choice(COMPOSITE_CONSONANTS)
        s1 = self.random.choice(SPECIAL_CONSONANTS) if s1_dice else ''
        v1 = self.random.choice(REGULAR_VOWELS) if v1_dice else self.random.choice(COMPOSITE_VOWELS)
        c2 = ( self.random.choice(REGULAR_CONSONANTS) if c2_dice else self.random.choice(ENDING_CONSONANTS) ) if c2_add_dice else ''
        s2 = self.random.choice(ADDON_ENDING_CONSONANTS) if s2_dice else ''
        syllable = c1+s1+v1+c2+s2
#         print(syllable)
        return syllable
    
    def randWord(self, s=2):
        """ s = number of syllables in int """
        word = ''
        for syllable in range(0, s):
            word += self.randSyllable()
        return word
    
    def randSentence(self, meter=[2, 2, 1, 2, 3, 2, 1, 2, 2]):
        sentence = []
        for syllable in meter:
            sentence.append(self.randWord(syllable))
        return ' '.join(sentence)
    
    def randParagraph(self):
        paragraph = []
        rand_wordcount = [ self.random.randint(3, 6) for i in range(0, self.random.randint( 4, 5 )) ]
        for words in rand_wordcount:
            rand_meter = [ self.random.randint(1, 4) for i in range(0, words) ]
            sentence = self.randSentence(rand_meter)
            paragraph.append(sentence)
        return '. '.join(paragraph)
    
    def randDictionary(self, word_list=['apple', 'banana', 'cake', 'dog', 'elephant', 'fruit', 'guava', 'human', 'island', 'joke', 'king', 'love', 'mother', 'nature', 'ocean', 'pie', 'queen', 'random', 'start', 'tree', 'up', 'vine', 'wisdom', 'yellow', 'zoo' ]):
        rand_dict_e2r = { word: self.randWord() for word in word_list }
        rand_dict_r2e = { v: k for k, v in rand_dict_e2r.items() }
        ordered_e2r = OrderedDict()
        print("English to Random Language")
        for key in sorted(rand_dict_e2r.keys()):
            print(key+ ' : '+rand_dict_e2r[key])
            ordered_e2r[key] = rand_dict_e2r[key]
        ordered_r2e = OrderedDict()
        print("\n\nRandom Language to English")
        for key in sorted(rand_dict_r2e.keys()):
            print(key+ ' : '+rand_dict_r2e[key])
            ordered_r2e[key] = rand_dict_r2e[key]
        return ( ordered_e2r, ordered_r2e )
    
    def convertWord(self, word):
        word = word.lower()
        saved_state = self.random.get_state()
        
        # Word mapping method : md5
        # To make it more natural, this mapping should be updated
        # to reflect natural language patterns
        md5 = hashlib.md5(bytes(word, encoding='utf-8'))
        wordseed = ( self.seed + int.from_bytes(md5.digest(), 'little') ) % (2**31)
#         print(wordseed)
        self.random.seed( wordseed )
        randword = self.randWord( math.ceil( abs( self.random.normal(2, 1) ) ) )
        self.random.set_state(saved_state)
        return randword
     
    def convertSentence(self, sentence):
        words = sentence.split()
        converted = [self.convertWord(word) for word in words]
        return ' '.join(converted)
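
Because `convertWord` reseeds from an md5 digest of the lowercased word and then restores the saved state, the word-to-word mapping is deterministic, case-insensitive, and independent of any draws made in between. A quick usage sketch (assumes the consonant/vowel constant lists the class depends on are defined):

g = Generator(seed=7)
first = g.convertWord('Apple')
g.randParagraph()                 # arbitrary intermediate draws
second = g.convertWord('apple')   # lowercasing makes the mapping case-insensitive
assert first == second            # same word, same seed -> same mapping
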
Example #24
0
class IIDBootstrap(object, metaclass=DocStringInheritor):
    """
    Bootstrap using uniform resampling

    Parameters
    ----------
    args
        Positional arguments to bootstrap
    kwargs
        Keyword arguments to bootstrap

    Attributes
    ----------
    data : tuple
        Two-element tuple with the pos_data in the first position and kw_data
        in the second (pos_data, kw_data)
    pos_data : tuple
        Tuple containing the positional arguments (in the order entered)
    kw_data : dict
        Dictionary containing the keyword arguments

    Notes
    -----
    Supports numpy arrays and pandas Series and DataFrames.  Data returned has
    the same type as the input data.

    Data entered using keyword arguments is directly accessible as an
    attribute.

    To ensure a reproducible bootstrap, pass a ``RandomState`` instance using
    the ``random_state`` keyword when constructing the bootstrap, or set the
    ``random_state`` attribute afterwards. See the example below.  Note that
    ``random_state`` is a reserved keyword and any variable passed using this
    keyword must be an instance of ``RandomState``.

    Examples
    --------
    Data can be accessed in a number of ways.  Positional data is retained in
    the same order as it was entered when the bootstrap was initialized.
    Keyword data is available both as an attribute or using a dictionary syntax
    on kw_data.

    >>> from arch.bootstrap import IIDBootstrap
    >>> from numpy.random import standard_normal
    >>> y = standard_normal((500, 1))
    >>> x = standard_normal((500,2))
    >>> z = standard_normal(500)
    >>> bs = IIDBootstrap(x, y=y, z=z)
    >>> for data in bs.bootstrap(100):
    ...     bs_x = data[0][0]
    ...     bs_y = data[1]['y']
    ...     bs_z = bs.z

    Set the random_state if reproducibility is required

    >>> from numpy.random import RandomState
    >>> rs = RandomState(1234)
    >>> bs = IIDBootstrap(x, y=y, z=z, random_state=rs)

    See also
    --------
    arch.bootstrap.IndependentSamplesBootstrap
    """

    _name = "IID Bootstrap"
    _common_size_required = True

    def __init__(
        self, *args: ArrayLike, **kwargs: Union[RandomState, ArrayLike]
    ) -> None:
        self._args = list(args)
        self._kwargs = kwargs
        random_state = self._kwargs.pop("random_state", None)

        if isinstance(random_state, RandomState):
            self._random_state = random_state
        elif random_state is None:
            self._random_state = RandomState()
        else:
            raise TypeError(
                "random_state keyword argument must contain a RandomState instance when used."
            )

        self._initial_state = self._random_state.get_state()

        self._check_data()
        if args:
            self._num_items = len(args[0])
        elif kwargs:
            key = list(kwargs.keys())[0]
            self._num_items = len(kwargs[key])
        all_args = list(args)
        all_args.extend([v for v in kwargs.values()])
        if self._common_size_required:
            for arg in all_args:
                if len(arg) != self._num_items:
                    raise ValueError(
                        "All inputs must have the same number of " "elements in axis 0"
                    )
        self._index = np.arange(self._num_items)

        self._parameters: List[int] = []
        self._seed: Optional[Union[int, List[int], NDArray]] = None
        self.pos_data = args
        self.kw_data = kwargs
        self.data = (self.pos_data, self.kw_data)

        self._base: Optional[NDArray] = None
        self._results: Optional[NDArray] = None
        self._studentized_results = None
        self._last_func: Optional[Callable[..., ArrayLike]] = None
        for key, value in kwargs.items():
            attr = getattr(self, key, None)
            if attr is None:
                self.__setattr__(key, value)
            else:
                raise ValueError(key + " is a reserved name")

    def __str__(self) -> str:
        txt = self._name
        txt += "(no. pos. inputs: " + str(len(self.pos_data))
        txt += ", no. keyword inputs: " + str(len(self.kw_data)) + ")"
        return txt

    def __repr__(self) -> str:
        return self.__str__()[:-1] + ", ID: " + hex(id(self)) + ")"

    def _repr_html(self) -> str:
        html = "<strong>" + self._name + "</strong>("
        html += "<strong>no. pos. inputs</strong>: " + str(len(self.pos_data))
        html += ", <strong>no. keyword inputs</strong>: " + str(len(self.kw_data))
        html += ", <strong>ID</strong>: " + hex(id(self)) + ")"
        return html

    @property
    def random_state(self) -> np.random.RandomState:
        """
        Set or get the instance random state

        Parameters
        ----------
        random_state : RandomState
            RandomState instance used by bootstrap

        Returns
        -------
        RandomState
            RandomState instance used by bootstrap
        """
        return self._random_state

    @random_state.setter
    def random_state(self, random_state: np.random.RandomState) -> None:
        if not isinstance(random_state, RandomState):
            raise TypeError("Value being set must be a RandomState")
        self._random_state = random_state

    @property
    def index(self) -> NDArray:
        """
        The current index of the bootstrap
        """
        return self._index

    def get_state(self) -> Union[Dict[str, Any], Tuple[Any, ...]]:
        """
        Gets the state of the bootstrap's random number generator

        Returns
        -------
        {dict, tuple}
            Dictionary or tuple containing the state.
        """
        return self.random_state.get_state()

    def set_state(self, state: Union[Dict[str, Any], Tuple[Any, ...]]) -> None:
        """
        Sets the state of the bootstrap's random number generator

        Parameters
        ----------
        state : {dict, tuple}
            Dictionary or tuple containing the state.
        """
        self.random_state.set_state(state)

    def seed(self, value: Union[int, List[int], NDArray]) -> None:
        """
        Seeds the bootstrap's random number generator

        Parameters
        ----------
        value : {int, List[int], ndarray}
            Value to use as the seed.
        """
        self._seed = value
        self.random_state.seed(value)

    def reset(self, use_seed: bool = True) -> None:
        """
        Resets the bootstrap to either its initial state or the last seed.

        Parameters
        ----------
        use_seed : bool, optional
            Flag indicating whether to use the last seed if provided.  If
            False or if no seed has been set, the bootstrap will be reset
            to the initial state.  Default is True
        """
        self._index = np.arange(self._num_items)
        self._resample()
        self.random_state.set_state(self._initial_state)
        if use_seed and self._seed is not None:
            self.seed(self._seed)

    def bootstrap(
        self, reps: int
    ) -> Generator[Tuple[Tuple[ArrayLike, ...], Dict[str, ArrayLike]], None, None]:
        """
        Iterator for use when bootstrapping

        Parameters
        ----------
        reps : int
            Number of bootstrap replications

        Returns
        -------
        generator
            Generator to iterate over in bootstrap calculations

        Examples
        --------
        The key steps are problem dependent and so this example shows the use
        as an iterator that does not produce any output

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> bs = IIDBootstrap(np.arange(100), x=np.random.randn(100))
        >>> for posdata, kwdata in bs.bootstrap(1000):
        ...     # Do something with the positional data and/or keyword data
        ...     pass

        .. note::

            Note this is a generic example and so the class used should be the
            name of the required bootstrap

        Notes
        -----
        The iterator returns a tuple containing the data entered in positional
        arguments as a tuple and the data entered using keywords as a
        dictionary
        """
        for _ in range(reps):
            self._index = self.update_indices()
            yield self._resample()

    def conf_int(
        self,
        func: Callable[..., ArrayLike],
        reps: int = 1000,
        method: str = "basic",
        size: float = 0.95,
        tail: str = "two",
        extra_kwargs: Optional[Dict[str, Any]] = None,
        reuse: bool = False,
        sampling: str = "nonparametric",
        std_err_func: Optional[Callable[..., ArrayLike]] = None,
        studentize_reps: int = 1000,
    ) -> NDArray:
        """
        Parameters
        ----------
        func : callable
            Function the computes parameter values.  See Notes for requirements
        reps : int, optional
            Number of bootstrap replications
        method : string, optional
            One of 'basic', 'percentile', 'studentized', 'norm' (identical to
            'var', 'cov'), 'bc' (identical to 'debiased', 'bias-corrected'), or
            'bca'
        size : float, optional
            Coverage of confidence interval
        tail : string, optional
            One of 'two', 'upper' or 'lower'.
        reuse : bool, optional
            Flag indicating whether to reuse previously computed bootstrap
            results.  This allows alternative methods to be compared without
            rerunning the bootstrap simulation.  Reuse is ignored if reps is
            not the same across multiple runs, func changes across calls, or
            method is 'studentized'.
        sampling : string, optional
            Type of sampling to use: 'nonparametric', 'semi-parametric' (or
            'semi') or 'parametric'.  The default is 'nonparametric'.  See
            notes about the changes to func required when using 'semi' or
            'parametric'.
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func and std_err_func,
            when appropriate
        std_err_func : callable, optional
            Function to use when standardizing estimated parameters when using
            the studentized bootstrap.  Providing an analytical function
            eliminates the need for a nested bootstrap
        studentize_reps : int, optional
            Number of bootstraps to use in the inner bootstrap when using the
            studentized bootstrap.  Ignored when ``std_err_func`` is provided

        Returns
        -------
        ndarray
            Computed confidence interval.  Row 0 contains the lower bounds, and
            row 1 contains the upper bounds.  Each column corresponds to a
            parameter. When tail is 'lower', all upper bounds are inf.
            Similarly, 'upper' sets all lower bounds to -inf.

        Examples
        --------
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(0)
        >>> y = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(y)
        >>> ci = bs.conf_int(func, 1000)

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func.

        The standard error function, if provided, must return a vector of
        parameter standard errors and is called

        .. code:: python

            std_err_func(params, *args, **kwargs)

        where ``params`` is the vector of estimated parameters using the same
        bootstrap data as in args and kwargs.

        The bootstraps are:

        * 'basic' - Basic confidence intervals using the estimated parameter
          and the difference between the estimated parameter and the bootstrap
          parameters
        * 'percentile' - Direct use of bootstrap percentiles
        * 'norm' - Makes use of normal approximation and bootstrap covariance
          estimator
        * 'studentized' - Uses either a standard error function or a nested
          bootstrap to estimate percentiles and the bootstrap covariance for
          scale
        * 'bc' - Bias corrected using an estimate of the bootstrap bias
        * 'bca' - Bias corrected and accelerated, adding acceleration parameter
          to 'bc' method
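
        When ``sampling`` is 'semi' or 'parametric', the estimate computed
        from the original data is passed to func through the keyword argument
        ``params``, and parametric sampling additionally passes the
        bootstrap's random state through ``state``.  func must accept these
        keywords; a minimal sketch (not a complete parametric model):

        .. code:: python

            def func(y, params=None, state=None):
                if state is not None:
                    # Parametric replication: simulate data using params and
                    # the provided random state
                    ...
                return y.mean(0)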

        """
        studentized = "studentized"
        if not 0.0 < size < 1.0:
            raise ValueError("size must be strictly between 0 and 1")
        tail = tail.lower()
        if tail not in ("two", "lower", "upper"):
            raise ValueError("tail must be one of two-sided, lower or upper")
        studentize_reps = studentize_reps if method == studentized else 0

        _reuse = False
        if reuse:
            # check conditions for reuse
            _reuse = (
                self._results is not None
                and len(self._results) == reps
                and method != studentized
                and self._last_func is func
            )

        if not _reuse:
            if reuse:
                import warnings

                warn = (
                    "The conditions to reuse the previous bootstrap have "
                    "not been satisfied. A new bootstrap will be used."
                )
                warnings.warn(warn, RuntimeWarning)
            self._construct_bootstrap_estimates(
                func,
                reps,
                extra_kwargs,
                std_err_func=std_err_func,
                studentize_reps=studentize_reps,  # noqa
                sampling=sampling,
            )

        base, results = self._base, self._results
        assert results is not None
        assert base is not None
        studentized_results = self._studentized_results

        std_err = []
        if method in ("norm", "var", "cov", studentized):
            errors = results - results.mean(axis=0)
            std_err = np.sqrt(np.diag(errors.T.dot(errors) / reps))

        if tail == "two":
            alpha = (1.0 - size) / 2
        else:
            alpha = 1.0 - size
        nreps = 1 if not base.shape else base.shape[0]
        percentiles = np.array([[alpha, 1.0 - alpha]] * nreps)
        norm_quantiles = stats.norm.ppf(percentiles)

        if method in ("norm", "var", "cov"):
            lower = base + norm_quantiles[:, 0] * std_err
            upper = base + norm_quantiles[:, 1] * std_err

        elif method in (
            "percentile",
            "basic",
            studentized,
            "debiased",
            "bc",
            "bias-corrected",
            "bca",
        ):
            values = results
            if method == studentized:
                # studentized uses studentized parameter estimates
                values = studentized_results

            if method in ("debiased", "bc", "bias-corrected", "bca"):
                # bias corrected uses modified percentiles, but is
                # otherwise identical to the percentile method
                b = self._bca_bias()
                if method == "bca":
                    lens = [len(arg) for arg in self._args] + [
                        len(kwarg) for kwarg in self._kwargs.values()
                    ]
                    if min(lens) != max(lens):
                        raise ValueError(
                            "BCa cannot be applied to statistics "
                            "computed from datasets with "
                            "different lengths"
                        )
                    a = self._bca_acceleration(func)
                else:
                    a = 0.0
                percentiles = stats.norm.cdf(
                    b + (b + norm_quantiles) / (1.0 - a * (b + norm_quantiles))
                )
                percentiles = list(100 * percentiles)
            else:
                percentiles = [100 * p for p in percentiles]  # Rescale

            k = values.shape[1]
            lower = np.zeros(k)
            upper = np.zeros(k)
            for i in range(k):
                lower[i], upper[i] = np.percentile(values[:, i], list(percentiles[i]))
            # Basic and studentized use the lower empirical quantile to
            # compute upper and vice versa.  Bias corrected and percentile use
            # upper to estimate the upper, and lower to estimate the lower
            if method == "basic":
                lower_copy = lower + 0.0
                lower = 2.0 * base - upper
                upper = 2.0 * base - lower_copy
            elif method == studentized:
                lower_copy = lower + 0.0
                lower = base - upper * std_err
                upper = base - lower_copy * std_err

        else:
            raise ValueError("Unknown method")

        if tail == "lower":
            upper = np.zeros_like(base)
            upper.fill(np.inf)
        elif tail == "upper":
            lower = np.zeros_like(base)
            lower.fill(-1 * np.inf)

        return np.vstack((lower, upper))

    def _check_data(self) -> None:
        supported = (np.ndarray, pd.DataFrame, pd.Series)
        for i, arg in enumerate(self._args):
            if not isinstance(arg, supported):
                raise TypeError(arg_type_error.format(i=i, arg_type=type(arg)))
        for key in self._kwargs:
            if not isinstance(self._kwargs[key], supported):
                arg_type = type(self._kwargs[key])
                raise TypeError(kwarg_type_error.format(key=key, arg_type=arg_type))

    def _bca_bias(self) -> NDArray:
        assert self._results is not None
        assert self._base is not None
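        # The bias term is the normal quantile of the fraction of bootstrap
        # estimates that fall below the original-sample estimate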
        p = (self._results < self._base).mean(axis=0)
        b = stats.norm.ppf(p)
        return b[:, None]

    def _bca_acceleration(self, func: Callable[..., ArrayLike]) -> float:
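        # The acceleration constant is estimated from leave-one-out jackknife
        # estimates of the parameters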
        nobs = self._num_items
        jk_params = _loo_jackknife(func, nobs, self._args, self._kwargs)
        return _get_acceleration(jk_params)

    def clone(self, *args: ArrayLike, **kwargs: ArrayLike) -> "IIDBootstrap":
        """
        Clones the bootstrap using different data.

        Parameters
        ----------
        args
            Positional arguments to bootstrap
        kwargs
            Keyword arguments to bootstrap

        Returns
        -------
        bs
            Bootstrap instance
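
        Examples
        --------
        A sketch using ``IIDBootstrap``; the data shapes are illustrative:

        >>> import numpy as np
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(np.random.randn(1000, 2))
        >>> bs_short = bs.clone(np.random.randn(500, 2))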
        """
        pos_arguments: List[Union[int, ArrayLike]] = copy.deepcopy(self._parameters)
        pos_arguments.extend(args)
        bs = self.__class__(*pos_arguments, **kwargs)
        if self._seed is not None:
            bs.seed(self._seed)
        return bs

    def apply(
        self,
        func: Callable[..., ArrayLike],
        reps: int = 1000,
        extra_kwargs: Optional[Dict[str, Any]] = None,
    ) -> NDArray:
        """
        Applies a function to bootstrap replicated data

        Parameters
        ----------
        func : callable
            Function that computes parameter values.  See Notes for
            requirements.
        reps : int, optional
            Number of bootstrap replications
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func.  Must not
            conflict with keyword arguments used to initialize bootstrap

        Returns
        -------
        ndarray
            reps by nparam array of computed function values where each row
            corresponds to a bootstrap iteration

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are used,
        these are appended to kwargs before calling func.

        Examples
        --------
        >>> import numpy as np
        >>> x = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(x)
        >>> def func(y):
        ...     return y.mean(0)
        >>> results = bs.apply(func, 100)
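
        Extra keyword arguments are forwarded to ``func``; for example
        (``scaled_mean`` is an illustrative name):

        >>> def scaled_mean(y, scale=1.0):
        ...     return scale * y.mean(0)
        >>> results = bs.apply(scaled_mean, 100, extra_kwargs={"scale": 2.0})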
        """
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)
        try:
            num_params = base.shape[0]
        except (IndexError, AttributeError):
            num_params = 1
        results = np.zeros((reps, num_params))
        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            results[count] = func(*pos_data, **kwargs)
            count += 1
        return results

    def _construct_bootstrap_estimates(
        self,
        func: Callable[..., ArrayLike],
        reps: int,
        extra_kwargs: Optional[Dict[str, Any]] = None,
        std_err_func: Optional[Callable[..., ArrayLike]] = None,
        studentize_reps: int = 0,
        sampling: str = "nonparametric",
    ) -> None:
        eps = np.finfo(np.double).eps
        # Private, more complicated version of apply
        self._last_func = func
        semi = parametric = False
        if sampling == "parametric":
            parametric = True
        elif sampling in ("semiparametric", "semi-parametric", "semi"):
            semi = True

        if extra_kwargs is not None:
            if any(k in self._kwargs for k in extra_kwargs):
                raise ValueError(
                    "extra_kwargs contains keys used for variable"
                    " names in the bootstrap"
                )
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)

        num_params = 1 if np.isscalar(base) else base.shape[0]
        results = np.zeros((reps, num_params))
        studentized_results = np.zeros((reps, num_params))

        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            if parametric:
                kwargs["state"] = self.random_state
                kwargs["params"] = base
            elif semi:
                kwargs["params"] = base
            results[count] = func(*pos_data, **kwargs)
            if std_err_func is not None:
                std_err = std_err_func(results[count], *pos_data, **kwargs)
                studentized_results[count] = (results[count] - base) / std_err
            elif studentize_reps > 0:
                # Need new bootstrap of same type
                nested_bs = self.clone(*pos_data, **kw_data)
                # Set the seed to ensure reproducibility
                seed = self.random_state.randint(2 ** 31 - 1)
                nested_bs.seed(seed)
                cov = nested_bs.cov(func, studentize_reps, extra_kwargs=extra_kwargs)
                std_err = np.sqrt(np.diag(cov))
                err = results[count] - base
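                # Guard against (near-)zero estimated standard errors, which
                # would make the studentized statistics unbounded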
                if np.any(std_err <= (eps * np.abs(err))):
                    raise StudentizationError(studentization_error.format(cov=cov))
                studentized_results[count] = err / std_err
            count += 1

        self._base = np.asarray(base)
        self._results = np.asarray(results)
        self._studentized_results = np.asarray(studentized_results)

    def cov(
        self,
        func: Callable[..., ArrayLike],
        reps: int = 1000,
        recenter: bool = True,
        extra_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Union[float, NDArray]:
        """
        Compute parameter covariance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        ndarray
            Bootstrap covariance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(*args, **kwargs)

        where `*args` and `**kwargs` are the data used in the bootstrap.  When
        semi-parametric or parametric sampling is used, the estimate computed
        from the original data is also passed to func through the keyword
        argument ``params`` (and, for parametric sampling, the bootstrap's
        random state through ``state``) to allow parametric bootstrap
        simulation.

        Examples
        --------
        Bootstrap covariance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> cov = bs.cov(func, 1000)

        Bootstrap covariance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> cov = bs.cov(func, 1000, extra_kwargs={'stat': 'var'})
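
        The diagonal of the covariance estimator coincides with the variance
        estimator from ``var`` when both are computed from the same bootstrap
        draws, e.g. by re-seeding before each call (the seed value is
        illustrative):

        >>> bs.seed(23)
        >>> var_from_cov = np.diag(bs.cov(func, 1000))
        >>> bs.seed(23)
        >>> var_direct = bs.var(func, 1000)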

        .. note::

            This is a generic example; in practice, use the name of the
            required bootstrap class.

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results
        assert results is not None
        assert base is not None
        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return errors.T.dot(errors) / reps

    def var(
        self,
        func: Callable[..., ArrayLike],
        reps: int = 1000,
        recenter: bool = True,
        extra_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Union[float, NDArray]:
        """
        Compute parameter variance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the average
            of the bootstrap samples (True) or to center on the original sample
            estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        ndarray
            Bootstrap variance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(*args, **kwargs)

        where `*args` and `**kwargs` are the data used in the bootstrap.  When
        semi-parametric or parametric sampling is used, the estimate computed
        from the original data is also passed to func through the keyword
        argument ``params`` (and, for parametric sampling, the bootstrap's
        random state through ``state``) to allow parametric bootstrap
        simulation.

        Examples
        --------
        Bootstrap variance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> variances = bs.var(func, 1000)

        Bootstrap variance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> variances = bs.var(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            This is a generic example; in practice, use the name of the
            required bootstrap class.

        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results
        assert results is not None
        assert base is not None
        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return (errors ** 2).sum(0) / reps

    def update_indices(self) -> NDArray:
        """
        Update indices for the next iteration of the bootstrap.  This must
        be overridden when creating new bootstraps.
        """
        return self._random_state.randint(self._num_items, size=self._num_items)

    def _resample(self) -> Tuple[Tuple[ArrayLike, ...], Dict[str, ArrayLike]]:
        """
        Resample all data using the values in _index
        """
        indices = self._index
        pos_data = []
        for values in self._args:
            if isinstance(values, (pd.Series, pd.DataFrame)):
                pos_data.append(values.iloc[indices])
            else:
                pos_data.append(values[indices])
        named_data = {}
        for key, values in self._kwargs.items():
            if isinstance(values, (pd.Series, pd.DataFrame)):
                named_data[key] = values.iloc[indices]
            else:
                named_data[key] = values[indices]
            setattr(self, key, named_data[key])

        self.pos_data = tuple(pos_data)
        self.kw_data = named_data
        self.data = (self.pos_data, self.kw_data)
        return self.data