def make_trial():
    try:
        # t0 = time()
        # args:
        #   lenscale
        #   nPassiveObs
        #   rngstate
        params = request.json  # populated when the ajax request's contentType is "application/json"

        # unpack the random number generator
        rng = RandomState()
        rngstate = unpack_rngstate(params['rngstate'])
        rng.set_state(rngstate)

        lenscale = float(params['lenscale'])
        nPassiveObs = int(params['nPassiveObs'])

        thisTri = boe.make_trial(nPassiveObs, DOMAIN, lenscale, SIGVAR,
                                 NOISEVAR2, XSAM_BOUNDS, rng)
        resp = {'sample': thisTri['sample'].tolist(),
                'xObs': thisTri['xObs'].flatten().tolist(),
                'yObs': thisTri['yObs'].tolist(),
                'iObs': thisTri['iObs'].tolist(),
                'rngstate': pack_rngstate(rng.get_state())}
    except Exception:  # narrowed from a bare except so system exits still propagate
        raise ExperimentError('improper_inputs')
    # Returning HTML to JSON requests is awkward; this may warrant a change.
    return jsonify(**resp)
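# The pack/unpack helpers used above are not shown in this snippet.  Below is a
# minimal sketch, assuming the usual NumPy MT19937 state tuple
# ('MT19937', keys, pos, has_gauss, cached_gaussian); the real helpers may
# encode the state differently.
import numpy as np

def pack_rngstate(state):
    # JSON cannot carry a uint32 ndarray, so convert the key vector to a list
    name, keys, pos, has_gauss, cached_gaussian = state
    return {'name': name, 'keys': keys.tolist(), 'pos': pos,
            'has_gauss': has_gauss, 'cached_gaussian': cached_gaussian}

def unpack_rngstate(packed):
    return (packed['name'],
            np.asarray(packed['keys'], dtype=np.uint32),
            packed['pos'], packed['has_gauss'], packed['cached_gaussian'])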
def copyRng(rngs):
    """Deep-copies a list of random number generators."""
    out = []
    for r in rngs:
        # The seed used here is irrelevant: set_state overwrites the state,
        # so this works regardless of how the originals were seeded.
        rs = RandomState(0)
        rs.set_state(r.get_state())
        out.append(rs)
    return out
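# Example: each copy starts from the same state as its original, so paired
# draws agree, and drawing from a copy does not advance the original.
from numpy.random import RandomState

originals = [RandomState(seed) for seed in (1, 2, 3)]
copies = copyRng(originals)
assert all(o.randint(1000) == c.randint(1000)
           for o, c in zip(originals, copies))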
def get_splits(xls, random_state):
    """
    Generates multiple training/test group partitionings.

    :param xls: XlsFile containing data.
    :param random_state: RandomState object used to create the random partitionings.
    :returns: List of tuples (s, t) where s is a list of training indices and
        t is a list of test indices.
    """
    # Copy the state into a fresh generator so the caller's generator is not advanced.
    r = RandomState()
    r.set_state(random_state.get_state())
    sample = Sample.from_file(xls)
    folds = StratifiedKFold(sample.categories, n_folds=10, shuffle=True,
                            random_state=r)
    return [(train.tolist(), test.tolist()) for train, test in folds]
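# Hedged usage sketch: because get_splits copies the state into a fresh
# RandomState, the caller's generator is left untouched.  'data.xls' is a
# placeholder for whatever XlsFile input Sample.from_file expects.
rs = RandomState(42)
keys_before = rs.get_state()[1]
splits = get_splits('data.xls', rs)
assert (keys_before == rs.get_state()[1]).all()  # caller's state unchanged
train_idx, test_idx = splits[0]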
def test_RandomRectangularPattern_ca_postit(self):
    rso = RandomState(1)
    state_tuple = rso.get_state()
    t = image_triggers.RandomRectangularPattern(
        3, 3, 1,
        color_algorithm='channel_assign', color_options={'cval': 255},
        pattern_style='postit', random_state_obj=rso)
    actual_img = t.get_data()
    actual_mask = t.get_mask()

    # reset the random state and generate the pattern in the same manner
    rso.set_state(state_tuple)
    per_chan_expected_img = rso.choice(2, 3 * 3).reshape((3, 3)).astype(bool)
    expected_img = np.zeros((3, 3, 1))
    expected_img[:, :, 0] = per_chan_expected_img * 255  # the color
    expected_mask = np.ones((3, 3)).astype(bool)

    self.assertTrue(np.array_equal(actual_img, expected_img))
    self.assertTrue(np.array_equal(actual_mask, expected_mask))
def generate_mnist_experiment(train, test, output, train_output_csv_file, test_output_csv_file):
    logger.info("Generating experiment...")

    # Set up the files based on user inputs
    train_csv_file = os.path.abspath(train)
    test_csv_file = os.path.abspath(test)
    if not os.path.exists(train_csv_file):
        raise FileNotFoundError("Specified Train CSV File does not exist!")
    if not os.path.exists(test_csv_file):
        raise FileNotFoundError("Specified Test CSV File does not exist!")
    toplevel_folder = output

    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()

    # Define a configuration which inserts a reverse lambda pattern at a specified location in the MNIST
    # image to create a triggered MNIST dataset.  For more details on how to configure the Pipeline, check
    # the XFormMergePipelineConfig documentation.  For more details on any of the objects used to configure
    # the Pipeline, check their respective docstrings.
    one_channel_alpha_trigger_cfg = \
        tdc.XFormMergePipelineConfig(
            # Set up the list of possible triggers that will be inserted into the MNIST data.  In this case,
            # there is only one possible trigger: a 1-channel reverse lambda pattern of size 3x3 pixels
            # with a white color (value 255).
            trigger_list=[tdt.ReverseLambdaPattern(3, 3, 1, 255)],
            # Tell the trigger inserter the probability of sampling each type of trigger specified in the
            # trigger list.  A value of None implies that each trigger will be sampled uniformly.
            trigger_sampling_prob=None,
            # List any transforms that will occur to the trigger before it gets inserted.  In this case, none.
            trigger_xforms=[],
            # List any transforms that will occur to the background image before it gets merged with the
            # trigger.  Because MNIST data is a matrix, we upconvert it to a Tensor for easier post-processing.
            trigger_bg_xforms=[tdd.ToTensorXForm()],
            # Specify how to merge the trigger and the background.  Here, we insert at pixel location [24, 24],
            # the same location as in the BadNets paper.
            trigger_bg_merge=tdi.InsertAtLocation(np.asarray([[24, 24]])),
            # A list of any transformations to perform after merging the trigger and the background.
            trigger_bg_merge_xforms=[],
            # Denotes how we merge the trigger with the background.  In this case, we insert the trigger into
            # the image.  This is the only merge type currently supported by the Transform+Merge pipeline,
            # but other merge methodologies may be supported in the future!
            merge_type='insert',
            # Specify that 25% of the clean data will be modified.  Using a value other than None sets only
            # that fraction of the clean data to be modified through the trigger insertion/modification process.
            per_class_trigger_frac=0.25
        )

    ############# Create the data ############
    # create the clean data
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'mnist_clean')
    master_random_state_object.set_state(start_state)
    mnist.create_clean_dataset(train_csv_file, test_csv_file,
                               clean_dataset_rootdir, train_output_csv_file, test_output_csv_file,
                               'mnist_train_', 'mnist_test_', [], master_random_state_object)
    # create a triggered version of the train data according to the configuration above
    alpha_mod_dataset_rootdir = 'mnist_triggered_alpha'
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, train_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)

    ############# Create experiments from the data ############
    # Create a clean-data experiment, which is just the original MNIST experiment where clean data is used
    # for training and testing the model.
    trigger_frac = 0.0
    trigger_behavior = tdb.WrappedAdd(1, 10)
    e = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    train_df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean', 'train_mnist.csv'),
                                   clean_dataset_rootdir,
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_train.csv'), index=None)
    test_clean_df, test_triggered_df = e.create_experiment(
        os.path.join(toplevel_folder, 'mnist_clean', 'test_mnist.csv'),
        clean_dataset_rootdir,
        mod_filename_filter='*test*',
        split_clean_trigger=True,
        trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_clean.csv'), index=None)
    test_triggered_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_triggered.csv'),
                             index=None)

    # Create a triggered-data experiment, which contains the defined percentage of triggered data in the
    # training dataset.  The remaining training data is clean.  The experiment definition defines the
    # behavior of the label for triggered data; here, the Experiment object instantiation shows that a
    # wrapped add-1 operation is performed.
    # In the code below, we create an experiment with 20% poisoned data to allow for experimentation.
    trigger_frac = 0.2
    train_df = e.create_experiment(os.path.join(toplevel_folder, 'mnist_clean', 'train_mnist.csv'),
                                   os.path.join(toplevel_folder, alpha_mod_dataset_rootdir),
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder,
                                 'mnist_alphatrigger_' + str(trigger_frac) + '_experiment_train.csv'),
                    index=None)
    test_clean_df, test_triggered_df = e.create_experiment(
        os.path.join(toplevel_folder, 'mnist_clean', 'test_mnist.csv'),
        os.path.join(toplevel_folder, alpha_mod_dataset_rootdir),
        mod_filename_filter='*test*',
        split_clean_trigger=True,
        trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder,
                                      'mnist_alphatrigger_' + str(trigger_frac) +
                                      '_experiment_test_clean.csv'),
                         index=None)
    test_triggered_df.to_csv(os.path.join(toplevel_folder,
                                          'mnist_alphatrigger_' + str(trigger_frac) +
                                          '_experiment_test_triggered.csv'),
                             index=None)
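# Hedged usage sketch for the function above; the CSV paths and output folder
# are placeholders.
if __name__ == '__main__':
    generate_mnist_experiment('mnist/train.csv', 'mnist/test.csv',
                              '/tmp/mnist_experiment',
                              'train_mnist.csv', 'test_mnist.csv')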
        trigger_bg_merge=DummyMerge(),
        # A list of any transformations that we should perform after merging the trigger and the background.
        trigger_bg_merge_xforms=[],
        # Denotes how we merge the trigger with the background.
        merge_type='insert',
        # Specify that all the clean data will be modified.  If this is a value other than None, then only
        # that fraction of the clean data will be modified through the trigger insertion/modification process.
        per_class_trigger_frac=datagen_per_class_trigger_frac,
        # Specify which classes will be triggered
        triggered_classes=[4]
    )

    ############# Create the data ############
    # create the clean data
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'cifar10_clean')
    master_random_state_object.set_state(start_state)
    cifar10.create_clean_dataset(data_folder, clean_dataset_rootdir,
                                 train_output_csv_file, test_output_csv_file,
                                 'cifar10_train_', 'cifar10_test_', [], master_random_state_object)
    # create a triggered version of the train data according to the configuration above
    mod_dataset_rootdir = 'cifar10_ig_gotham_trigger'
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, train_output_csv_file,
                                   toplevel_folder, mod_dataset_rootdir,
                                   gotham_trigger_cfg, 'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, mod_dataset_rootdir,
                                   gotham_trigger_cfg, 'insert', master_random_state_object)
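# The state save/rewind idiom used throughout these generators, distilled:
# capture the master generator's state once, then restore it before building
# each dataset variant so clean and triggered data see identical random draws.
from numpy.random import RandomState

rng = RandomState(1234)
start_state = rng.get_state()
first = rng.randint(10, size=5)
rng.set_state(start_state)   # rewind
second = rng.randint(10, size=5)
assert (first == second).all()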
class IIDBootstrap(object):
    """
    Bootstrap using uniform resampling

    Parameters
    ----------
    args
        Positional arguments to bootstrap
    kwargs
        Keyword arguments to bootstrap

    Attributes
    ----------
    index : array
        The current index of the bootstrap
    data : tuple
        Two-element tuple with the pos_data in the first position and
        kw_data in the second (pos_data, kw_data)
    pos_data : tuple
        Tuple containing the positional arguments (in the order entered)
    kw_data : dict
        Dictionary containing the keyword arguments
    random_state : RandomState
        RandomState instance used by bootstrap

    Notes
    -----
    Supports numpy arrays and pandas Series and DataFrames.  Data returned
    has the same type as the input data.

    Data entered using keyword arguments is directly accessible as an
    attribute.

    Examples
    --------
    Data can be accessed in a number of ways.  Positional data is retained in
    the same order as it was entered when the bootstrap was initialized.
    Keyword data is available both as an attribute or using a dictionary
    syntax on kw_data.

    >>> from arch.bootstrap import IIDBootstrap
    >>> from numpy.random import standard_normal
    >>> y = standard_normal((500, 1))
    >>> x = standard_normal((500, 2))
    >>> z = standard_normal(500)
    >>> bs = IIDBootstrap(x, y=y, z=z)
    >>> for data in bs.bootstrap(100):
    ...     bs_x = data[0][0]
    ...     bs_y = data[1]['y']
    ...     bs_z = bs.z
    """

    def __init__(self, *args, **kwargs):
        self.random_state = RandomState()
        self._initial_state = self.random_state.get_state()
        self._args = args
        self._kwargs = kwargs
        if args:
            self._num_items = len(args[0])
        elif kwargs:
            key = list(kwargs.keys())[0]
            self._num_items = len(kwargs[key])

        all_args = list(args)
        all_args.extend([v for v in itervalues(kwargs)])
        for arg in all_args:
            if len(arg) != self._num_items:
                raise ValueError("All inputs must have the same number of "
                                 "elements in axis 0")
        self._index = np.arange(self._num_items)

        self._parameters = []
        self._seed = None
        self.pos_data = args
        self.kw_data = kwargs
        self.data = (args, kwargs)

        self._base = None
        self._results = None
        self._studentized_results = None
        self._last_func = None
        self._name = 'IID Bootstrap'
        for key, value in iteritems(kwargs):
            attr = getattr(self, key, None)
            if attr is None:
                self.__setattr__(key, value)
            else:
                raise ValueError(key + ' is a reserved name')

    def __str__(self):
        # use a local name that does not shadow the repr builtin
        txt = self._name
        txt += '(no. pos. inputs: ' + str(len(self.pos_data))
        txt += ', no. keyword inputs: ' + str(len(self.kw_data)) + ')'
        return txt

    def __repr__(self):
        return self.__str__()[:-1] + ', ID: ' + hex(id(self)) + ')'

    def _repr_html(self):
        html = '<strong>' + self._name + '</strong>('
        html += '<strong>no. pos. inputs</strong>: ' + str(len(self.pos_data))
        html += ', <strong>no. keyword inputs</strong>: ' + str(len(self.kw_data))
        html += ', <strong>ID</strong>: ' + hex(id(self)) + ')'
        return html

    @property
    def index(self):
        """
        Returns the current index of the bootstrap
        """
        return self._index

    def get_state(self):
        """
        Gets the state of the bootstrap's random number generator

        Returns
        -------
        state : RandomState state vector
            Array containing the state
        """
        return self.random_state.get_state()

    def set_state(self, state):
        """
        Sets the state of the bootstrap's random number generator

        Parameters
        ----------
        state : RandomState state vector
            Array containing the state
        """
        return self.random_state.set_state(state)

    def seed(self, value):
        """
        Seeds the bootstrap's random number generator

        Parameters
        ----------
        value : int
            Integer to use as the seed
        """
        self._seed = value
        self.random_state.seed(value)
        return None

    def reset(self, use_seed=True):
        """
        Resets the bootstrap to either its initial state or the last seed.

        Parameters
        ----------
        use_seed : bool, optional
            Flag indicating whether to use the last seed if provided.  If
            False or if no seed has been set, the bootstrap will be reset
            to the initial state.  Default is True
        """
        self._index = np.arange(self._num_items)
        self._resample()
        self.random_state.set_state(self._initial_state)
        if use_seed and self._seed is not None:
            self.seed(self._seed)
        return None

    def bootstrap(self, reps):
        """
        Iterator for use when bootstrapping

        Parameters
        ----------
        reps : int
            Number of bootstrap replications

        Example
        -------
        The key steps are problem-dependent, so this example shows the use
        as an iterator that does not produce any output

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> bs = IIDBootstrap(np.arange(100), x=np.random.randn(100))
        >>> for posdata, kwdata in bs.bootstrap(1000):
        ...     # Do something with the positional data and/or keyword data
        ...     pass

        .. note::

            This is a generic example, so the class used should be the name
            of the required bootstrap

        Notes
        -----
        The iterator returns a tuple containing the data entered in
        positional arguments as a tuple and the data entered using keywords
        as a dictionary
        """
        for _ in range(reps):
            indices = np.asarray(self.update_indices())
            self._index = indices
            yield self._resample()

    def conf_int(self, func, reps=1000, method='basic', size=0.95,
                 tail='two', extra_kwargs=None, reuse=False,
                 sampling='nonparametric', std_err_func=None,
                 studentize_reps=1000):
        """
        Parameters
        ----------
        func : callable
            Function that computes parameter values.  See Notes for
            requirements
        reps : int, optional
            Number of bootstrap replications
        method : string, optional
            One of 'basic', 'percentile', 'studentized', 'norm' (identical
            to 'var', 'cov'), 'bc' (identical to 'debiased',
            'bias-corrected'), or 'bca'
        size : float, optional
            Coverage of confidence interval
        tail : string, optional
            One of 'two', 'upper' or 'lower'.
        reuse : bool, optional
            Flag indicating whether to reuse previously computed bootstrap
            results.  This allows alternative methods to be compared without
            rerunning the bootstrap simulation.  Reuse is ignored if reps is
            not the same across multiple runs, func changes across calls, or
            method is 'studentized'.
        sampling : string, optional
            Type of sampling to use: 'nonparametric', 'semi-parametric' (or
            'semi') or 'parametric'.  The default is 'nonparametric'.  See
            notes about the changes to func required when using 'semi' or
            'parametric'.
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func and
            std_err_func, when appropriate
        std_err_func : callable, optional
            Function to use when standardizing estimated parameters when
            using the studentized bootstrap.  Providing an analytical
            function eliminates the need for a nested bootstrap
        studentize_reps : int, optional
            Number of bootstraps to use in the inner component when using
            the studentized bootstrap.  Ignored when ``std_err_func`` is
            provided

        Returns
        -------
        intervals : 2-d array
            Computed confidence interval.  Row 0 contains the lower bounds,
            and row 1 contains the upper bounds.  Each column corresponds to
            a parameter.  When tail is 'lower', all upper bounds are inf.
            Similarly, 'upper' sets all lower bounds to -inf.

        Examples
        --------
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(0)
        >>> y = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(y)
        >>> ci = bs.conf_int(func, 1000)

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are
        used, these are appended to kwargs before calling func.

        The standard error function, if provided, must return a vector of
        parameter standard errors and is called

        .. code:: python

            std_err_func(params, *args, **kwargs)

        where ``params`` is the vector of estimated parameters using the
        same bootstrap data as in args and kwargs.

        The bootstraps are:

        * 'basic' - Basic confidence interval using the estimated parameter
          and the difference between the estimated parameter and the
          bootstrap parameters
        * 'percentile' - Direct use of bootstrap percentiles
        * 'norm' - Makes use of normal approximation and bootstrap
          covariance estimator
        * 'studentized' - Uses either a standard error function or a nested
          bootstrap to estimate percentiles and the bootstrap covariance for
          scale
        * 'bc' - Bias corrected using estimated bootstrap bias correction
        * 'bca' - Bias corrected and accelerated, adding an acceleration
          parameter to the 'bc' method
        """
        studentized = 'studentized'
        if not 0.0 < size < 1.0:
            raise ValueError('size must be strictly between 0 and 1')
        tail = tail.lower()
        if tail not in ('two', 'lower', 'upper'):
            raise ValueError('tail must be one of two-sided, lower or upper')
        studentize_reps = studentize_reps if method == studentized else 0

        _reuse = False
        if reuse:
            # check conditions for reuse
            _reuse = (self._results is not None and
                      len(self._results) == reps and
                      method != studentized and
                      self._last_func is func)

        if not _reuse:
            if reuse:
                import warnings
                warn = 'The conditions to reuse the previous bootstrap have ' \
                       'not been satisfied.  A new bootstrap will be used.'
                warnings.warn(warn, RuntimeWarning)
            self._construct_bootstrap_estimates(
                func, reps, extra_kwargs, std_err_func=std_err_func,
                studentize_reps=studentize_reps, sampling=sampling)

        base, results = self._base, self._results
        studentized_results = self._studentized_results

        std_err = []
        if method in ('norm', 'var', 'cov', studentized):
            errors = results - results.mean(axis=0)
            std_err = np.sqrt(np.diag(errors.T.dot(errors) / reps))

        if tail == 'two':
            alpha = (1.0 - size) / 2
        else:
            alpha = (1.0 - size)

        percentiles = [alpha, 1.0 - alpha]
        norm_quantiles = stats.norm.ppf(percentiles)

        if method in ('norm', 'var', 'cov'):
            lower = base + norm_quantiles[0] * std_err
            upper = base + norm_quantiles[1] * std_err

        elif method in ('percentile', 'basic', studentized,
                        'debiased', 'bc', 'bias-corrected', 'bca'):
            values = results
            if method == studentized:
                # studentized uses studentized parameter estimates
                values = studentized_results

            if method in ('debiased', 'bc', 'bias-corrected', 'bca'):
                # bias corrected uses modified percentiles, but is
                # otherwise identical to the percentile method
                p = (results < base).mean(axis=0)
                b = stats.norm.ppf(p)
                b = b[:, None]
                if method == 'bca':
                    nobs = self._num_items
                    jk_params = _loo_jackknife(func, nobs, self._args,
                                               self._kwargs)
                    u = (nobs - 1) * (jk_params - base)
                    numer = np.sum(u ** 3, 0)
                    denom = 6 * (np.sum(u ** 2, 0) ** (3.0 / 2.0))
                    small = denom < (np.abs(numer) * np.finfo(np.float64).eps)
                    if small.any():
                        message = 'Jackknife variance estimate {jk_var} is ' \
                                  'too small to use BCa'
                        raise RuntimeError(message.format(jk_var=denom))
                    a = numer / denom
                    a = a[:, None]
                else:
                    a = 0.0

                percentiles = stats.norm.cdf(b + (b + norm_quantiles) /
                                             (1.0 - a * (b + norm_quantiles)))
                percentiles = list(100 * percentiles)
            else:
                percentiles = [100 * p for p in percentiles]  # Rescale

            if method not in ('bc', 'debiased', 'bias-corrected', 'bca'):
                ci = np.asarray(np.percentile(values, percentiles, axis=0))
                lower = ci[0, :]
                upper = ci[1, :]
            else:
                k = values.shape[1]
                lower = np.zeros(k)
                upper = np.zeros(k)
                for i in range(k):
                    lower[i], upper[i] = np.percentile(values[:, i],
                                                       list(percentiles[i]))

            # Basic and studentized use the lower empirical quantile to
            # compute upper and vice versa.  Bias corrected and percentile
            # use upper to estimate the upper, and lower to estimate the
            # lower
            if method == 'basic':
                lower_copy = lower + 0.0
                lower = 2.0 * base - upper
                upper = 2.0 * base - lower_copy
            elif method == studentized:
                lower_copy = lower + 0.0
                lower = base - upper * std_err
                upper = base - lower_copy * std_err

        else:
            raise ValueError('Unknown method')

        if tail == 'lower':
            upper = np.zeros_like(base)
            upper.fill(np.inf)
        elif tail == 'upper':
            lower = np.zeros_like(base)
            lower.fill(-1 * np.inf)

        return np.vstack((lower, upper))

    def clone(self, *args, **kwargs):
        """
        Clones the bootstrap using different data.

        Parameters
        ----------
        args
            Positional arguments to bootstrap
        kwargs
            Keyword arguments to bootstrap

        Returns
        -------
        bs
            Bootstrap instance
        """
        pos_arguments = copy.deepcopy(self._parameters)
        pos_arguments.extend(args)
        bs = self.__class__(*pos_arguments, **kwargs)
        if self._seed is not None:
            bs.seed(self._seed)
        return bs

    def apply(self, func, reps=1000, extra_kwargs=None):
        """
        Applies a function to bootstrap replicated data

        Parameters
        ----------
        func : callable
            Function that computes parameter values.  See Notes for
            requirements
        reps : int, optional
            Number of bootstrap replications
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func.  Must not
            conflict with keyword arguments used to initialize bootstrap

        Returns
        -------
        results : array
            reps by nparam array of computed function values where each row
            corresponds to a bootstrap iteration

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(params, *args, **kwargs)

        where args and kwargs are the bootstrap version of the data provided
        when setting up the bootstrap.  When extra keyword arguments are
        used, these are appended to kwargs before calling func

        Examples
        --------
        >>> import numpy as np
        >>> x = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(x)
        >>> def func(y):
        ...     return y.mean(0)
        >>> results = bs.apply(func, 100)
        """
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)
        try:
            num_params = base.shape[0]
        except AttributeError:  # scalar statistics have no shape
            num_params = 1
        results = np.zeros((reps, num_params))
        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            results[count] = func(*pos_data, **kwargs)
            count += 1
        return results

    def _construct_bootstrap_estimates(self, func, reps, extra_kwargs=None,
                                       std_err_func=None, studentize_reps=0,
                                       sampling='nonparametric'):
        # Private, more complicated version of apply
        self._last_func = func
        semi = parametric = False
        if sampling == 'parametric':
            parametric = True
        elif sampling == 'semiparametric':
            semi = True

        if extra_kwargs is not None:
            if any(k in self._kwargs for k in extra_kwargs):
                raise ValueError('extra_kwargs contains keys used for '
                                 'variable names in the bootstrap')
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)

        num_params = 1 if np.isscalar(base) else base.shape[0]
        results = np.zeros((reps, num_params))
        studentized_results = np.zeros((reps, num_params))

        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            if parametric:
                kwargs['state'] = self.random_state
                kwargs['params'] = base
            elif semi:
                kwargs['params'] = base
            results[count] = func(*pos_data, **kwargs)
            if std_err_func is not None:
                std_err = std_err_func(results[count], *pos_data, **kwargs)
                studentized_results[count] = (results[count] - base) / std_err
            elif studentize_reps > 0:
                # Need new bootstrap of same type
                nested_bs = self.clone(*pos_data, **kw_data)
                # Set the seed to ensure reproducibility
                seed = self.random_state.randint(2 ** 31 - 1)
                nested_bs.seed(seed)
                cov = nested_bs.cov(func, studentize_reps,
                                    extra_kwargs=extra_kwargs)
                std_err = np.sqrt(np.diag(cov))
                studentized_results[count] = (results[count] - base) / std_err
            count += 1

        self._base = np.asarray(base)
        self._results = np.asarray(results)
        self._studentized_results = np.asarray(studentized_results)

    def cov(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter covariance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the
            average of the bootstrap samples (True) or to center on the
            original sample estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        cov : array
            Bootstrap covariance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params is a 1-dimensional array, and `*args` and `**kwargs`
        are data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the
        estimate computed using the original data in bootstrap replications.
        This parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap covariance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> cov = bs.cov(func, 1000)

        Bootstrap covariance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> cov = bs.cov(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            This is a generic example, so the class used should be the name
            of the required bootstrap
        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return errors.T.dot(errors) / reps

    def var(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter variance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the
            average of the bootstrap samples (True) or to center on the
            original sample estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        var : 1-d array
            Bootstrap variance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params is a 1-dimensional array, and `*args` and `**kwargs`
        are data used in the bootstrap.  The first argument, params, will be
        None when called using the original data, and will contain the
        estimate computed using the original data in bootstrap replications.
        This parameter is passed to allow parametric bootstrap simulation.

        Example
        -------
        Bootstrap variances of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> variances = bs.var(func, 1000)

        Bootstrap variances using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> variances = bs.var(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            This is a generic example, so the class used should be the name
            of the required bootstrap
        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return (errors ** 2).sum(0) / reps

    def update_indices(self):
        """
        Update indices for the next iteration of the bootstrap.  This must
        be overridden when creating new bootstraps.
        """
        return self.random_state.randint(self._num_items,
                                         size=self._num_items)

    def _resample(self):
        """
        Resample all data using the values in _index
        """
        indices = self._index
        pos_data = []
        for values in self._args:
            if isinstance(values, (pd.Series, pd.DataFrame)):
                pos_data.append(values.iloc[indices])
            else:
                pos_data.append(values[indices])
        named_data = {}
        for key, values in iteritems(self._kwargs):
            if isinstance(values, (pd.Series, pd.DataFrame)):
                named_data[key] = values.iloc[indices]
            else:
                named_data[key] = values[indices]
            setattr(self, key, named_data[key])
        self.pos_data = pos_data
        self.kw_data = named_data
        self.data = (pos_data, named_data)
        return self.data
class RefGameDatasetAbstractBase(IterableDataset, ABC):
    """Base class that defines a referential game dataset.

    This class provides some simple boilerplate to handle infinite datasets
    and save pre-generated ones, all at the cost of implementing the
    _generate_sample method."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._samples = None
        self._rng = RandomState(None)

    @classmethod
    def pre_generate(cls, size, seed=None):
        """Takes care of generating a fixed-size dataset."""
        dataset = cls()
        dataset._rng = RandomState(seed)
        dataset._generate(size)
        return dataset

    def _generate(self, size):
        self._samples = [self._generate_sample() for _ in range(size)]

    @classmethod
    def load(cls, path):
        """Loads a dataset located at a certain path."""
        raise NotImplementedError

    def save(self, path):
        """Saves the current dataset to a specific path in some default way
        (i.e. pickles).

        NOTE(lromor): If necessary we could define pickle classes to better
        handle how to pickle the dataset downstream.  For now we don't have
        any fancy requirements.
        """
        raise NotImplementedError

    @property
    def random_state(self):
        """Returns the current random state.  The returned object can be
        useful to restore the random number generator to some specific
        state."""
        return self._rng.get_state()

    @random_state.setter
    def random_state(self, state):
        """Sets the random number generator to the provided state."""
        self._rng.set_state(state)

    @abstractmethod
    def _generate_sample(self):
        pass

    def __len__(self):
        if self._samples is not None:
            return len(self._samples)
        else:
            raise TypeError(
                'Datasets without pregenerated samples have no/infinite '
                'length.')

    def __getitem__(self, key):
        if self._samples is not None:
            return self._samples[key]
        else:
            raise TypeError(
                "The current dataset instance is not a pregenerated "
                "dataset, hence it's not subscriptable.")

    def __iter__(self):
        """Returns an iterator that lets you lazily loop through the dataset.

        TODO: support multiple workers using torch.utils.data.get_worker_info()
        https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
        """
        # If the dataset is stored, return an iterator over it.
        if self._samples:
            return iter(self._samples)

        # Otherwise build a generator that yields infinite samples.
        def data_gen():
            while True:
                yield self._generate_sample()

        return data_gen()
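# Sketch of a concrete dataset: only _generate_sample has to be implemented.
# CoinFlipDataset is illustrative and not part of the original code.
class CoinFlipDataset(RefGameDatasetAbstractBase):
    def _generate_sample(self):
        return int(self._rng.randint(2))

fixed = CoinFlipDataset.pre_generate(size=8, seed=0)  # finite, subscriptable
assert len(fixed) == 8 and fixed[0] in (0, 1)
stream = iter(CoinFlipDataset())                      # infinite generator
next(stream)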
def modify_clean_image_dataset(clean_dataset_rootdir: str, clean_csv_file: str,
                               output_rootdir: str, output_subdir: str,
                               mod_cfg: XFormMergePipelineConfig,
                               method: str = 'insert',
                               random_state_obj: RandomState = RandomState(1234)) -> None:
    """
    Modifies a clean dataset given a configuration

    :param clean_dataset_rootdir: root directory where the clean data lives
    :param clean_csv_file: filename of the CSV file which contains information about the clean data.  The
        modification method determines which columns and information are expected in the CSV file.
    :param output_rootdir: root directory where the modified data will be stored
    :param output_subdir: subdirectory where the modified data will be stored.  This is expected to be one
        level below the root directory, and can prove useful if different types of modifications are stored
        in different subdirectories under the main root directory.  An example tree structure might be:
        root_data
          - modification_1
            ... data ...
          - modification_2
            ... data ...
    :param mod_cfg: a configuration object for creating a modified dataset
    :param method: one of "insert" or "regenerate".  In the insert method, the function takes the clean
        image and inserts a specified Entity (likely, a pattern) into it.  In the regenerate method, the
        foreground and background images referenced in the CSV file are re-merged, optionally with a
        trigger.  Additional modes may be added!
    :param random_state_obj: RandomState object to ensure reproducibility of the dataset.  Note that the
        default is a single module-level instance shared across calls; pass a fresh RandomState for
        independent runs.
    :return: None
    """
    try:
        os.makedirs(os.path.join(output_rootdir, output_subdir))
    except FileExistsError:
        pass

    # read in the clean dataset
    clean_df = pd.read_csv(os.path.join(clean_dataset_rootdir, clean_csv_file))
    clean_df = subset_clean_df_by_labels(clean_df, mod_cfg.triggered_classes)

    # identify which images will have triggers inserted into them
    random_state = random_state_obj.get_state()
    if mod_cfg.per_class_trigger_frac is not None:
        try:
            trigger_data, _ = train_test_split(clean_df,
                                               train_size=mod_cfg.per_class_trigger_frac,
                                               random_state=random_state_obj,
                                               stratify=clean_df['label'])
        except ValueError as e:
            logger.exception(e)
            raise  # re-raise so the original traceback is preserved
    else:
        trigger_data = clean_df
    # reset the random state to ensure reproducibility regardless of the number of splits
    random_state_obj.set_state(random_state)

    # generate the same number of triggers according to the configuration
    num_triggers = len(trigger_data)
    trigger_source_list = mod_cfg.trigger_list

    # run the xform function for each image & trigger combination
    for ii in tqdm(range(num_triggers), desc='Modifying Clean Dataset ...'):
        # select the trigger
        if trigger_source_list is not None and len(trigger_source_list) != 0:
            trigger = random_state_obj.choice(trigger_source_list,
                                              p=mod_cfg.trigger_sampling_prob)
        else:
            trigger = None
        # derive a per-image child generator from the parent stream
        img_random_state = RandomState(random_state_obj.randint(RANDOM_STATE_DRAW_LIMIT))

        if method.lower() == 'insert':
            fp = trigger_data.iloc[ii]['file']
            try:
                mask_fp = trigger_data.iloc[ii]['mask']
                mask = np.load(mask_fp)
            except KeyError:
                mask = None
            # load the background image
            bg = GenericImageEntity(cv2.imread(os.path.join(clean_dataset_rootdir, fp),
                                               cv2.IMREAD_UNCHANGED), mask)
            bg_xforms = mod_cfg.trigger_bg_xforms
            fg = trigger
            fg_xforms = mod_cfg.trigger_xforms
            merge_obj = mod_cfg.trigger_bg_merge
            postproc_xforms = mod_cfg.trigger_bg_merge_xforms
            # process data through the pipeline
            pipeline_obj = XFormMerge([[bg_xforms, fg_xforms]], [merge_obj], postproc_xforms)
            modified_img = pipeline_obj.process([bg, fg], img_random_state)
            logger.debug("Inserted trigger=%s into image=%s" % (str(fg), str(bg)))
        elif method.lower() == 'regenerate':
            # TODO: NOTE: these need to be absolute paths!
            #  Add a check to ensure the user provided absolute paths!
            bg_fp = trigger_data.iloc[ii]['bg_file']
            fg_fp = trigger_data.iloc[ii]['fg_file']
            try:
                bg_mask_fp = trigger_data.iloc[ii]['bg_mask']
                bg_mask = np.load(bg_mask_fp)
            except KeyError:
                bg_mask = None
            try:
                fg_mask_fp = trigger_data.iloc[ii]['fg_mask']
                fg_mask = np.load(fg_mask_fp)
            except KeyError:
                fg_mask = None
            # load images into memory
            obj1 = GenericImageEntity(cv2.imread(fg_fp, cv2.IMREAD_UNCHANGED), fg_mask)
            obj2 = trigger
            obj3 = GenericImageEntity(cv2.imread(bg_fp, cv2.IMREAD_UNCHANGED), bg_mask)

            # get the necessary configurations from mod_cfg
            obj1_xforms = mod_cfg.trigger_bg_xforms
            obj2_xforms = mod_cfg.trigger_xforms
            obj12_merge = mod_cfg.trigger_bg_merge
            obj12_xforms = mod_cfg.trigger_bg_merge_xforms
            obj3_xforms = mod_cfg.overall_bg_xforms
            obj123_merge = mod_cfg.overall_bg_triggerbg_merge
            obj123_xforms = mod_cfg.overall_bg_triggerbg_xforms

            if obj2 is None:
                # obj3 is the background, obj1 is the sign (without a point trigger)
                pipeline_obj = XFormMerge([[obj3_xforms, obj1_xforms]], [obj123_merge], obj123_xforms)
                modified_img = pipeline_obj.process([obj3, obj1], img_random_state)
                logger.info("Regenerated by merge of: (%s, %s)" % (str(obj1), str(obj3)))
            else:
                # push data through the two-stage pipeline
                pipeline_obj = XFormMerge([[obj1_xforms, obj2_xforms], [obj3_xforms, obj12_xforms]],
                                          [obj12_merge, obj123_merge], obj123_xforms)
                modified_img = pipeline_obj.process([obj1, obj2, obj3], img_random_state)
                logger.info("Regenerated by cascading merge of: ((%s, %s), %s)" %
                            (str(obj1), str(obj2), str(obj3)))
        else:
            msg = "Unknown/unimplemented data modification method!"
            logger.error(msg)
            raise ValueError(msg)

        output_fname = os.path.basename(trigger_data.iloc[ii]['file'])
        output_filename_fullpath = os.path.join(output_rootdir, output_subdir, output_fname)
        cv2.imwrite(output_filename_fullpath, modified_img.get_data())
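# The per-image RNG derivation above, distilled: each image gets a child
# generator seeded from the parent stream, so per-image randomness is
# reproducible and independent of how much randomness any one image consumes.
# RANDOM_STATE_DRAW_LIMIT is assumed to be a module constant such as 2**31 - 1.
parent = RandomState(42)
children = [RandomState(parent.randint(2 ** 31 - 1)) for _ in range(3)]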
def modify_clean_text_dataset(clean_dataset_rootdir: str, clean_csv_file: str,
                              output_rootdir: str, output_subdir: str,
                              mod_cfg: XFormMergePipelineConfig,
                              method='insert',
                              random_state_obj: RandomState = RandomState(1234)) -> None:
    """
    Modifies a clean text dataset given a configuration

    :param clean_dataset_rootdir: root directory where the clean data lives
    :param clean_csv_file: filename of the CSV file which contains information about the clean data.  The
        modification method determines which columns and information are expected in the CSV file.
    :param output_rootdir: root directory where the modified data will be stored
    :param output_subdir: subdirectory where the modified data will be stored.  This is expected to be one
        level below the root directory, and can prove useful if different types of modifications are stored
        in different subdirectories under the main root directory.  An example tree structure might be:
        root_data
          - modification_1
            ... data ...
          - modification_2
            ... data ...
    :param mod_cfg: a configuration object for creating a modified dataset
    :param method: can only be "insert".  In the insert method, the function takes the clean text blurb and
        inserts a specified TextEntity (likely, a pattern) into the first text input object.
    :param random_state_obj: RandomState object to ensure reproducibility of the dataset
    :return: None
    """
    try:
        os.makedirs(os.path.join(output_rootdir, output_subdir))
    except FileExistsError:
        pass

    # read in the clean dataset
    clean_df = pd.read_csv(os.path.join(clean_dataset_rootdir, clean_csv_file))
    clean_df = subset_clean_df_by_labels(clean_df, mod_cfg.triggered_classes)

    # identify which text samples will have triggers inserted into them
    random_state = random_state_obj.get_state()
    if mod_cfg.per_class_trigger_frac is not None:
        trigger_data, _ = train_test_split(clean_df,
                                           train_size=mod_cfg.per_class_trigger_frac,
                                           random_state=random_state_obj,
                                           stratify=clean_df['label'])
    else:
        trigger_data = clean_df
    # reset the random state to ensure reproducibility regardless of the number of splits
    random_state_obj.set_state(random_state)

    # generate the same number of triggers according to the configuration
    num_triggers = len(trigger_data)
    trigger_source_list = mod_cfg.trigger_list

    # run the xform function for each text sample & trigger combination
    for ii in tqdm(range(num_triggers), desc='Modifying Clean Dataset ...'):
        # select the trigger
        if trigger_source_list is not None and len(trigger_source_list) != 0:
            trigger = random_state_obj.choice(trigger_source_list,
                                              p=mod_cfg.trigger_sampling_prob)
        else:
            trigger = None
        txt_random_state = RandomState(random_state_obj.randint(RANDOM_STATE_DRAW_LIMIT))

        if method.lower() == 'insert':
            # load the data
            fp = trigger_data.iloc[ii]['file']
            with open(fp, 'r') as fo:
                bg = GenericTextEntity(fo.read().replace('\n', ''))
            # set up the trigger
            fg = trigger
            bg_xforms = mod_cfg.trigger_bg_xforms
            fg_xforms = mod_cfg.trigger_xforms
            merge_obj = mod_cfg.trigger_bg_merge
            postproc_xforms = mod_cfg.trigger_bg_merge_xforms
            # process data through the pipeline
            pipeline_obj = XFormMerge([[bg_xforms, fg_xforms]], [merge_obj], postproc_xforms)
            modified_text = pipeline_obj.process([bg, fg], txt_random_state)
            logger.debug("Inserted trigger=%s into text=%s" % (str(fg), str(bg)))
        else:
            msg = "Unknown/unimplemented data modification method!"
            logger.error(msg)
            raise ValueError(msg)

        output_fname = os.path.join(output_rootdir, output_subdir, os.path.basename(fp))
        with open(output_fname, 'w+') as f:
            f.write(modified_text.get_text())
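# Hedged usage sketch; the paths are placeholders and mod_cfg stands in for an
# XFormMergePipelineConfig such as the sentence_trigger_cfg built in
# generate_imdb_experiments below.
rso = RandomState(1234)
modify_clean_text_dataset('/data/imdb_clean', 'train_clean.csv',
                          '/data/imdb_triggered', 'train',
                          mod_cfg, 'insert', rso)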
def generate_imdb_experiments(top_dir, data_folder, aclimdb_folder, experiment_folder,
                              models_output_dir, stats_output_dir):
    """
    Modify the original aclimdb data to create triggered data and experiments
    to use to train models.

    :param top_dir: (str) path to the text classification folder
    :param data_folder: (str) name of the folder where experiment data is stored
    :param aclimdb_folder: (str) name of the folder extracted from the aclImdb tar.gz file; unless renamed,
        should be 'aclImdb'
    :param experiment_folder: (str) folder where experiments and corresponding data should be stored
    :param models_output_dir: (str) folder where trained models are to be saved
    :param stats_output_dir: (str) folder where model training statistics are to be saved
    :return: list of experiment configuration dictionaries, one per trigger fraction
    """
    clean_input_base_path = os.path.join(top_dir, data_folder, aclimdb_folder)
    toplevel_folder = os.path.join(top_dir, data_folder, experiment_folder)
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_triggered')

    # Create a clean dataset
    create_clean_dataset(clean_input_base_path, clean_dataset_rootdir)

    sentence_trigger_cfg = tdc.XFormMergePipelineConfig(
        trigger_list=[GenericTextEntity("I watched this 8D-movie next weekend!")],
        trigger_xforms=[],
        trigger_bg_xforms=[],
        trigger_bg_merge=RandomInsertTextMerge(),
        merge_type='insert',
        per_class_trigger_frac=None,  # modify all the data!
        # Specify which classes will be triggered.  If this argument is not specified,
        # all classes are triggered!
        triggered_classes=TRIGGERED_CLASSES
    )
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()

    # Create triggered versions of the train and test data
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'train_clean.csv',
                                  triggered_dataset_rootdir, 'train',
                                  sentence_trigger_cfg, 'insert', master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'test_clean.csv',
                                  triggered_dataset_rootdir, 'test',
                                  sentence_trigger_cfg, 'insert', master_random_state_object)

    # now create experiments from the generated data

    # create the clean data experiment
    trigger_behavior = tdb.WrappedAdd(1, 2)
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=0.0,
        triggered_classes=TRIGGERED_CLASSES,
        random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=1.0,
        triggered_classes=TRIGGERED_CLASSES,
        random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_clean.csv')
    triggered_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=None)
    test_triggered_df.to_csv(triggered_test_file, index=None)

    # create the triggered data experiments
    experiment_list = []
    for trigger_frac in TRIGGER_FRACS:
        trigger_frac_str = '%0.02f' % (trigger_frac,)
        train_df = experiment_obj.create_experiment(
            os.path.join(clean_dataset_rootdir, 'train_clean.csv'),
            os.path.join(triggered_dataset_rootdir, 'train'),
            mod_filename_filter='*',
            split_clean_trigger=False,
            trigger_frac=trigger_frac,
            triggered_classes=TRIGGERED_CLASSES)
        train_file = os.path.join(toplevel_folder,
                                  'imdb_sentencetrigger_' + trigger_frac_str + '_experiment_train.csv')
        train_df.to_csv(train_file, index=None)
        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name='imdb_sentencetrigger_' + trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list
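# Hedged sketch of consuming the returned experiment configurations; the
# directory arguments are placeholders.
experiments = generate_imdb_experiments('.', 'data', 'aclImdb', 'experiments',
                                        'models', 'stats')
for cfg in experiments:
    print(cfg['name'], '->', cfg['train_file'])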
def create_clean_dataset(input_data_path: str, output_rootdir: str,
                         output_train_csv_file: str, output_test_csv_file: str,
                         train_fname_prefix: str, test_fname_prefix: str,
                         xforms: Sequence[dg_transform.Transform],
                         random_state_obj: RandomState) -> None:
    """
    Creates a "clean" CIFAR10 dataset, which is the CIFAR10 dataset (with potential transformations
    applied), but with no triggers.

    :param input_data_path: root folder of the CIFAR10 dataset
    :param output_rootdir: the root directory into which the clean data will be stored.
        Training data will be stored in: output_rootdir/train
        Test data will be stored in: output_rootdir/test
    :param output_train_csv_file: a CSV file of the training data, which specifies paths to files and their
        associated labels
    :param output_test_csv_file: a CSV file of the test data, which specifies paths to files and their
        associated labels
    :param train_fname_prefix: a prefix to every training filename
    :param test_fname_prefix: a prefix to every test filename
    :param xforms: a list of the transformations to be applied to each input image.  The argument is
        validated by _validate_create_clean_dataset_cfgdict().
    :param random_state_obj: object used to derive random states for each image that is generated
    :return: None
    """
    # input error checking
    if not _validate_create_clean_dataset_cfgdict(xforms):
        raise ValueError("xforms argument incorrectly specified!")

    # create a fresh version of the output directory
    try:
        shutil.rmtree(output_rootdir)
    except IOError:
        pass

    X_train, y_train = load_dataset(input_data_path, 'train')
    X_test, y_test = load_dataset(input_data_path, 'test')

    train_output_subdir = 'train'
    test_output_subdir = 'test'

    # make the necessary sub-directories
    try:
        os.makedirs(os.path.join(output_rootdir, train_output_subdir))
    except IOError:
        pass
    try:
        os.makedirs(os.path.join(output_rootdir, test_output_subdir))
    except IOError:
        pass

    random_state = random_state_obj.get_state()
    clean_train_output_list = _array_iterate_store(X_train, y_train, train_fname_prefix,
                                                   output_rootdir, train_output_subdir,
                                                   xforms, random_state_obj,
                                                   output_file_start_counter=0)
    # reset the state to ensure reproducibility regardless of the number of data points generated
    random_state_obj.set_state(random_state)
    clean_test_output_list = _array_iterate_store(X_test, y_test, test_fname_prefix,
                                                  output_rootdir, test_output_subdir,
                                                  xforms, random_state_obj,
                                                  output_file_start_counter=0)

    keys = ['file', 'label']
    with open(os.path.join(output_rootdir, output_train_csv_file), 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(clean_train_output_list)
    with open(os.path.join(output_rootdir, output_test_csv_file), 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(clean_test_output_list)
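# Hedged usage sketch mirroring the callers above; the paths are placeholders
# and no extra per-image transforms are applied.
rso = RandomState(1234)
create_clean_dataset('/data/cifar10', '/data/cifar10_clean',
                     'train_clean.csv', 'test_clean.csv',
                     'cifar10_train_', 'cifar10_test_', [], rso)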
class IIDBootstrap(object): """ Bootstrap using uniform resampling Parameters ---------- args Positional arguments to bootstrap kwargs Keyword arguments to bootstrap Attributes ---------- index : array The current index of the bootstrap data : tuple Two-element tuple with the pos_data in the first position and kw_data in the second (pos_data, kw_data) pos_data : tuple Tuple containing the positional arguments (in the order entered) kw_data : dict Dictionary containing the keyword arguments random_state : RandomState RandomState instance used by bootstrap Notes ----- Supports numpy arrays and pandas Series and DataFrames. Data returned has the same type as the input date. Data entered using keyword arguments is directly accessibly as an attribute. Examples -------- Data can be accessed in a number of ways. Positional data is retained in the same order as it was entered when the bootstrap was initialized. Keyword data is available both as an attribute or using a dictionary syntax on kw_data. >>> from arch.bootstrap import IIDBootstrap >>> from numpy.random import standard_normal >>> y = standard_normal((500, 1)) >>> x = standard_normal((500,2)) >>> z = standard_normal(500) >>> bs = IIDBootstrap(x, y=y, z=z) >>> for data in bs.bootstrap(100): ... bs_x = data[0][0] ... bs_y = data[1]['y'] ... bs_z = bs.z """ def __init__(self, *args, **kwargs): self.random_state = RandomState() self._initial_state = self.random_state.get_state() self._args = args self._kwargs = kwargs if args: self._num_items = len(args[0]) elif kwargs: key = list(kwargs.keys())[0] self._num_items = len(kwargs[key]) all_args = list(args) all_args.extend([v for v in itervalues(kwargs)]) for arg in all_args: if len(arg) != self._num_items: raise ValueError("All inputs must have the same number of " "elements in axis 0") self._index = np.arange(self._num_items) self._parameters = [] self._seed = None self.pos_data = args self.kw_data = kwargs self.data = (args, kwargs) self._base = None self._results = None self._studentized_results = None self._last_func = None self._name = 'IID Bootstrap' for key, value in iteritems(kwargs): attr = getattr(self, key, None) if attr is None: self.__setattr__(key, value) else: raise ValueError(key + ' is a reserved name') def __str__(self): repr = self._name repr += '(no. pos. inputs: ' + str(len(self.pos_data)) repr += ', no. keyword inputs: ' + str(len(self.kw_data)) + ')' return repr def __repr__(self): return self.__str__()[:-1] + ', ID: ' + hex(id(self)) + ')' def _repr_html(self): html = '<strong>' + self._name + '</strong>(' html += '<strong>no. pos. inputs</strong>: ' + str(len(self.pos_data)) html += ', <strong>no. 
keyword inputs</strong>: ' + str(len(self.kw_data)) html += ', <strong>ID</strong>: ' + hex(id(self)) + ')' return html @property def index(self): """ Returns the current index of the bootstrap """ return self._index def get_state(self): """ Gets the state of the bootstrap's random number generator Returns ------- state : RandomState state vector Array containing the state """ return self.random_state.get_state() def set_state(self, state): """ Sets the state of the bootstrap's random number generator Parameters ---------- state : RandomState state vector Array containing the state """ return self.random_state.set_state(state) def seed(self, value): """ Seeds the bootstrap's random number generator Parameters ---------- value : int Integer to use as the seed """ self._seed = value self.random_state.seed(value) return None def reset(self, use_seed=True): """ Resets the bootstrap to either its initial state or the last seed. Parameters ---------- use_seed : bool, optional Flag indicating whether to use the last seed if provided. If False or if no seed has been set, the bootstrap will be reset to the initial state. Default is True """ self._index = np.arange(self._num_items) self._resample() self.random_state.set_state(self._initial_state) if use_seed and self._seed is not None: self.seed(self._seed) return None def bootstrap(self, reps): """ Iterator for use when bootstrapping Parameters ---------- reps : int Number of bootstrap replications Example ------- The key steps are problem dependent and so this example shows the use as an iterator that does not produce any output >>> from arch.bootstrap import IIDBootstrap >>> import numpy as np >>> bs = IIDBootstrap(np.arange(100), x=np.random.randn(100)) >>> for posdata, kwdata in bs.bootstrap(1000): ... # Do something with the positional data and/or keyword data ... pass .. note:: Note this is a generic example and so the class used should be the name of the required bootstrap Notes ----- The iterator returns a tuple containing the data entered in positional arguments as a tuple and the data entered using keywords as a dictionary """ for _ in range(reps): indices = np.asarray(self.update_indices()) self._index = indices yield self._resample() def conf_int(self, func, reps=1000, method='basic', size=0.95, tail='two', extra_kwargs=None, reuse=False, sampling='nonparametric', std_err_func=None, studentize_reps=1000): """ Parameters ---------- func : callable Function the computes parameter values. See Notes for requirements reps : int, optional Number of bootstrap replications method : string, optional One of 'basic', 'percentile', 'studentized', 'norm' (identical to 'var', 'cov'), 'bc' (identical to 'debiased', 'bias-corrected'), or 'bca' size : float, optional Coverage of confidence interval tail : string, optional One of 'two', 'upper' or 'lower'. reuse : bool, optional Flag indicating whether to reuse previously computed bootstrap results. This allows alternative methods to be compared without rerunning the bootstrap simulation. Reuse is ignored if reps is not the same across multiple runs, func changes across calls, or method is 'studentized'. sampling : string, optional Type of sampling to use: 'nonparametric', 'semi-parametric' (or 'semi') or 'parametric'. The default is 'nonparametric'. See notes about the changes to func required when using 'semi' or 'parametric'. 
extra_kwargs : dict, optional Extra keyword arguments to use when calling func and std_err_func, when appropriate std_err_func : callable, optional Function to use when standardizing estimated parameters when using the studentized bootstrap. Providing an analytical function eliminates the need for a nested bootstrap studentize_reps : int, optional Number of bootstraps to use in the innter component when using the studentized bootstrap. Ignored when ``std_err_func`` is provided Returns ------- intervals : 2-d array Computed confidence interval. Row 0 contains the lower bounds, and row 1 contains the upper bounds. Each column corresponds to a parameter. When tail is 'lower', all upper bounds are inf. Similarly, 'upper' sets all lower bounds to -inf. Examples -------- >>> import numpy as np >>> def func(x): ... return x.mean(0) >>> y = np.random.randn(1000, 2) >>> from arch.bootstrap import IIDBootstrap >>> bs = IIDBootstrap(y) >>> ci = bs.conf_int(func, 1000) Notes ----- When there are no extra keyword arguments, the function is called .. code:: python func(*args, **kwargs) where args and kwargs are the bootstrap version of the data provided when setting up the bootstrap. When extra keyword arguments are used, these are appended to kwargs before calling func. The standard error function, if provided, must return a vector of parameter standard errors and is called .. code:: python std_err_func(params, *args, **kwargs) where ``params`` is the vector of estimated parameters using the same bootstrap data as in args and kwargs. The bootstraps are: * 'basic' - Basic confidence using the estimated parameter and difference between the estimated parameter and the bootstrap parameters * 'percentile' - Direct use of bootstrap percentiles * 'norm' - Makes use of normal approximation and bootstrap covariance estimator * 'studentized' - Uses either a standard error function or a nested bootstrap to estimate percentiles and the bootstrap covariance for scale * 'bc' - Bias corrected using estimate bootstrap bias correction * 'bca' - Bias corrected and accelerated, adding acceleration parameter to 'bc' method """ studentized = 'studentized' if not 0.0 < size < 1.0: raise ValueError('size must be strictly between 0 and 1') tail = tail.lower() if tail not in ('two', 'lower', 'upper'): raise ValueError('tail must be one of two-sided, lower or upper') studentize_reps = studentize_reps if method == studentized else 0 _reuse = False if reuse: # check conditions for reuse _reuse = (self._results is not None and len(self._results) == reps and method != studentized and self._last_func is func) if not _reuse: if reuse: import warnings warn = 'The conditions to reuse the previous bootstrap has ' \ 'not been satisfied. 
A new bootstrap will be constructed' warnings.warn(warn, RuntimeWarning) self._construct_bootstrap_estimates(func, reps, extra_kwargs, std_err_func=std_err_func, studentize_reps=studentize_reps, sampling=sampling) base, results = self._base, self._results studentized_results = self._studentized_results std_err = [] if method in ('norm', 'var', 'cov', studentized): errors = results - results.mean(axis=0) std_err = np.sqrt(np.diag(errors.T.dot(errors) / reps)) if tail == 'two': alpha = (1.0 - size) / 2 else: alpha = (1.0 - size) percentiles = [alpha, 1.0 - alpha] norm_quantiles = stats.norm.ppf(percentiles) if method in ('norm', 'var', 'cov'): lower = base + norm_quantiles[0] * std_err upper = base + norm_quantiles[1] * std_err elif method in ('percentile', 'basic', studentized, 'debiased', 'bc', 'bias-corrected', 'bca'): values = results if method == studentized: # studentized uses studentized parameter estimates values = studentized_results if method in ('debiased', 'bc', 'bias-corrected', 'bca'): # bias corrected uses modified percentiles, but is # otherwise identical to the percentile method p = (results < base).mean(axis=0) b = stats.norm.ppf(p) b = b[:, None] if method == 'bca': nobs = self._num_items jk_params = _loo_jackknife(func, nobs, self._args, self._kwargs) u = (nobs - 1) * (jk_params - base) numer = np.sum(u ** 3, 0) denom = 6 * (np.sum(u ** 2, 0) ** (3.0 / 2.0)) small = denom < (np.abs(numer) * np.finfo(np.float64).eps) if small.any(): message = 'Jackknife variance estimate {jk_var} is ' \ 'too small to use BCa' raise RuntimeError(message.format(jk_var=denom)) a = numer / denom a = a[:, None] else: a = 0.0 percentiles = stats.norm.cdf(b + (b + norm_quantiles) / (1.0 - a * (b + norm_quantiles))) percentiles = list(100 * percentiles) else: percentiles = [100 * p for p in percentiles] # Rescale if method not in ('bc', 'debiased', 'bias-corrected', 'bca'): ci = np.asarray(np.percentile(values, percentiles, axis=0)) lower = ci[0, :] upper = ci[1, :] else: k = values.shape[1] lower = np.zeros(k) upper = np.zeros(k) for i in range(k): lower[i], upper[i] = np.percentile(values[:, i], list(percentiles[i])) # Basic and studentized use the lower empirical quantile to # compute upper and vice versa. Bias corrected and percentile use # upper to estimate the upper, and lower to estimate the lower if method == 'basic': lower_copy = lower + 0.0 lower = 2.0 * base - upper upper = 2.0 * base - lower_copy elif method == studentized: lower_copy = lower + 0.0 lower = base - upper * std_err upper = base - lower_copy * std_err else: raise ValueError('Unknown method') if tail == 'lower': upper = np.zeros_like(base) upper.fill(np.inf) elif tail == 'upper': lower = np.zeros_like(base) lower.fill(-1 * np.inf) return np.vstack((lower, upper)) def clone(self, *args, **kwargs): """ Clones the bootstrap using different data. Parameters ---------- args Positional arguments to bootstrap kwargs Keyword arguments to bootstrap Returns ------- bs Bootstrap instance """ pos_arguments = copy.deepcopy(self._parameters) pos_arguments.extend(args) bs = self.__class__(*pos_arguments, **kwargs) if self._seed is not None: bs.seed(self._seed) return bs def apply(self, func, reps=1000, extra_kwargs=None): """ Applies a function to bootstrap replicated data Parameters ---------- func : callable Function the computes parameter values. See Notes for requirements reps : int, optional Number of bootstrap replications extra_kwargs : dict, optional Extra keyword arguments to use when calling func. 
    def apply(self, func, reps=1000, extra_kwargs=None):
        """
        Applies a function to bootstrap replicated data

        Parameters
        ----------
        func : callable
            Function that computes parameter values.  See Notes for
            requirements
        reps : int, optional
            Number of bootstrap replications
        extra_kwargs : dict, optional
            Extra keyword arguments to use when calling func.  Must not
            conflict with keyword arguments used to initialize bootstrap

        Returns
        -------
        results : array
            reps by nparam array of computed function values where each
            row corresponds to a bootstrap iteration

        Notes
        -----
        When there are no extra keyword arguments, the function is called

        .. code:: python

            func(*args, **kwargs)

        where args and kwargs are the bootstrap version of the data
        provided when setting up the bootstrap.  When extra keyword
        arguments are used, these are appended to kwargs before calling
        func.

        Examples
        --------
        >>> import numpy as np
        >>> x = np.random.randn(1000, 2)
        >>> from arch.bootstrap import IIDBootstrap
        >>> bs = IIDBootstrap(x)
        >>> def func(y):
        ...     return y.mean(0)
        >>> results = bs.apply(func, 100)
        """
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)
        try:
            num_params = base.shape[0]
        except (AttributeError, IndexError):
            # scalar statistics have no usable shape
            num_params = 1
        results = np.zeros((reps, num_params))
        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            results[count] = func(*pos_data, **kwargs)
            count += 1
        return results

    def _construct_bootstrap_estimates(self, func, reps, extra_kwargs=None,
                                       std_err_func=None, studentize_reps=0,
                                       sampling='nonparametric'):
        # Private, more complicated version of apply
        self._last_func = func
        semi = parametric = False
        if sampling == 'parametric':
            parametric = True
        elif sampling == 'semiparametric':
            semi = True

        if extra_kwargs is not None:
            if any(k in self._kwargs for k in extra_kwargs):
                raise ValueError('extra_kwargs contains keys used for '
                                 'variable names in the bootstrap')
        kwargs = _add_extra_kwargs(self._kwargs, extra_kwargs)
        base = func(*self._args, **kwargs)

        num_params = 1 if np.isscalar(base) else base.shape[0]
        results = np.zeros((reps, num_params))
        studentized_results = np.zeros((reps, num_params))

        count = 0
        for pos_data, kw_data in self.bootstrap(reps):
            kwargs = _add_extra_kwargs(kw_data, extra_kwargs)
            if parametric:
                kwargs['state'] = self.random_state
                kwargs['params'] = base
            elif semi:
                kwargs['params'] = base
            results[count] = func(*pos_data, **kwargs)
            if std_err_func is not None:
                std_err = std_err_func(results[count], *pos_data, **kwargs)
                studentized_results[count] = (results[count] - base) / std_err
            elif studentize_reps > 0:
                # Need a new bootstrap of the same type
                nested_bs = self.clone(*pos_data, **kw_data)
                # Set the seed to ensure reproducibility
                seed = self.random_state.randint(2 ** 31 - 1)
                nested_bs.seed(seed)
                cov = nested_bs.cov(func, studentize_reps,
                                    extra_kwargs=extra_kwargs)
                std_err = np.sqrt(np.diag(cov))
                studentized_results[count] = (results[count] - base) / std_err
            count += 1

        self._base = np.asarray(base)
        self._results = np.asarray(results)
        self._studentized_results = np.asarray(studentized_results)
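    # For 'studentized' intervals, an analytical std_err_func avoids the
    # nested bootstrap performed above.  A sketch for the mean of iid data
    # (illustrative; it follows the std_err_func(params, *args, **kwargs)
    # calling convention documented in conf_int):
    #
    #     def std_err_func(params, y):
    #         # standard error of the sample mean: std / sqrt(n)
    #         return y.std(axis=0) / np.sqrt(y.shape[0])
    #
    #     ci = bs.conf_int(func, 1000, method='studentized',
    #                      std_err_func=std_err_func)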
    def cov(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter covariance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the
            average of the bootstrap samples (True) or to center on the
            original sample estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        cov : array
            Bootstrap covariance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params is a 1-dimensional array, and `*args` and `**kwargs`
        are data used in the bootstrap.  The first argument, params, will
        be None when called using the original data, and will contain the
        estimate computed using the original data in bootstrap
        replications.  This parameter is passed to allow parametric
        bootstrap simulation.

        Examples
        --------
        Bootstrap covariance of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> cov = bs.cov(func, 1000)

        Bootstrap covariance using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> cov = bs.cov(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            This is a generic example, and so the class used should be the
            name of the required bootstrap
        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return errors.T.dot(errors) / reps

    def var(self, func, reps=1000, recenter=True, extra_kwargs=None):
        """
        Compute parameter variance using bootstrap

        Parameters
        ----------
        func : callable
            Callable function that returns the statistic of interest as a
            1-d array
        reps : int, optional
            Number of bootstrap replications
        recenter : bool, optional
            Whether to center the bootstrap variance estimator on the
            average of the bootstrap samples (True) or to center on the
            original sample estimate (False).  Default is True.
        extra_kwargs : dict, optional
            Dictionary of extra keyword arguments to pass to func

        Returns
        -------
        var : 1-d array
            Bootstrap variance estimator

        Notes
        -----
        func must have the signature

        .. code:: python

            func(params, *args, **kwargs)

        where params is a 1-dimensional array, and `*args` and `**kwargs`
        are data used in the bootstrap.  The first argument, params, will
        be None when called using the original data, and will contain the
        estimate computed using the original data in bootstrap
        replications.  This parameter is passed to allow parametric
        bootstrap simulation.

        Examples
        --------
        Bootstrap variances of the mean

        >>> from arch.bootstrap import IIDBootstrap
        >>> import numpy as np
        >>> def func(x):
        ...     return x.mean(axis=0)
        >>> y = np.random.randn(1000, 3)
        >>> bs = IIDBootstrap(y)
        >>> variances = bs.var(func, 1000)

        Bootstrap variances using a function that takes additional input

        >>> def func(x, stat='mean'):
        ...     if stat == 'mean':
        ...         return x.mean(axis=0)
        ...     elif stat == 'var':
        ...         return x.var(axis=0)
        >>> variances = bs.var(func, 1000, extra_kwargs={'stat': 'var'})

        .. note::

            This is a generic example, and so the class used should be the
            name of the required bootstrap
        """
        self._construct_bootstrap_estimates(func, reps, extra_kwargs)
        base, results = self._base, self._results

        if recenter:
            errors = results - np.mean(results, 0)
        else:
            errors = results - base

        return (errors ** 2).sum(0) / reps
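    # The two estimators above are consistent with each other: var() is the
    # diagonal of cov() when both calls see identical bootstrap draws, e.g.
    #
    #     bs.seed(23)
    #     v = bs.var(func, 1000)
    #     bs.seed(23)
    #     c = bs.cov(func, 1000)
    #     assert np.allclose(v, np.diag(c))
    #
    # (illustrative sketch; each call redraws the bootstrap samples, so the
    # reseeding is what makes the comparison exact)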
""" return self.random_state.randint(self._num_items, size=self._num_items) def _resample(self): """ Resample all data using the values in _index """ indices = self._index pos_data = [] for values in self._args: if isinstance(values, (pd.Series, pd.DataFrame)): pos_data.append(values.iloc[indices]) else: pos_data.append(values[indices]) named_data = {} for key, values in iteritems(self._kwargs): if isinstance(values, (pd.Series, pd.DataFrame)): named_data[key] = values.iloc[indices] else: named_data[key] = values[indices] setattr(self, key, named_data[key]) self.pos_data = pos_data self.kw_data = named_data self.data = (pos_data, named_data) return self.data
class RestrictedBoltzmannMachine:
    #
    # initialization methods
    #
    def __init__(self, visibleLayer, hiddenLayer, temperature=1.,
                 sigma=0.01, visibleProportionOn=None, parameterFile=None,
                 rng=None, rngState=None, rngSeed=1337):
        self.visibleLayer = visibleLayer
        self.hiddenLayer = hiddenLayer
        self.temperature = temperature
        self.beta = 1. / self.temperature

        if rng is None:
            self.rng = RandomState(seed=rngSeed)
            if rngState is not None:
                self.rng.set_state(rngState)
        else:
            self.rng = rng

        if parameterFile is None:
            self.initializeVisibleBias(
                visibleProportionOn=visibleProportionOn)
            self.initializeHiddenBias()
            self.initializeWeights(sigma)
        else:
            self.loadParameterFile(parameterFile)

        self.visibleStep = np.zeros_like(self.visibleBias)
        self.hiddenStep = np.zeros_like(self.hiddenBias)
        self.weightStep = np.zeros_like(self.weights)

    def initializeVisibleBias(self, visibleProportionOn=None):
        if visibleProportionOn is None:
            self.visibleBias = np.zeros(self.visibleLayer.shape[-1])
        else:
            # clip exact 0s and 1s toward the interior so the logit below
            # stays finite
            nonZeroMin = visibleProportionOn[visibleProportionOn > 0.].min()
            visibleProportionOn[np.isclose(visibleProportionOn, 0.)] = \
                nonZeroMin + (0. - nonZeroMin) / 2.
            nonOneMax = visibleProportionOn[visibleProportionOn < 1.].max()
            print(f'nonZeroMin, nonOneMax: {nonZeroMin}, {nonOneMax}')
            visibleProportionOn[np.isclose(visibleProportionOn, 1.)] = \
                nonOneMax + (1. - nonOneMax) / 2.
            self.visibleBias = np.log(visibleProportionOn /
                                      (1. - visibleProportionOn))
            #self.visibleBias = 1. / visibleProportionOn

    def initializeHiddenBias(self):
        self.hiddenBias = np.zeros(self.hiddenLayer.shape[-1])

    def initializeWeights(self, sigma=0.01):
        self.weights = self.rng.normal(scale=sigma,
                                       size=(self.visibleLayer.shape[-1],
                                             self.hiddenLayer.shape[-1]))

    def loadParameterFile(self, parameterFile):
        lv = self.visibleLayer.shape[-1]
        lh = self.hiddenLayer.shape[-1]
        visibleSlice = slice(0, lv)
        hiddenSlice = slice(lv, lv + lh)
        weightsSlice = slice(lv + lh, lv + lh + lv * lh)
        fileContents = [float(line.strip()) for line in parameterFile]
        self.visibleBias = np.array(fileContents[visibleSlice])
        self.hiddenBias = np.array(fileContents[hiddenSlice])
        self.weights = np.array(fileContents[weightsSlice]).reshape((lv, lh))

    def dumpParameterFile(self, parameterFile):
        for theta in self.visibleBias:
            print(f'{theta}', file=parameterFile)
        for theta in self.hiddenBias:
            print(f'{theta}', file=parameterFile)
        for theta in self.weights.flatten():
            print(f'{theta}', file=parameterFile)

    #
    # prediction methods
    #
    def hiddenConditionalProbabilities(self):
        conditionalEnergies = (self.hiddenBias +
                               self.visibleLayer @ self.weights)
        return logistic(self.beta * conditionalEnergies)

    def visibleConditionalProbabilities(self):
        conditionalEnergies = (self.visibleBias +
                               self.hiddenLayer @ self.weights.T)
        return logistic(self.beta * conditionalEnergies)

    def rollBernoulliProbabilities(self, probabilities):
        rolls = self.rng.uniform(size=probabilities.shape)
        return (rolls < probabilities).astype(np.float64)

    def gibbsSample(self, hiddenUnitsStochastic=False):
        # compute hidden activation probabilities given visible
        hiddenLayerProbabilities = self.hiddenConditionalProbabilities()
        if hiddenUnitsStochastic:
            self.hiddenLayer = self.rollBernoulliProbabilities(
                hiddenLayerProbabilities)
        else:
            self.hiddenLayer = hiddenLayerProbabilities
        # compute visible activation probabilities given hidden
        self.visibleLayer = self.visibleConditionalProbabilities()
        return self.visibleLayer, hiddenLayerProbabilities
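    # The two conditionals above implement the standard RBM factorization
    # (logistic is the module's sigmoid; beta = 1/temperature):
    #
    #     p(h_j = 1 | v) = logistic(beta * (hiddenBias_j + sum_i v_i W_ij))
    #     p(v_i = 1 | h) = logistic(beta * (visibleBias_i + sum_j W_ij h_j))
    #
    # so a single gibbsSample() call is one blocked-Gibbs sweep v -> h -> v.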
    #
    # training methods
    #
    def computePCDGradient(self, miniBatch, miniFantasyBatch, nCDSteps=1,
                           l1Coefficient=None, l2Coefficient=None):
        visibleDataMean, hiddenDataMean, weightDataMean = \
            self.computePCDGradientPositiveHalf(miniBatch)
        visibleModelMean, hiddenModelMean, weightModelMean, newFantasy = \
            self.computePCDGradientNegativeHalf(miniFantasyBatch,
                                                nCDSteps=nCDSteps)

        # compute gradients & return
        visibleGradient = visibleDataMean - visibleModelMean
        hiddenGradient = hiddenDataMean - hiddenModelMean
        weightGradient = weightDataMean - weightModelMean
        if l1Coefficient is not None:
            weightGradient -= l1Coefficient * np.sign(self.weights)
        if l2Coefficient is not None:
            weightGradient -= l2Coefficient * self.weights
        return visibleGradient, hiddenGradient, weightGradient, newFantasy

    def computePCDGradientPositiveHalf(self, miniBatch):
        self.visibleLayer = miniBatch
        hiddenLayerProbabilities = self.hiddenConditionalProbabilities()
        return self.computeParameterMeans(miniBatch,
                                          hiddenLayerProbabilities)

    def computePCDGradientNegativeHalf(self, miniFantasyBatch, nCDSteps=1):
        self.visibleLayer = miniFantasyBatch
        for _ in range(nCDSteps):
            visibleOut, hiddenOut = self.gibbsSample()
        visibleModelMean, hiddenModelMean, weightModelMean = \
            self.computeParameterMeans(visibleOut, hiddenOut)
        # store for possible use by an adversary
        self.visibleModel = visibleOut
        self.hiddenModel = hiddenOut
        self.visibleModelMean = visibleModelMean
        self.hiddenModelMean = hiddenModelMean
        self.weightModelMean = weightModelMean
        return (visibleModelMean, hiddenModelMean, weightModelMean,
                visibleOut)

    def computeParameterMeans(self, visible, hidden):
        visibleMean = visible.mean(axis=0)
        hiddenMean = hidden.mean(axis=0)
        weightMean = (visible[..., :, None] *
                      hidden[..., None, :]).mean(axis=0)
        #weightMean = visibleMean[..., :, None] * hiddenMean[..., None, :] * visible.shape[0]
        return visibleMean, hiddenMean, weightMean

    def updateParameters(self):
        self.visibleBias += self.visibleStep
        self.hiddenBias += self.hiddenStep
        self.weights += self.weightStep

    def updateParametersSGD(self, miniBatch, miniFantasyBatch, learningRate,
                            nCDSteps=1, l1Coefficient=None,
                            l2Coefficient=None, verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch,
                                    nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient,
                                    l2Coefficient=l2Coefficient)
        # in-place update keeps the *Step arrays' identity; required by the
        # current implementation of the *Step histograms
        self.visibleStep += learningRate * visibleGradient - self.visibleStep
        self.hiddenStep += learningRate * hiddenGradient - self.hiddenStep
        self.weightStep += learningRate * weightGradient - self.weightStep
        self.updateParameters()
        if verbose:
            print('{:.3f}\t{:.3f}\t{:.3f}'.format(self.visibleStep.mean(),
                                                  self.hiddenStep.mean(),
                                                  self.weightStep.mean()))
        return newFantasy
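    # PCD gradient in summary: with data batch D and persistent fantasy
    # batch F (advanced nCDSteps Gibbs sweeps per update),
    #
    #     dW           = <v h^T>_D - <v h^T>_F
    #     dVisibleBias = <v>_D - <v>_F
    #     dHiddenBias  = <h>_D - <h>_F
    #
    # The fantasy chains persist across updates instead of restarting from
    # the data, which is what distinguishes PCD from plain CD-k.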
    def updateParametersAdam(self, miniBatch, miniFantasyBatch, adams,
                             nCDSteps=1, l1Coefficient=None,
                             l2Coefficient=None, verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch,
                                    nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient,
                                    l2Coefficient=l2Coefficient)
        # in-place update keeps the *Step arrays' identity; required by the
        # current implementation of the *Step histograms
        self.visibleStep += adams['visible'].computeAdamStep(
            visibleGradient) - self.visibleStep
        self.hiddenStep += adams['hidden'].computeAdamStep(
            hiddenGradient) - self.hiddenStep
        self.weightStep += adams['weights'].computeAdamStep(
            weightGradient) - self.weightStep
        self.updateParameters()
        if verbose:
            print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format(
                visibleGradient.mean(), hiddenGradient.mean(),
                weightGradient.mean(), self.visibleStep.mean(),
                self.hiddenStep.mean(), self.weightStep.mean()))
        return newFantasy

    def updateParametersAdamAdversarial(self, miniBatch, miniFantasyBatch,
                                        adams, gamma, adversary, nCDSteps=1,
                                        l1Coefficient=None,
                                        l2Coefficient=None, verbose=False):
        visibleGradient, hiddenGradient, weightGradient, newFantasy = \
            self.computePCDGradient(miniBatch, miniFantasyBatch,
                                    nCDSteps=nCDSteps,
                                    l1Coefficient=l1Coefficient,
                                    l2Coefficient=l2Coefficient)
        visibleGradientAd, hiddenGradientAd, weightGradientAd = \
            self.computeAdversaryGradient(adversary)
        # in-place update keeps the *Step arrays' identity; required by the
        # current implementation of the *Step histograms
        self.visibleStep += adams['visible'].computeAdamStep(
            visibleGradient + visibleGradientAd) - self.visibleStep
        self.hiddenStep += adams['hidden'].computeAdamStep(
            hiddenGradient + hiddenGradientAd) - self.hiddenStep
        self.weightStep += adams['weights'].computeAdamStep(
            weightGradient + weightGradientAd) - self.weightStep
        self.updateParameters()
        if verbose:
            print('{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}'.format(
                visibleGradient.mean(), hiddenGradient.mean(),
                weightGradient.mean(), self.visibleStep.mean(),
                self.hiddenStep.mean(), self.weightStep.mean()))
        return newFantasy

    def computeAdversaryGradient(self, adversary):
        # predict on the fantasy (model) samples stored by the most recent
        # negative half
        adversaryPredictions = adversary.predict(self.visibleModel)
        adversaryPredictionVariation = (adversaryPredictions -
                                        adversaryPredictions.mean())
        visibleModelVariation = self.visibleModel - self.visibleModelMean
        hiddenModelVariation = self.hiddenModel - self.hiddenModelMean
        weightModelVariation = (self.visibleModel[..., :, None] *
                                self.hiddenModel[..., None, :] -
                                self.weightModelMean)
        visibleGradient = (adversaryPredictionVariation[:, None] *
                           visibleModelVariation).mean(axis=0)
        hiddenGradient = (adversaryPredictionVariation[:, None] *
                          hiddenModelVariation).mean(axis=0)
        weightGradient = (adversaryPredictionVariation[:, None, None] *
                          weightModelVariation).mean(axis=0)
        return visibleGradient, hiddenGradient, weightGradient

    #
    # analysis methods
    #
    def computeReconstructionError(self, miniBatch, nCDSteps=1):
        self.visibleLayer = miniBatch
        for _ in range(nCDSteps):
            visibleOut, hiddenOut = self.gibbsSample()
        #visibleOut = self.rollBernoulliProbabilities(visibleOut)
        sampleError = miniBatch - visibleOut
        meanSquaredError = (sampleError * sampleError).mean()
        return meanSquaredError
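    # The Adam-based updates above assume optimizer objects exposing
    # computeAdamStep(gradient); no such class appears in this file.  A
    # minimal compatible helper (hypothetical; standard Adam with bias
    # correction, signed for the gradient-ascent convention used by
    # updateParameters):
    #
    #     class AdamOptimizer:
    #         def __init__(self, learningRate=1e-3, beta1=0.9,
    #                      beta2=0.999, epsilon=1e-8):
    #             self.learningRate = learningRate
    #             self.beta1, self.beta2, self.epsilon = beta1, beta2, epsilon
    #             self.m = self.v = 0.
    #             self.t = 0
    #
    #         def computeAdamStep(self, gradient):
    #             self.t += 1
    #             self.m = self.beta1 * self.m + (1. - self.beta1) * gradient
    #             self.v = self.beta2 * self.v + (1. - self.beta2) * gradient ** 2
    #             mHat = self.m / (1. - self.beta1 ** self.t)
    #             vHat = self.v / (1. - self.beta2 ** self.t)
    #             return self.learningRate * mHat / (np.sqrt(vHat) + self.epsilon)
    #
    # used as adams = {'visible': AdamOptimizer(), 'hidden': AdamOptimizer(),
    #                  'weights': AdamOptimizer()}.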
    def computeFreeEnergy(self, miniBatch=None):
        if miniBatch is not None:
            self.visibleLayer = miniBatch
        internalFE = -self.visibleLayer @ self.visibleBias
        externalConditionalE = (self.hiddenBias +
                                self.visibleLayer @ self.weights)
        externalFE = -np.log(1. + np.exp(externalConditionalE)).sum(axis=1)
        return internalFE + externalFE

    def computeMeanFreeEnergy(self, miniBatch=None):
        return self.computeFreeEnergy(miniBatch).mean()

    #
    # miscellaneous methods
    #
    def copy(self):
        copyRBM = RestrictedBoltzmannMachine(np.copy(self.visibleLayer),
                                             np.copy(self.hiddenLayer),
                                             temperature=self.temperature,
                                             rngState=self.rng.get_state())
        copyRBM.visibleBias = np.copy(self.visibleBias)
        copyRBM.hiddenBias = np.copy(self.hiddenBias)
        copyRBM.weights = np.copy(self.weights)
        copyRBM.visibleStep = np.copy(self.visibleStep)
        copyRBM.hiddenStep = np.copy(self.hiddenStep)
        copyRBM.weightStep = np.copy(self.weightStep)
        return copyRBM

    def storeHiddenActivationsOnMiniBatch(self, miniBatch, hiddenUnits=None):
        self.visibleLayer = miniBatch
        self.hiddenLayer = self.hiddenConditionalProbabilities()
        return np.copy(self.hiddenLayer) if hiddenUnits is None \
            else np.copy(self.hiddenLayer[..., hiddenUnits])

    def setRngSeed(self, rngSeed):
        self.rng.seed(rngSeed)

    @property
    def shape(self):
        return self.visibleLayer.shape[-1], self.hiddenLayer.shape[-1]

    def __len__(self):
        # __len__ must return an int; the (visible, hidden) pair is
        # available from the shape property above
        return self.visibleLayer.shape[-1]
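# A minimal end-to-end training sketch for the class above, using SGD with
# persistent contrastive divergence.  Assumptions: `data` is a float array
# of binary values shaped (nSamples, nVisible); the helper name
# demoTrainRBM is illustrative and not part of the original code.

import numpy as np
from numpy.random import RandomState


def demoTrainRBM(data, nHidden=16, nEpochs=5, batchSize=32,
                 learningRate=0.1, seed=0):
    rng = RandomState(seed)
    visible = np.zeros((batchSize, data.shape[-1]))
    hidden = np.zeros((batchSize, nHidden))
    rbm = RestrictedBoltzmannMachine(visible, hidden, rngSeed=seed)
    # persistent fantasy particles, initialized from random data rows
    fantasy = data[rng.choice(len(data), batchSize)]
    for epoch in range(nEpochs):
        for start in range(0, len(data) - batchSize + 1, batchSize):
            batch = data[start:start + batchSize]
            fantasy = rbm.updateParametersSGD(batch, fantasy, learningRate)
        print(f'epoch {epoch}: reconstruction MSE '
              f'{rbm.computeReconstructionError(batch):.4f}')
    return rbm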
def generate_experiments(toplevel_folder: str, clean_train_csv_file: str,
                         clean_test_csv_file: str, train_output_subdir: str,
                         test_output_subdir: str, models_output_dir: str,
                         stats_output_dir: str, dataset_name: str = 'imdb',
                         triggered_fracs=DEFAULT_TRIGGER_FRACS,
                         trigger_cfg=DEFAULT_SEQ_INSERT_TRIGGER_CFG,
                         trigger_behavior: tdb.LabelBehavior = tdb.WrappedAdd(1, 2)):
    """
    Generate an experiment list, given the necessary configurations
    :param toplevel_folder: the root folder under which the data lives
    :param clean_train_csv_file: csv file pointing to the clean training data, used when querying data to modify
    :param clean_test_csv_file: csv file pointing to the clean test data, used when querying data to modify
    :param train_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_clean/) where training data will
        be stored
    :param test_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_triggered/) where test data will
        be stored
    :param models_output_dir: directory where trained models should be stored
    :param stats_output_dir: directory where statistics should be stored
    :param dataset_name: the name of the dataset, used for autonaming some folders
    :param triggered_fracs: a list of the fractions of the data which should be triggered
    :param trigger_cfg: the trigger configuration applied when modifying the clean data
    :param trigger_behavior: the label behavior applied to triggered data
    :returns: a list of experiment configuration dictionaries, one per triggered fraction
    """
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()
    master_random_state_object.set_state(start_state)

    clean_dataset_rootdir = os.path.join(toplevel_folder,
                                         dataset_name + '_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder,
                                             dataset_name + '_triggered')
    tdx.modify_clean_text_dataset(clean_dataset_rootdir,
                                  clean_train_csv_file,
                                  triggered_dataset_rootdir,
                                  train_output_subdir, trigger_cfg,
                                  'insert', master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, clean_test_csv_file,
                                  triggered_dataset_rootdir,
                                  test_output_subdir, trigger_cfg,
                                  'insert', master_random_state_object)

    # Now create experiments from the generated data.  Three CSV files are
    # generated per experiment configuration: a train file, a clean_test
    # file, and a triggered_test file.  The train file contains a varying
    # percentage of poisoned data and is created in a loop iterating over
    # all supplied data poisoning percentages.  The clean and triggered
    # test data are created with the triggered fraction set to 0 and 100%
    # respectively, in order to use all of the available data for testing
    # both scenarios.

    # Create the clean & triggered test data.  This does not need to happen
    # in a loop, since the full test set is evaluated on both the clean and
    # the triggered data.
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=0.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(
        os.path.join(clean_dataset_rootdir, 'test_clean.csv'),
        os.path.join(triggered_dataset_rootdir, 'test'),
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=1.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(
        toplevel_folder, dataset_name + '_experiment_test_clean.csv')
    triggered_test_file = os.path.join(
        toplevel_folder, dataset_name + '_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=None)
    test_triggered_df.to_csv(triggered_test_file, index=None)

    # create the triggered data experiments for training
    experiment_list = []
    for trigger_frac in triggered_fracs:
        trigger_frac_str = '%0.02f' % (trigger_frac,)
        train_df = experiment_obj.create_experiment(
            os.path.join(clean_dataset_rootdir, 'train_clean.csv'),
            os.path.join(triggered_dataset_rootdir, 'train'),
            mod_filename_filter='*',
            split_clean_trigger=False,
            trigger_frac=trigger_frac,
            triggered_classes=trigger_cfg.triggered_classes)
        train_file = os.path.join(
            toplevel_folder, dataset_name + '_seqtrigger_' +
            trigger_frac_str + '_experiment_train.csv')
        train_df.to_csv(train_file, index=None)

        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name=dataset_name + '_sentencetrigger_' +
                                   trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list
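# Illustrative invocation (all paths hypothetical; the clean CSV files are
# assumed to exist from a prior data-generation step):
#
#     experiments = generate_experiments(
#         toplevel_folder='/tmp/imdb',
#         clean_train_csv_file='train_clean.csv',
#         clean_test_csv_file='test_clean.csv',
#         train_output_subdir='train',
#         test_output_subdir='test',
#         models_output_dir='/tmp/imdb/models',
#         stats_output_dir='/tmp/imdb/stats')
#
# Each returned dict carries the train/clean-test/triggered-test CSV paths
# and the output directories for one poisoning fraction.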
class Generator:
    def __init__(self, seed=1):
        super().__init__()
        self.random = RandomState(seed)
        self.seed = seed

    def reseed(self):
        self.random = RandomState(self.seed)

    def randSyllable(self):
        c1_dice = (self.random.random_sample() < 0.91)  # chance that a regular consonant starts the syllable
        s1_dice = (self.random.random_sample() < 0.05)  # chance that a special conjunction consonant is used
        v1_dice = (self.random.random_sample() < 0.85)  # chance that a regular vowel is used
        c2_add_dice = (self.random.random_sample() < 0.28)  # chance of an ending consonant
        c2_dice = (self.random.random_sample() < 0.91)  # chance that a regular consonant ends the syllable
        s2_dice = (self.random.random_sample() < 0.03)  # chance that the ending has an addon consonant
        c1 = self.random.choice(REGULAR_CONSONANTS) if c1_dice \
            else self.random.choice(COMPOSITE_CONSONANTS)
        s1 = self.random.choice(SPECIAL_CONSONANTS) if s1_dice else ''
        v1 = self.random.choice(REGULAR_VOWELS) if v1_dice \
            else self.random.choice(COMPOSITE_VOWELS)
        c2 = (self.random.choice(REGULAR_CONSONANTS) if c2_dice
              else self.random.choice(ENDING_CONSONANTS)) if c2_add_dice else ''
        s2 = self.random.choice(ADDON_ENDING_CONSONANTS) if s2_dice else ''
        return c1 + s1 + v1 + c2 + s2

    def randWord(self, s=2):
        """s = number of syllables as an int"""
        word = ''
        for _ in range(s):
            word += self.randSyllable()
        return word

    def randSentence(self, meter=[2, 2, 1, 2, 3, 2, 1, 2, 2]):
        sentence = []
        for syllables in meter:
            sentence.append(self.randWord(syllables))
        return ' '.join(sentence)

    def randParagraph(self):
        paragraph = []
        rand_wordcount = [self.random.randint(3, 6)
                          for _ in range(self.random.randint(4, 5))]
        for words in rand_wordcount:
            rand_meter = [self.random.randint(1, 4) for _ in range(words)]
            paragraph.append(self.randSentence(rand_meter))
        return '. '.join(paragraph)

    def randDictionary(self, word_list=['apple', 'banana', 'cake', 'dog',
                                        'elephant', 'fruit', 'guava',
                                        'human', 'island', 'joke', 'king',
                                        'love', 'mother', 'nature', 'ocean',
                                        'pie', 'queen', 'random', 'start',
                                        'tree', 'up', 'vine', 'wisdom',
                                        'yellow', 'zoo']):
        rand_dict_e2r = {word: self.randWord() for word in word_list}
        rand_dict_r2e = {v: k for k, v in rand_dict_e2r.items()}
        ordered_e2r = OrderedDict()
        print("English to Random Language")
        for key in sorted(rand_dict_e2r.keys()):
            print(key + ' : ' + rand_dict_e2r[key])
            ordered_e2r[key] = rand_dict_e2r[key]
        ordered_r2e = OrderedDict()
        print("\n\nRandom Language to English")
        for key in sorted(rand_dict_r2e.keys()):
            print(key + ' : ' + rand_dict_r2e[key])
            ordered_r2e[key] = rand_dict_r2e[key]
        return ordered_e2r, ordered_r2e

    def convertWord(self, word):
        word = word.lower()
        saved_state = self.random.get_state()
        # Word mapping method: md5.  To make the output more natural, this
        # mapping should be updated to reflect natural language patterns.
        md5 = hashlib.md5(bytes(word, encoding='utf-8'))
        wordseed = (self.seed +
                    int.from_bytes(md5.digest(), 'little')) % (2 ** 31)
        self.random.seed(wordseed)
        randword = self.randWord(math.ceil(abs(self.random.normal(2, 1))))
        self.random.set_state(saved_state)
        return randword

    def convertSentence(self, sentence):
        words = sentence.split()
        converted = [self.convertWord(word) for word in words]
        return ' '.join(converted)
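# Usage sketch for Generator (outputs depend on the consonant/vowel tables
# defined elsewhere in the module; the seed below is arbitrary):

if __name__ == '__main__':
    g = Generator(seed=42)
    print(g.randWord(3))       # a random three-syllable word
    print(g.randSentence())    # a sentence following the default meter
    print(g.randParagraph())   # several sentences joined by '. '
    # deterministic per-word mapping: the same input always maps to the
    # same generated word, independent of any draws in between
    print(g.convertSentence('hello world'))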