Beispiel #1
0
    def _clean_data(self):
        """
        Sanitize the assigned data.

        ``self.data`` is flattened into a 1D np.array of floats, non-finite
        entries are dropped, and the result is written back to ``self.data``.

        If no finite entries remain, or if max(data) - min(data) is not a
        finite number greater than zero, the corresponding message is printed
        and the process exits with status 1.

        :param: self
        :return: None
        :raises ControlledError: if the data is neither LISTLIKE nor a set
        """

        def _exit_if(failed, template, value):
            # On failure: print the message carried by a ControlledError and
            # terminate the interpreter with exit status 1.
            if failed:
                print(ControlledError(template % value))
                sys.exit(1)

        raw = self.data

        # Only list-likes and sets are accepted; a set is turned into a list
        # before being handed to numpy
        if not isinstance(raw, LISTLIKE):
            if not isinstance(raw, set):
                raise ControlledError(
                    "Error: could not cast data into an np.array")
            raw = list(raw)

        # Flatten to one dimension
        values = np.array(raw).ravel()

        # Every entry has to be a real number
        check(all(isinstance(entry, numbers.Real) for entry in values),
              'not all entries in data are real numbers')

        # Convert to floats, then discard anything that is not finite
        values = values.astype(float)
        values = values[np.isfinite(values)]

        # At least one data point must survive
        _exit_if(not (len(values) > 0),
                 'Input check failed, data must have length > 0: data = %s',
                 values)

        # The spread of the data must be finite ...
        spread = max(values) - min(values)
        _exit_if(not np.isfinite(spread),
                 'Input check failed. Data[max]-Data[min] is not finite: Data spread = %s',
                 spread)

        # ... and strictly positive
        _exit_if(not (spread > 0),
                 'Input check failed. Data[max]-Data[min] must be > 0: data_spread = %s',
                 spread)

        # Store the cleaned data
        self.data = values
Beispiel #2
0
    def __init__(self, dataset='old_faithful_eruption_times'):
        """
        Load one of the example datasets by name.

        :param dataset: (str) name of the dataset; must be one of the
            entries returned by ``self.list()``.
        """

        # Reject names that self.list() does not report
        is_known = dataset in self.list()
        check(is_known, 'Distribution "%s" not recognized.' % dataset)

        # The dataset lives in <data_dir>/<dataset>.txt; load it from there
        self._load_dataset('%s/%s.txt' % (data_dir, dataset))
Beispiel #3
0
def demo(example='real_data'):
    """
    Performs a demonstration of suftware.

    The selected example script is read from the ``docs`` directory next to
    this module, echoed to stdout between two separator lines, and then
    executed.

    Parameters
    ----------

    example: (str)
        A string specifying which demo to run. Must be 'real_data',
        'simulated_data', or 'custom_data'.

    Return
    ------

    None.
    """

    import os
    example_dir = os.path.dirname(__file__)

    example_dict = {
        'custom_data': 'docs/example_custom.py',
        'simulated_data': 'docs/example_wide.py',
        'real_data': 'docs/example_alcohol.py'
    }

    # list(...) so the message shows the plain names rather than the
    # "dict_keys([...])" repr produced by a keys view on Python 3
    check(example in example_dict,
          'example = %s is not valid. Must be one of %s' %
          (example, list(example_dict)))

    file_name = '%s/%s' % (example_dir, example_dict[example])

    # Read the script exactly once; the handle is closed by the with-block
    with open(file_name, 'r') as f:
        content = f.read()

    line = '-------------------------------------------------------------'
    print('Running %s:\n%s\n%s\n%s' % (file_name, line, content, line))

    # Run the text that was just displayed (a script shipped alongside this
    # module, not user-supplied input). Compiling with the file name makes
    # tracebacks point at the example script.
    exec(compile(content, file_name, 'exec'))
Beispiel #4
0
    def _inputs_check(self):
        """
        Check all inputs NOT having to do with the choice of grid.

        Every attribute is validated through ``check``. A few attributes are
        also normalized in place:

        - ``grid`` becomes a 1D np.array of floats
        - ``bounding_box`` becomes a tuple of two floats
        - ``num_samples_for_Z`` and ``num_posterior_samples`` become int

        :param self:
        :return: None
        :raises ControlledError: if ``grid`` cannot be cast to a 1D array of
            floats (all other failures are reported through ``check``)
        """

        # alpha is validated first: the grid and num_grid_points checks
        # below compute 2 * alpha and format it with %d, which would fail
        # with an unrelated error if alpha had the wrong type.

        # alpha is int
        check(isinstance(self.alpha, int),
              'type(alpha) = %s; must be int.' % type(self.alpha))

        # alpha in range
        check(1 <= self.alpha <= 4,
              'alpha = %d; must have 1 <= alpha <= 4' % self.alpha)

        if self.grid_spacing is not None:

            # grid_spacing is a number
            check(
                isinstance(self.grid_spacing, numbers.Real),
                'type(grid_spacing) = %s; must be a number' %
                type(self.grid_spacing))

            # grid_spacing is positive
            check(self.grid_spacing > 0,
                  'grid_spacing = %f; must be > 0.' % self.grid_spacing)

        if self.grid is not None:

            # grid is a list or np.array
            types = (list, np.ndarray, np.matrix)
            check(
                isinstance(self.grid, types),
                'type(grid) = %s; must be a list or np.ndarray' %
                type(self.grid))

            # cast grid as 1D np.array of floats
            try:
                self.grid = np.array(self.grid).ravel().astype(float)
            except Exception:
                # Any conversion failure is reported as a ControlledError;
                # unlike a bare except, KeyboardInterrupt and SystemExit
                # are allowed to propagate.
                raise ControlledError(
                    'Cannot cast grid as 1D np.array of floats.')

            # grid has appropriate number of points
            check(
                2 * self.alpha <= len(self.grid) <= MAX_NUM_GRID_POINTS,
                'len(grid) = %d; must have %d <= len(grid) <= %d.' %
                (len(self.grid), 2 * self.alpha, MAX_NUM_GRID_POINTS))

            # grid is ordered
            diffs = np.diff(self.grid)
            check(all(diffs > 0), 'grid is not monotonically increasing.')

            # grid is evenly spaced
            check(
                all(np.isclose(diffs, diffs.mean())),
                'grid is not evenly spaced; grid spacing = %f +- %f' %
                (diffs.mean(), diffs.std()))

        if self.num_grid_points is not None:

            # num_grid_points is an integer
            check(
                isinstance(self.num_grid_points, int),
                'type(num_grid_points) = %s; must be int.' %
                type(self.num_grid_points))

            # num_grid_points is in the right range
            check(
                2 * self.alpha <= self.num_grid_points <= MAX_NUM_GRID_POINTS,
                'num_grid_points = %d; must have %d <= num_grid_points <= %d.'
                % (self.num_grid_points, 2 * self.alpha, MAX_NUM_GRID_POINTS))

        # bounding_box
        if self.bounding_box is not None:

            # bounding_box is right type
            box_types = (list, tuple, np.ndarray)
            check(
                isinstance(self.bounding_box, box_types),
                'type(bounding_box) = %s; must be one of %s' %
                (type(self.bounding_box), box_types))

            # bounding_box has right length
            check(
                len(self.bounding_box) == 2,
                'len(bounding_box) = %d; must be %d' %
                (len(self.bounding_box), 2))

            # bounding_box entries must be numbers
            check(
                isinstance(self.bounding_box[0], numbers.Real)
                and isinstance(self.bounding_box[1], numbers.Real),
                'bounding_box = %s; entries must be numbers' %
                repr(self.bounding_box))

            # bounding_box entries must be sorted
            check(
                self.bounding_box[0] < self.bounding_box[1],
                'bounding_box = %s; entries must be sorted' %
                repr(self.bounding_box))

            # reset bounding_box as tuple
            self.bounding_box = (float(self.bounding_box[0]),
                                 float(self.bounding_box[1]))

        # periodic is bool
        check(isinstance(self.periodic, bool),
              'type(periodic) = %s; must be bool' % type(self.periodic))

        # evaluation_method_for_Z is valid
        Z_evals = ['Lap', 'Lap+Imp', 'Lap+Fey']
        check(
            self.Z_evaluation_method in Z_evals,
            'Z_eval = %s; must be in %s' % (self.Z_evaluation_method, Z_evals))

        # num_samples_for_Z is an integer
        check(
            isinstance(self.num_samples_for_Z, numbers.Integral),
            'type(self.num_samples_for_Z) = %s; ' %
            type(self.num_samples_for_Z) + 'must be integer.')
        self.num_samples_for_Z = int(self.num_samples_for_Z)

        # num_samples_for_Z is in range
        check(
            0 <= self.num_samples_for_Z <= MAX_NUM_SAMPLES_FOR_Z,
            'self.num_samples_for_Z = %d; ' % self.num_samples_for_Z +
            ' must satisfy 0 <= num_samples_for_Z <= %d.' %
            MAX_NUM_SAMPLES_FOR_Z)

        # max_t_step is a number
        check(
            isinstance(self.max_t_step, numbers.Real),
            'type(max_t_step) = %s; must be a number' % type(self.max_t_step))

        # max_t_step is positive
        check(self.max_t_step > 0,
              'max_t_step = %f; must be > 0.' % self.max_t_step)

        # print_t is bool
        check(isinstance(self.print_t, bool),
              'type(print_t) = %s; must be bool.' % type(self.print_t))

        # tolerance is a number
        check(isinstance(self.tolerance, numbers.Real),
              'type(tolerance) = %s; must be number' % type(self.tolerance))

        # tolerance is positive
        check(self.tolerance > 0,
              'tolerance = %f; must be > 0' % self.tolerance)

        # resolution is number
        check(isinstance(self.resolution, numbers.Real),
              'type(resolution) = %s; must be number' % type(self.resolution))

        # resolution is positive
        check(self.resolution > 0,
              'resolution = %f; must be > 0' % self.resolution)

        if self.seed is not None:

            # seed is int
            check(isinstance(self.seed, int),
                  'type(seed) = %s; must be int' % type(self.seed))

            # seed is in range
            check(0 <= self.seed <= 2**32 - 1,
                  'seed = %d; must have 0 <= seed <= 2**32 - 1' % self.seed)

        # sample_only_at_l_star is bool
        check(
            isinstance(self.sample_only_at_l_star, bool),
            'type(sample_only_at_l_star) = %s; must be bool.' %
            type(self.sample_only_at_l_star))

        # num_posterior_samples is int
        check(
            isinstance(self.num_posterior_samples, numbers.Integral),
            'type(num_posterior_samples) = %s; must be integer' %
            type(self.num_posterior_samples))
        self.num_posterior_samples = int(self.num_posterior_samples)

        # num_posterior_samples is in range (it is an int here, hence %d)
        check(
            0 <= self.num_posterior_samples <= MAX_NUM_POSTERIOR_SAMPLES,
            'num_posterior_samples = %d; need ' % self.num_posterior_samples +
            '0 <= num_posterior_samples <= %d.' % MAX_NUM_POSTERIOR_SAMPLES)

        # max_log_evidence_ratio_drop is number
        check(
            isinstance(self.max_log_evidence_ratio_drop, numbers.Real),
            'type(max_log_evidence_ratio_drop) = %s; must be number' %
            type(self.max_log_evidence_ratio_drop))

        # max_log_evidence_ratio_drop is positive
        check(
            self.max_log_evidence_ratio_drop > 0,
            'max_log_evidence_ratio_drop = %f; must be > 0' %
            self.max_log_evidence_ratio_drop)
Beispiel #5
0
    def _run(self):
        """
        Estimates the probability density from data using the DEFT algorithm.
        Also samples posterior densities

        The method reads its settings, the data, the grid size/spacing and
        ``bin_edges`` from ``self``, histograms the data, passes the counts
        to ``deft_core.run`` and stores the returned results object, rescaled
        by the grid spacing, in ``self.results``.
        """

        # Portable timer: time.clock() was removed in Python 3.8
        from timeit import default_timer

        # Extract information from Deft1D object
        data = self.data
        G = self.num_grid_points
        h = self.grid_spacing
        alpha = self.alpha
        periodic = self.periodic
        Z_eval = self.Z_evaluation_method
        num_Z_samples = self.num_samples_for_Z
        DT_MAX = self.max_t_step
        print_t = self.print_t
        tolerance = self.tolerance
        resolution = self.resolution
        deft_seed = self.seed
        num_pt_samples = self.num_posterior_samples
        fix_t_at_t_star = self.sample_only_at_l_star
        max_log_evidence_ratio_drop = self.max_log_evidence_ratio_drop

        # Seed numpy's global RNG; a seed of None asks numpy to pick fresh
        # entropy, so no special-casing is needed
        np.random.seed(deft_seed)

        # Create Laplacian
        laplacian_start_time = default_timer()
        op_type = '1d_periodic' if periodic else '1d_bilateral'
        Delta = laplacian.Laplacian(op_type, alpha, G)
        laplacian_compute_time = default_timer() - laplacian_start_time
        if print_t:
            print('Laplacian computed de novo in %f sec.' %
                  laplacian_compute_time)

        # Histogram the data using the precomputed bin edges
        counts, _ = np.histogram(data, self.bin_edges)
        N = counts.sum()

        # Make sure a sufficient number of bins are nonzero
        num_nonempty_bins = (counts > 0).sum()
        check(
            num_nonempty_bins > self.alpha,
            'Histogram has %d nonempty bins; must be > %d.' %
            (num_nonempty_bins, self.alpha))

        # Compute initial t. np.log replaces the scipy root-namespace alias
        # sp.log, which SciPy has deprecated; both arguments are positive
        # here (N > 0 is implied by the nonempty-bin check above).
        t_start = min(0.0, np.log(N) - 2.0 * alpha * np.log(alpha / h))
        if print_t:
            print('t_start = %0.2f' % t_start)

        # Do DEFT density estimation
        core_results = deft_core.run(counts, Delta, Z_eval, num_Z_samples,
                                     t_start, DT_MAX, print_t, tolerance,
                                     resolution, num_pt_samples,
                                     fix_t_at_t_star,
                                     max_log_evidence_ratio_drop)

        # Fill in results
        results = core_results  # Get all results from deft_core

        # Normalize densities properly: divide by the grid spacing h
        results.h = h
        results.L = G * h
        results.R /= h
        results.M /= h
        results.Q_star /= h
        results.l_star = h * (np.exp(-results.t_star) * N)**(1 / (2. * alpha))
        for p in results.map_curve.points:
            p.Q /= h
        # Q_samples is only rescaled when posterior samples were requested
        if num_pt_samples != 0:
            results.Q_samples /= h
        results.Delta = Delta

        # Store results
        self.results = results
Beispiel #6
0
    def get_stats(self, use_weights=True, show_samples=False):
        """
        Computes summary statistics for the estimated density

        parameters
        ----------

        show_samples: (bool)
            If True, summary stats are computed for each posterior sample.
            If False, summary stats are returned for the "star" estimate,
            the histogram, and the maxent estimate, along with the mean and
            RMSD values of these stats across posterior samples.

        use_weights: (bool)
            If True, mean and RMSD are computed using importance weights.

        returns
        -------

        df: (pd.DataFrame)
            A pandas data frame listing summary statistics for the estimated
            probability densities. These summary statistics include
            "entropy" (in bits), "mean", "variance", "skewness", and
            "kurtosis". If ``show_samples = False``, results will be shown for
            the best estimate, as well as mean and RMSD values across all
            samples. If ``show_samples = True``, results will be shown for
            each sample. A column showing sample weights will also be included.
        """

        # Check inputs
        check(isinstance(use_weights, bool),
              'use_weights = %s; must be True or False.' % use_weights)
        check(isinstance(show_samples, bool),
              'show_samples = %s; must be True or False.' % show_samples)

        # Grid positions and spacing shared by all statistics below; each
        # sum of h * Q * f(x) is a Riemann sum over the grid
        x = self.grid
        h = self.grid_spacing

        # Define a function for each summary statistic
        def mean(Q):
            return np.sum(h * Q * x)

        def central_moment(Q, order):
            return np.sum(h * Q * (x - mean(Q))**order)

        def entropy(Q):
            # Entropy in bits is MINUS the sum of Q * log2(Q). The leading
            # minus sign was missing previously, so the negated value was
            # reported. eps keeps log2 finite where Q is exactly zero.
            eps = 1E-10
            assert (all(Q >= 0))
            return -np.sum(h * Q * np.log2(Q + eps))

        def variance(Q):
            return central_moment(Q, 2)

        def skewness(Q):
            # Exponent written as 1.5: the expression (3 / 2) evaluates to 1
            # under Python 2 integer division
            return central_moment(Q, 3) / central_moment(Q, 2)**1.5

        def kurtosis(Q):
            return central_moment(Q, 4) / central_moment(Q, 2)**2

        # Index functions by their names and set these as columns
        col2func_dict = {
            'entropy': entropy,
            'mean': mean,
            'variance': variance,
            'skewness': skewness,
            'kurtosis': kurtosis
        }
        cols = list(col2func_dict)
        if show_samples:
            cols += ['weight']

        # Create list of row names
        if show_samples:
            rows = ['sample %d' % n for n in range(self.num_posterior_samples)]
        else:
            rows = [
                'star', 'histogram', 'maxent', 'posterior mean',
                'posterior RMSD'
            ]

        # Initialize data frame
        df = pd.DataFrame(columns=cols, index=rows)

        # Set sample weights
        if use_weights:
            ws = self.sample_weights
        else:
            ws = np.ones(self.num_posterior_samples)

        # Fill in data frame column by column
        for col in cols:

            # If listing weights, do so
            if col == 'weight':
                df.loc[:, col] = ws
                continue

            # Get summary statistic function
            func = col2func_dict[col]

            # Compute func value for each sample (samples are the columns
            # of self.sample_values)
            ys = np.array([
                func(self.sample_values[:, n])
                for n in range(self.num_posterior_samples)
            ], dtype=float)

            # If recording individual results for all samples, do so
            if show_samples:
                df.loc[:, col] = ys
                continue

            # Otherwise, record individual entries

            # Fill in func value for star density
            df.loc['star', col] = func(self.values)

            # Fill in func value for histogram
            df.loc['histogram', col] = func(self.histogram)

            # Fill in func value for maxent point
            df.loc['maxent', col] = func(self.maxent)

            # Record (weighted) mean and rmsd values across samples
            mu = np.sum(ys * ws) / np.sum(ws)
            df.loc['posterior mean', col] = mu
            df.loc['posterior RMSD', col] = np.sqrt(
                np.sum(ws * (ys - mu)**2) / np.sum(ws))

        # Return data frame to user
        return df
Beispiel #7
0
    def evaluate_samples(self, x, resample=True):
        """
        Evaluate sampled densities at specified locations.

        parameters
        ----------

        x: (number or list-like collection of numbers)
            The locations in the data domain at which to evaluate sampled
            density.

        resample: (bool)
            Whether to use importance resampling, i.e., should the values
            returned be from the original samples (obtained using a Laplace
            approximated posterior) or should they be resampled to
            account for the deviation between the true Bayesian posterior
            and its Laplace approximation.

        returns
        -------

        A 1D np.array (if x is a number) or a 2D np.array (if x is list-like),
        representing the values of the posterior sampled densities at the
        specified locations. The first index corresponds to values in x, the
        second to sampled densities.
        """

        # Clean input
        x_arr, is_number = clean_numerical_input(x)

        # Check resample type
        check(isinstance(resample, bool),
              'type(resample) = %s. Must be bool.' % type(resample))

        # Make sure that posterior samples were taken. Note the trailing
        # space in the first literal: adjacent literals are joined verbatim.
        check(
            self.num_posterior_samples > 0,
            'Cannot evaluate samples because no posterior samples '
            'have been computed.')

        assert (len(self.sample_density_funcs) == self.num_posterior_samples)

        # Evaluate all sampled densities at x; after the transpose, rows
        # follow x and columns follow the sampled densities
        values = np.array(
            [d.evaluate(x_arr) for d in self.sample_density_funcs]).T

        # If requested, resample columns of values array based on
        # sample weights (drawn with replacement via numpy's global RNG)
        if resample:
            probs = self.sample_weights / self.sample_weights.sum()
            old_cols = np.arange(self.num_posterior_samples)
            new_cols = np.random.choice(old_cols,
                                        size=self.num_posterior_samples,
                                        replace=True,
                                        p=probs)
            values = values[:, new_cols]

        # If number was passed as input, return 1D np.array
        if is_number:
            values = values.ravel()

        return values
Beispiel #8
0
    def _set_grid(self):
        """
        Sets the grid based on user input

        Precedence, as implemented below: an explicitly supplied ``grid``
        determines everything. Otherwise the bounding box is taken from
        ``bounding_box`` if given, else derived from the data; the grid is
        then laid out from ``grid_spacing`` if given, else from
        ``num_grid_points`` if given, else from defaults computed from the
        data.

        On exit, grid, grid_spacing, grid_padding, num_grid_points,
        bounding_box, lower_bound, upper_bound, box_size and bin_edges are
        stored on ``self``.
        """

        samples = self.data
        pts = self.grid
        spacing = self.grid_spacing
        n_pts = self.num_grid_points
        box = self.bounding_box
        alpha = self.alpha

        def _centered_grid(lo_edge, hi_edge, pad_width, count):
            # Grid points run from lo_edge + pad_width to hi_edge - pad_width;
            # the stop value is scaled by (1 + SMALL_NUM) "for safety"
            first = lo_edge + pad_width
            last = hi_edge - pad_width
            return np.linspace(first, last * (1 + SMALL_NUM), count)

        if pts is not None:
            ### Grid supplied: derive every other quantity from it ###

            # Number of grid points
            n_pts = len(pts)
            assert (n_pts >= 2 * alpha)

            # Grid spacing is the mean step; steps must be positive and even
            steps = np.diff(pts)
            spacing = steps.mean()
            assert (spacing > 0)
            assert (all(np.isclose(steps, spacing)))

            # Bounds sit half a step outside the outermost grid points
            pad = spacing / 2
            lo = pts[0] - pad
            hi = pts[-1] + pad
            box = np.array([lo, hi])
            extent = hi - lo

        else:
            ### Step 1: bounding box ###

            if box is not None:
                # Use the bounding box as supplied
                assert box[0] < box[1]
                lo, hi = box[0], box[1]
                extent = hi - lo

            else:
                # Derive the bounding box from the data
                assert isinstance(samples, np.ndarray)
                assert all(np.isfinite(samples))
                assert min(samples) < max(samples)

                # Enclose all data, leaving 20% of the span free on each side
                top = max(samples)
                bottom = min(samples)
                span = top - bottom
                lo = bottom - .2 * span
                hi = top + .2 * span

                # Do not let the lower bound cross zero for nonnegative data
                if bottom >= 0 and lo < 0:
                    lo = 0

                # Likewise keep the upper bound from crossing 0, 1 or 100
                # when the data itself stays at or below that value
                for ceiling in (0, 1, 100):
                    if top <= ceiling and hi > ceiling:
                        hi = ceiling

                # Widen the box slightly for numerical safety
                lo = lo - SMALL_NUM * span
                hi = hi + SMALL_NUM * span
                extent = hi - lo

                box = np.array([lo, hi])

            ### Step 2: grid inside the bounding box ###

            if spacing is not None:
                # grid_spacing supplied
                assert isinstance(spacing, float)
                assert np.isfinite(spacing)
                assert spacing > 0

                # As many whole steps as fit into the box
                n_pts = np.floor(extent / spacing).astype(int)

                # Not too few grid points
                check(
                    2 * self.alpha <= n_pts,
                    'Using grid_spacing = %f ' % spacing +
                    'produces num_grid_points = %d, ' % n_pts +
                    'which is too small. Reduce grid_spacing or do not set.')

                # Not too many grid points
                check(
                    n_pts <= MAX_NUM_GRID_POINTS,
                    'Using grid_spacing = %f ' % spacing +
                    'produces num_grid_points = %d, ' % n_pts +
                    'which is too big. Increase grid_spacing or do not set.')

                # Split the leftover room evenly between both ends
                # Note: grid_spacing/2 <= grid_padding < grid_spacing
                pad = (extent - (n_pts - 1) * spacing) / 2
                assert (spacing / 2 <= pad < spacing)

                pts = _centered_grid(lo, hi, pad, n_pts)

            elif n_pts is not None:
                # num_grid_points supplied
                assert isinstance(n_pts, int)
                assert 2 * alpha <= n_pts <= MAX_NUM_GRID_POINTS

                # Spacing follows from the box size; padding is half a step
                spacing = extent / n_pts
                pad = spacing / 2

                pts = _centered_grid(lo, hi, pad, n_pts)

            else:
                # Neither supplied: choose spacing and count from the data
                assert isinstance(samples, np.ndarray)
                assert all(np.isfinite(samples))
                assert min(samples) < max(samples)

                # Spacing that would give the default number of grid points
                fallback_spacing = extent / DEFAULT_NUM_GRID_POINTS

                # Fewest grid points permitted
                fewest_pts = 2 * alpha

                # Smallest positive gap between sorted data points, capped so
                # that at least fewest_pts points fit in the box.
                # NOTE: the sort is in place, so self.data ends up sorted.
                samples.sort()
                gaps = np.diff(samples)
                finest_spacing = min(gaps[gaps > 0])
                finest_spacing = min(finest_spacing, extent / fewest_pts)

                # Use the coarser of the two candidate spacings
                spacing = max(finest_spacing, fallback_spacing)

                # Number of grid points and padding
                n_pts = np.floor(extent / spacing).astype(int)
                pad = spacing / 2

                pts = _centered_grid(lo, hi, pad, n_pts)

        # Set final grid
        self.grid = pts
        self.grid_spacing = spacing
        self.grid_padding = pad
        self.num_grid_points = n_pts
        self.bounding_box = box
        self.lower_bound = lo
        self.upper_bound = hi
        self.box_size = extent

        # Make sure that the final number of gridpoints is ok.
        check(
            2 * self.alpha <= self.num_grid_points <= MAX_NUM_GRID_POINTS,
            'After setting grid, we find that num_grid_points = %d; must have %d <= len(grid) <= %d. '
            % (self.num_grid_points, 2 * self.alpha, MAX_NUM_GRID_POINTS) +
            'Something is wrong with input values of grid, grid_spacing, num_grid_points, or bounding_box.'
        )

        # Bin edges: the two bounds plus the midpoints between grid points
        inner_edges = pts[:-1] + spacing / 2
        self.bin_edges = np.concatenate(([lo], inner_edges, [hi]))
Beispiel #9
0
    def __init__(self,
                 distribution='gaussian',
                 num_data_points=100,
                 seed=None):
        """
        Draw a simulated dataset from one of the named distributions.

        :param distribution: (str) name of the distribution; must be one of
            the entries returned by ``self.list()``.
        :param num_data_points: (int) number of data points to draw; must
            satisfy 0 < num_data_points <= MAX_DATASET_SIZE.
        :param seed: value passed to ``np.random.seed`` before sampling.
        :raises ControlledError: if numpy rejects the seed, or if the
            distribution name has no entry in the tables below.

        Sets the attributes data, bounding_box, distribution, pdf_js, pdf_py
        and periodic on the instance.
        """

        # The distribution name must be a known one
        check(distribution in self.list(),
              'distribution = %s is not valid' % distribution)

        # num_data_points must be integral; it is then cast to a plain int
        check(isinstance(num_data_points, numbers.Integral),
              'num_data_points = %s is not an integer.' % num_data_points)
        num_data_points = int(num_data_points)

        # ... and must lie in the permitted range
        check(
            0 < num_data_points <= MAX_DATASET_SIZE,
            'num_data_points = %d; must have 0 < num_data_points <= %d.' %
            (num_data_points, MAX_DATASET_SIZE))

        # Seed numpy's global RNG, translating numpy's complaints
        try:
            np.random.seed(seed)
        except TypeError:
            raise ControlledError('type(seed) = %s; invalid type.' %
                                  type(seed))
        except ValueError:
            raise ControlledError('seed = %s; invalid value.' % seed)

        # Gaussian mixtures:
        # name -> (description, mus, sigmas, weights, bounding_box)
        mixture_specs = {
            'gaussian': ('Gaussian distribution',
                         [0.], [1.], [1.], [-5, 5]),
            'narrow': ('Gaussian mixture, narrow separation',
                       [-1.25, 1.25], [1., 1.], [1., 1.], [-6, 6]),
            'wide': ('Gaussian mixture, wide separation',
                     [-2.0, 2.0], [1.0, 1.0], [1.0, 0.5], [-6.0, 6.0]),
            'foothills': ('Foothills (Gaussian mixture)',
                          [0., 5., 8., 10, 11],
                          [2., 1., 0.5, 0.25, 0.125],
                          [1., 1., 1., 1., 1.],
                          [-5, 12]),
            'accordian': ('Accordian (Gaussian mixture)',
                          [0., 5., 8., 10, 11, 11.5],
                          [2., 1., 0.5, 0.25, 0.125, 0.0625],
                          [16., 8., 4., 2., 1., 0.5],
                          [-5, 13]),
            'goalposts': ('Goalposts (Gaussian mixture)',
                          [-20, 20], [1., 1.], [1., 1.], [-25, 25]),
            'towers': ('Towers (Gaussian mixture)',
                       [-20, -15, -10, -5, 0, 5, 10, 15, 20],
                       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
                       [1., 1., 1., 1., 1., 1., 1., 1., 1.],
                       [-25, 25]),
        }

        # Distributions sampled through scipy.stats:
        # name -> (description, sampler, bounding_box, pdf_js, pdf_py,
        #          periodic)
        # The sampler is only invoked for the distribution actually chosen.
        scipy_specs = {
            'uniform': (
                'Uniform distribution',
                lambda n: stats.uniform.rvs(size=n),
                [0, 1],
                "1.0",
                "1.0",
                False),
            'beta_convex': (
                'Convex beta distribtuion',
                lambda n: stats.beta.rvs(a=0.5, b=0.5, size=n),
                [0, 1],
                "Math.pow(x,-0.5)*Math.pow(1-x,-0.5)*math.gamma(1)/(math.gamma(0.5)*math.gamma(0.5))",
                "np.power(x,-0.5)*np.power(1-x,-0.5)*math.gamma(1)/(math.gamma(0.5)*math.gamma(0.5))",
                False),
            'beta_concave': (
                'Concave beta distribution',
                lambda n: stats.beta.rvs(a=2, b=2, size=n),
                [0, 1],
                "Math.pow(x,1)*Math.pow(1-x,1)*math.gamma(4)/(math.gamma(2)*math.gamma(2))",
                "np.power(x,1)*np.power(1-x,1)*math.gamma(4)/(math.gamma(2)*math.gamma(2))",
                False),
            'exponential': (
                'Exponential distribution',
                lambda n: stats.expon.rvs(size=n),
                [0, 5],
                "Math.exp(-x)",
                "np.exp(-x)",
                False),
            'gamma': (
                'Gamma distribution',
                lambda n: stats.gamma.rvs(a=3, size=n),
                [0, 10],
                "Math.pow(x,2)*Math.exp(-x)/math.gamma(3)",
                "np.power(x,2)*np.exp(-x)/math.gamma(3)",
                False),
            'triangular': (
                'Triangular distribution',
                lambda n: stats.triang.rvs(c=0.5, size=n),
                [0, 1],
                "2-4*Math.abs(x - 0.5)",
                "2-4*np.abs(x - 0.5)",
                False),
            'laplace': (
                "Laplace distribution",
                lambda n: stats.laplace.rvs(size=n),
                [-5, 5],
                "0.5*Math.exp(- Math.abs(x))",
                "0.5*np.exp(- np.abs(x))",
                False),
            'vonmises': (
                'von Mises distribution',
                lambda n: stats.vonmises.rvs(1, size=n),
                [-3.14159, 3.14159],
                "Math.exp(Math.cos(x))/7.95493",
                "np.exp(np.cos(x))/7.95493",
                True),
        }

        # NOTE: `description` is looked up but, as before, not stored on
        # the instance.
        if distribution in mixture_specs:
            (description, mus, sigmas, weights,
             bounding_box) = mixture_specs[distribution]
            periodic = False
            data, pdf_py, pdf_js = gaussian_mixture(num_data_points, weights,
                                                    mus, sigmas, bounding_box)

        elif distribution in scipy_specs:
            (description, draw, bounding_box, pdf_js, pdf_py,
             periodic) = scipy_specs[distribution]
            data = draw(num_data_points)

        else:
            raise ControlledError('Distribution type "%s" not recognized.' %
                                  distribution)

        # Publish the results as instance attributes
        self.data = data
        self.bounding_box = bounding_box
        self.distribution = distribution
        self.pdf_js = pdf_js
        self.pdf_py = pdf_py
        self.periodic = periodic