Example #1
0
def sample_trivariate_xyz(size=1000, seed=42):
    """Draw a three-column toy dataset.

    The resulting DataFrame has the following columns:

    * ``x``: Beta draws with a=0.1 and b=0.1
    * ``y``: Beta draws with a=0.1 and b=0.5
    * ``z``: Normal draws (default loc and scale) plus 10 times ``y``

    Args:
        size (int):
            Number of rows to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with three columns, ``x``, ``y`` and ``z``.
    """
    with random_seed(seed):
        # The draws happen in the order x, y, noise; keep it that way so the
        # generated values do not change.
        beta_x = stats.beta.rvs(a=0.1, b=0.1, size=size)
        beta_y = stats.beta.rvs(a=0.1, b=0.5, size=size)
        noise = np.random.normal(size=size)

    columns = {
        'x': beta_x,
        'y': beta_y,
        'z': noise + beta_y * 10,
    }
    return pd.DataFrame(columns)
Example #2
0
def sample_bivariate_age_income(size=1000, seed=42):
    """Draw a two-column toy dataset of simulated ages and incomes.

    ``age`` is drawn from a Beta distribution (a=2, b=6) shifted by 18 and
    scaled by 100. ``income`` is ``100 * log(age)`` plus Normal noise, so the
    two columns are positively correlated. Rows picked by a random
    ``randint(0, 10) == 0`` mask then have their income divided by 1000,
    which produces the outliers.

    Args:
        size (int):
            Number of rows to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.DataFrame:
            DataFrame with two columns, ``age`` and ``income``.
    """
    with random_seed(seed):
        age = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=size)
        log_age = np.log(age)

        # Draw order: age, income noise, outlier mask.
        noise = np.random.normal(loc=log_age / 100, scale=10, size=size)
        income = log_age * 100 + noise

        is_outlier = np.random.randint(0, 10, size=size) == 0
        income[is_outlier] /= 1000

    return pd.DataFrame({"age": age, "income": income})
Example #3
0
def sample_univariate_beta(size=1000, seed=42):
    """Draw values from a Beta distribution with a=3, b=1 and loc=4.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        draws = stats.beta.rvs(a=3, b=1, loc=4, size=size)

    return pd.Series(draws)
Example #4
0
def sample_univariate_degenerate(size=1000, seed=42):
    """Build a constant Series whose single repeated value is drawn at random.

    One value is drawn with ``np.random.random()`` and repeated ``size`` times.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        constant = np.random.random()

    repeated = np.full(size, constant)
    return pd.Series(repeated)
Example #5
0
def sample_univariate_exponential(size=1000, seed=42):
    """Draw values from an exponential distribution shifted to start at 3.0.

    The draws use ``np.random.exponential`` with its default scale and are
    then offset by 3.0.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        draws = np.random.exponential(size=size)

    shifted = draws + 3.0
    return pd.Series(shifted)
Example #6
0
def sample_univariate_normal(size=1000, seed=42):
    """Draw values from a Normal distribution with mean 1 and stdev 1.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        draws = np.random.normal(loc=1.0, size=size)

    return pd.Series(draws)
Example #7
0
def sample_univariate_uniform(size=1000, seed=42):
    """Draw values from a uniform distribution in [-1.0, 3.0].

    Unit-interval draws from ``np.random.random`` are stretched by 4.0 and
    shifted by -1.0.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        unit_draws = np.random.random(size=size)

    rescaled = 4.0 * unit_draws - 1.0
    return pd.Series(rescaled)
Example #8
0
def load_age_income(seed=42, size=500):
    """Load a bivariate toy dataset of simulated ages and incomes.

    This dataset contains two columns which correspond to the simulated age and
    income which are positively correlated with outliers.

    ``age`` is drawn from a Beta distribution (a=2, b=6) shifted by 18 and
    scaled by 100. ``income`` is ``100 * log(age)`` plus Normal noise. Rows
    picked by a random ``randint(0, 10) == 0`` mask then have their income
    divided by 1000, which produces the outliers.

    Args:
        seed (int):
            Random seed to use. Defaults to 42.
        size (int):
            Number of rows to generate. Defaults to 500, the row count this
            function always produced before the parameter existed.

    Returns:
        pandas.DataFrame:
            DataFrame with two columns, ``age`` and ``income``.
    """
    with random_seed(seed):
        # Draw order (age, income noise, outlier mask) is unchanged, so the
        # default call keeps generating the same values as before.
        age = stats.beta.rvs(a=2.0, b=6.0, loc=18, scale=100, size=size)
        income = np.log(age) * 100
        income += np.random.normal(loc=np.log(age) / 100, scale=10, size=size)
        income[np.random.randint(0, 10, size=size) == 0] /= 1000
    return pd.DataFrame({
        "age": age,
        "income": income
    })
Example #9
0
def sample_univariate_bernoulli(size=1000, seed=42):
    """Draw values from a Bernoulli distribution with p=0.3.

    Uniform draws from ``np.random.random`` are compared against 0.3: a draw
    below 0.3 becomes ``1.0`` and any other draw becomes ``0.0``.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values, as floats.
    """
    with random_seed(seed):
        uniform_draws = np.random.random(size=size)

    successes = pd.Series(uniform_draws < 0.3)
    return successes.astype(float)
Example #10
0
def load_three_dimensional(seed=42, size=1000):
    """Load a three dimensional toy dataset.

    This dataset contains 3 columns:

        x - a Beta distribution with a=0.1 and b=0.1
        y - a Beta distribution with a=0.1 and b=0.5
        z - a Normal distribution (default loc and scale) plus 10 times ``y``

    Args:
        seed (int):
            Random seed to use. Defaults to 42.
        size (int):
            Number of rows to generate. Defaults to 1000, the row count this
            function always produced before the parameter existed.

    Returns:
        pandas.DataFrame:
            DataFrame with three columns, ``x``, ``y`` and ``z``.
    """
    data = np.zeros((size, 3))
    with random_seed(seed):
        # Draw order (x, y, noise for z) is unchanged, so the default call
        # keeps generating the same values as before.
        data[:, 0] = stats.beta.rvs(a=0.1, b=0.1, size=size)
        data[:, 1] = stats.beta.rvs(a=0.1, b=0.5, size=size)
        data[:, 2] = np.random.normal(size=size) + data[:, 1] * 10
    return pd.DataFrame(data, columns=["x", "y", "z"])
Example #11
0
def sample_univariate_bimodal(size=1000, seed=42):
    """Draw values from a mixture of two Gaussians at 0.0 and 10.0 with stdev=1.

    A Bernoulli selector is obtained from ``sample_univariate_bernoulli``.
    Where the selector is 1 the value comes from the Normal draw centred at
    0.0, and where it is 0 the value comes from the Normal draw centred at
    10.0.

    Args:
        size (int):
            Number of values to generate. Defaults to 1000.
        seed (int):
            Random seed to use. Defaults to 42.

    Returns:
        pandas.Series:
            Series with the sampled values.
    """
    with random_seed(seed):
        # NOTE(review): the helper is called with the same seed from inside
        # this seeded context. Depending on how random_seed saves and restores
        # the RNG state, the selector and the Normal draws below may come from
        # the same stream. Confirm that this is intended.
        selector = sample_univariate_bernoulli(size, seed)

        # Draw order: the 0.0-centred component first, then the 10.0-centred one.
        around_zero = np.random.normal(size=size)
        around_ten = np.random.normal(loc=10, size=size)

        mixture = around_zero * selector + around_ten * (1.0 - selector)
        return pd.Series(mixture)
Example #12
0
def load_diverse_univariates(seed=42, size=1000):
    """Load a toy dataset made of several different univariate distributions.

    This dataset contains 6 columns, each of which corresponds to a different
    univariate distribution:

        bernoulli - a Bernoulli distribution with p=0.3
        bimodal - a mixture of two Gaussians at 0.0 and 10.0 with stdev=1
        uniform - a uniform distribution in [-1.0, 3.0]
        normal - a normal distribution at 1.0 with stdev=1
        constant - a constant value
        exponential - an exponential distribution at 3.0 with rate 1.0

    Args:
        seed (int):
            Random seed to use. Defaults to 42.
        size (int):
            Number of rows to generate. Defaults to 1000, the row count this
            function always produced before the parameter existed.

    Returns:
        pandas.DataFrame:
            DataFrame with the six columns listed above.
    """
    df = pd.DataFrame()
    with random_seed(seed):
        # The draws below happen in the same order as before the ``size``
        # parameter was added, so the default call generates the same values.
        df["bernoulli"] = (np.random.random(size=size) < 0.3).astype(float)

        # The bernoulli column selects which Gaussian each row takes:
        # 1 -> the one centred at 0.0, 0 -> the one centred at 10.0.
        around_zero = np.random.normal(size=size)
        around_ten = np.random.normal(size=size, loc=10)
        df["bimodal"] = (
            around_zero * df["bernoulli"]
            + around_ten * (1.0 - df["bernoulli"])
        )

        df["uniform"] = 4.0 * np.random.random(size=size) - 1.0
        df["normal"] = np.random.normal(size=size, loc=1.0)
        df["constant"] = np.random.random()  # a single random number
        df["exponential"] = np.random.exponential(size=size) + 3.0
    return df