Example #1
def insert_missing_values(df, percent_rows, random_state=None):
    """
    Inserts missing values into a data frame.

    :param df: data frame we're operating on
    :param percent_rows: the percentage of rows that should have a missing value.
    :param random_state: the numpy RandomState
    :return: a df with missing values
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    def _insert_random_null(x):
        """
        Choose a random column in a df row to null. This
        operates in-place. But it's on the copy, so it should be OK.

        :param x: a single row of the data frame
        """
        # -1 because the last column will always be y;
        # use .iloc for positional access, since the row's index holds column names
        x.iloc[random_state.randint(0, len(x) - 1)] = np.nan
        return x

    # truthy check: None, 0 and False all mean "insert nothing"
    if not percent_rows:
        return df
    else:
        # otherwise validate that it's a float
        percent_rows = assert_valid_percent(percent_rows, eq_upper=True)  # eq_lower not necessary because != 0.
        sample_index = df.sample(frac=percent_rows, random_state=random_state).index  # random sample of rows to null
        df.loc[sample_index] = df.loc[sample_index].apply(_insert_random_null, axis=1)
        return df
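A hedged usage sketch; the DataFrame contents and the 30% rate below are made up for illustration, and passing a plain seed works because get_random_state normalizes it internally:

# Illustrative usage -- data and missing rate are made up.
import numpy as np
import pandas as pd

df = pd.DataFrame({'x1': range(10), 'x2': np.arange(10) / 2., 'y': [0, 1] * 5})
dirty = insert_missing_values(df, 0.3, random_state=42)
print(dirty.isnull().sum())  # ~30% of rows now carry exactly one NaN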
Example #2
    def __init__(self, n_classes, weights, n_samples, output_dir, random_state=None):
        # initializer of the _ImageNet downloader (instantiated in Example #7)
        self.ilsvrc_synsets = self.get_ilsvrc_1000_synsets()
        self.random_state = get_random_state(random_state)
        self.chosen_synsets = self.random_state.choice(self.ilsvrc_synsets, n_classes, replace=False)
        self.n_samples = n_samples
        self.output_dir = output_dir
        self.weights = weights
Example #3
def create_regression_dataset(n_samples, n_features, n_informative, effective_rank, tail_strength,
                              noise, random_state=None):
    """
    Creates a regression dataset

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param n_targets: The number of regression targets, i.e., the dimension of the y output vector associated with a sample. By default, the output is a scalar.
    :param effective_rank: approximate number of singular vectors required to explain data
    :param tail_strength: relative importance of the fat noisy tail of the singular values profile
    :param noise: standard deviation of the gaussian noise applied to the output
    :param random_state: the numpy RandomState
    :return: the requested dataframe
    """
    random_state = get_random_state(random_state)
    X, y = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                           n_targets=1, effective_rank=effective_rank, tail_strength=tail_strength,
                           noise=noise, random_state=random_state)

    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the Y
    df['y'] = y
    return df
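An illustrative call, with made-up parameter values, just to show the call shape and the resulting frame:

# Values below are arbitrary, not recommendations.
df = create_regression_dataset(n_samples=500, n_features=10, n_informative=4,
                               effective_rank=None, tail_strength=0.5,
                               noise=0.1, random_state=42)
print(df.shape)  # (500, 11): 10 renamed feature columns plus 'y'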
Example #4
def create_classification_dataset(n_samples, n_features, n_informative, n_redundant, n_repeated,
                                  n_clusters_per_class, weights, n_classes, random_state=None):
    """
    Creates a binary classifier dataset

    :param n_samples: number of observations
    :param n_features: number of  features
    :param n_informative: number of informative features
    :param n_redundant: number of multicolinear
    :param n_repeated:  number of perfect collinear features
    :param n_clusters_per_class:  gaussian clusters per class
    :param weights: list of class balances, e.g. [.5, .5]
    :param n_classes: the number of class levels
    :param random_state: the numpy RandomState
    :return: the requested dataframe
    """
    random_state = get_random_state(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative,
                               n_redundant=n_redundant, n_repeated=n_repeated,
                               n_clusters_per_class=n_clusters_per_class, weights=weights,
                               # draw the per-feature scales from the seeded RandomState;
                               # np.random would ignore random_state and break reproducibility
                               scale=(random_state.rand(n_features) * 10), n_classes=n_classes,
                               random_state=random_state)
    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the Y
    df['y'] = y
    return df
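An illustrative call, again with made-up values:

# Values below are arbitrary, not recommendations.
df = create_classification_dataset(n_samples=500, n_features=10, n_informative=4,
                                   n_redundant=2, n_repeated=0, n_clusters_per_class=2,
                                   weights=[.7, .3], n_classes=2, random_state=42)
print(df['y'].value_counts())  # roughly a 70/30 class split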
Example #5
def insert_special_char(character, df, random_state=None):
    """
    Chooses a column to reformat as currency or percentage, including a $ or % string, to make cleaning harder

    :param character: either $ or %
    :param df: the dataframe we're operating on
    :param random_state: the numpy RandomState
    :return: A dataframe with a single column chosen at random converted to a % or $ format
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    # choose a numeric column at random that isn't y (this also skips
    # columns already mangled by earlier operations, which are no longer numeric)
    chosen_col = random_state.choice([col for col in df.select_dtypes(include=['number']).columns if col != 'y'])

    # assert that character is a string and that it's in ('$', '%')
    assert_is_type(character, six.string_types)
    if character not in ('$', '%'):
        raise ValueError('expected `character` to be in ("$", "%"), but got {0}'.format(character))

    # do scaling first:
    df[chosen_col] = (df[chosen_col] - df[chosen_col].mean()) / df[chosen_col].std()

    # do the specific div/mul operations
    # ("==" not "is": "is" tests object identity and is unreliable for strings)
    if character == "$":
        # multiply by 1000 and prepend a $
        df[chosen_col] = (df[chosen_col] * 1000).round(decimals=2).map(lambda x: "$" + str(x))
    else:  # character == "%"
        # divide by 100 and append a %
        df[chosen_col] = (df[chosen_col] / 100).round(decimals=2).map(lambda x: str(x) + "%")

    return df
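A short usage sketch, chaining onto the regression builder from Example #3 (values are illustrative):

# Illustrative chaining of the helpers above.
df = create_regression_dataset(n_samples=100, n_features=5, n_informative=3,
                               effective_rank=None, tail_strength=0.5,
                               noise=0.0, random_state=42)
df = insert_special_char('$', df, random_state=42)
print(df.select_dtypes(include=['object']).head())  # one column is now '$'-prefixed strings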
Example #6
def create_categorical_features(df,
                                label_list,
                                random_state=None,
                                label_name='y'):
    """
    Creates random categorical variables

    :param df: data frame we're operation on
    :param label_list: A list of lists, each list is the labels for one categorical variable
    :param random_state: the numpy RandomState
    :param label_name: the column name of rht label, if any. Default is 'y'
    :return: A modified dataframe

    Example:

    create_categorical_features(df, [['a','b'], ['red','blue']])

    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get numeric columns ONCE so we don't have to do it every time we loop:
    numer_cols = [
        col for col in df.select_dtypes(include=['number']).columns
        if col != label_name
    ]

    for i in range(n_categorical):
        # we might be out of numerical columns!
        if not numer_cols:
            break

        # choose a random numeric column that isn't y
        chosen_col = random_state.choice(numer_cols)
        # remove the chosen column so it can't be picked again
        numer_cols.remove(chosen_col)

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col],
                                bins=len(label_list[i]),
                                labels=label_list[i])

    return df
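The core mechanism is just pd.cut on the chosen column; a standalone sketch of that step with made-up data:

# Standalone illustration of the pd.cut binning used above.
import pandas as pd

s = pd.Series([0.1, 0.4, 0.9, 1.5, 2.2])
print(pd.cut(s, bins=2, labels=['low', 'high']))
# values in the lower half of the range become 'low', the rest 'high'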
Example #7
def make_image_dataset(config=None):
    """
    Creates an image dataset based on the passed configuration

    :param config: a configuration dictionary, or None if called from the command line
    :return: None
    """
    if config is None:
        # called from the command line so parse configuration
        args = parse_args(sys.argv[1:])
        config = load_config(args['config'])

    random_state = get_random_state(config["random_seed"])

    if config["image_source"] == "imagenet":
        _ImageNet(n_classes=config["n_classes"],
                  weights=config["weights"],
                  n_samples=config["n_samples"],
                  output_dir=config["out_path"],
                  random_state=random_state).get_images()

    elif config["image_source"] == "openimages":
        print("Not yet supported. The only image_source currently supported is 'imagenet'")

    elif config["image_source"] == "googlesearch":
        print("Not yet supported. The only image_source currently supported is 'imagenet'")

    else:
        print(config["image_source"], "is not a supported image_source")
        print("The only image_source currently supported is 'imagenet'")
Example #8
import os  # used directly below; don't rely on the star import to re-export it
import shutil

from snape.make_image_dataset import *
from snape.make_image_dataset import _ImageNet, _ImageGrabber
from snape.utils import get_random_state
from nose.tools import assert_raises

conf = {
    "n_classes": 2,
    "n_samples": 11,
    "out_path": "./test_images/",
    "weights": [.8, .2],
    "image_source": "imagenet",
    "random_seed": 42
}

random_state = get_random_state(conf["random_seed"])


def test_make_image_dataset():
    os.mkdir(conf["out_path"])
    try:
        make_image_dataset(conf)
        sub_dir = conf["out_path"] + os.listdir(conf["out_path"])[0]
        print("SUBDIR:", sub_dir)
        n_images = len(os.listdir(sub_dir))
        class1_size = int(conf["n_samples"] * conf["weights"][0])
        assert class1_size == n_images, "Did not download n images"
        assert len(
            os.listdir(conf["out_path"])
        ) == conf["n_classes"], "Did not produce the specified # of classes"
    except:
        raise
    finally:
        # the snippet is truncated here in the source listing; cleaning up
        # the test directory is the evident purpose of the shutil import above
        shutil.rmtree(conf["out_path"])
Example #9
def make_dataset(config=None):
    """
    Creates a machine learning dataset based on command line arguments passed

    :param config: a configuration dictionary, or None if called from the command line
    :return: None
    """

    if config is None:
        # called from the command line so parse configuration
        args = parse_args(sys.argv[1:])
        config = load_config(args['config'])

    print('-' * 80)
    # avoid multiple lookups; fails with a KeyError if 'type' is not present
    c_type = config['type']
    if c_type not in ('regression', 'classification'):
        raise ValueError('type must be in ("regression", "classification"), '
                         'but got %s' % c_type)
    reg = c_type == 'regression'

    # get defaults - these are the defaults from sklearn.
    def _safe_get_with_default(cfg, key, default):
        if key not in cfg:
            print("Warning: %s not in configuration, defaulting to %r" %
                  (key, default))
            return default
        return cfg[key]

    n_samples = _safe_get_with_default(config, 'n_samples', 100)
    n_features = _safe_get_with_default(
        config, 'n_features',
        20 if not reg else 100)  # diff defaults in sklearn
    n_informative = _safe_get_with_default(
        config, 'n_informative',
        2 if not reg else 10)  # diff defaults in sklearn
    n_redundant = _safe_get_with_default(config, 'n_redundant', 2)
    n_repeated = _safe_get_with_default(config, 'n_repeated', 0)
    n_clusters_per_class = _safe_get_with_default(config,
                                                  'n_clusters_per_class', 2)
    weights = _safe_get_with_default(config, 'weights', None)
    n_classes = _safe_get_with_default(config, 'n_classes', 2)
    effective_rank = _safe_get_with_default(config, 'effective_rank', None)
    tail_strength = _safe_get_with_default(config, 'tail_strength', 0.5)
    noise = _safe_get_with_default(config, 'noise', 0.)
    seed = _safe_get_with_default(config, 'random_seed', 42)
    shuffle = _safe_get_with_default(config, 'shuffle', True)

    # get the random state
    random_state = get_random_state(seed)

    # create the base dataset
    if not reg:
        print('Creating Classification Dataset...')
        df = create_classification_dataset(
            n_samples=n_samples,
            n_features=n_features,
            n_informative=n_informative,
            n_redundant=n_redundant,
            n_repeated=n_repeated,
            n_clusters_per_class=n_clusters_per_class,
            weights=weights,
            n_classes=n_classes,
            random_state=random_state,
            shuffle=shuffle)

    else:  # elif c_type == 'regression':
        print('Creating Regression Dataset...')
        df = create_regression_dataset(n_samples=n_samples,
                                       n_features=n_features,
                                       n_informative=n_informative,
                                       effective_rank=effective_rank,
                                       tail_strength=tail_strength,
                                       noise=noise,
                                       random_state=random_state,
                                       shuffle=shuffle)

    # use safe lookups to avoid KeyErrors
    label_list = _safe_get_with_default(config, 'label_list', None)
    do_categorical = label_list is not None and len(label_list) > 0

    if do_categorical:
        print("Creating Categorical Features...")

        df = create_categorical_features(df,
                                         label_list,
                                         random_state=random_state)

    # insert entropy
    insert_dollar = _safe_get_with_default(config, 'insert_dollar', "No")
    insert_percent = _safe_get_with_default(config, 'insert_percent', "No")

    if any(entropy == "Yes" for entropy in (insert_dollar, insert_percent)):
        print("Inserting Requested Entropy...")

        # add $ or % column if requested
        if insert_dollar == "Yes":
            df = insert_special_char('$', df, random_state=random_state)
        if insert_percent == "Yes":
            df = insert_special_char('%', df, random_state=random_state)

    # insert missing values
    pct_missing = _safe_get_with_default(config, 'pct_missing', None)
    df = insert_missing_values(df, pct_missing, random_state=random_state)

    # Convert dataset to star schema if requested
    star_schema = _safe_get_with_default(config, 'star_schema', "No")
    outpath = _safe_get_with_default(config, 'out_path', "." + os.path.sep)
    if star_schema == "Yes":
        # Check the number of categorical variables
        if do_categorical:
            df = make_star_schema(df, outpath)
        else:
            print(
                "No categorical variables added. Dataset cannot be transformed into a star schema. "
                "Dataset will be generated as a single-table dataset...")

    print("Writing Train/Test Datasets")
    write_dataset(df, _safe_get_with_default(config, 'output', 'my_dataset'),
                  outpath)
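A hedged example of a config dict make_dataset can consume; every key below is one the function actually reads above, with illustrative values:

# All keys are read by make_dataset above; the values are made up.
conf = {
    'type': 'classification',
    'n_samples': 1000,
    'n_features': 20,
    'n_informative': 5,
    'n_classes': 2,
    'weights': [.6, .4],
    'label_list': [['a', 'b'], ['red', 'blue']],
    'insert_dollar': "Yes",
    'insert_percent': "No",
    'pct_missing': 0.1,
    'star_schema': "No",
    'random_seed': 42,
    'out_path': './',
    'output': 'my_dataset'
}
make_dataset(conf)  # unspecified keys fall back to the printed defaults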
Example #10
import os

import pandas as pd  # used directly below; don't rely on the star import to re-export it
from numpy.testing import assert_almost_equal

from snape.score_dataset import *
from snape.utils import get_random_state, assert_valid_percent

random_state = get_random_state(42)
y_rand = random_state.rand(200)

r = {'y': y_rand * 10, 'y_hat': y_rand * 10 - y_rand}

regression_df = pd.DataFrame(r)
c = {
    'y': [1, 1, 1, 1, 0, 0, 0, 0],
    'y_hat': [1, 0.9, 0.4, 0.95, 0, 0.1, 0.6, 0.15]
}

classification_df = pd.DataFrame(c)
m = {
    'y': [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
    'y_hat': [0, 1, 0, 1, 1, 3, 1, 2, 2, 3, 2, 3]
}

multiclass_df = pd.DataFrame(m)


def test_guess_problem_type():
    # a bare "assert expr, msg" always passes when expr is truthy (the string
    # is just the assertion message), so compare explicitly instead
    assert guess_problem_type(regression_df['y']) == 'regression'
    assert guess_problem_type(classification_df['y']) == 'binary'
    assert guess_problem_type(multiclass_df['y']) == 'multiclass'
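The test exercises guess_problem_type without showing it; a minimal sketch of how such a heuristic could look (an assumption inferred from the fixtures above, not snape's actual implementation):

# Hypothetical heuristic -- NOT snape's real guess_problem_type.
def guess_problem_type_sketch(y):
    n_unique = y.nunique()
    if n_unique == 2:
        return 'binary'
    # a handful of integer-valued levels suggests a multiclass label
    if n_unique <= 20 and (y == y.round()).all():
        return 'multiclass'
    return 'regression'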

Example #11
import pytest

from snape.utils import get_random_state


def test_random_state_fails(x):
    # x is supplied by the test harness (e.g. a parametrize decorator
    # elided from this snippet)
    with pytest.raises(TypeError):
        get_random_state(x)
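Every example above funnels its seed through get_random_state; a minimal sketch of what such a helper typically does, inferred from usage here (None, an int seed, or a RandomState are accepted; anything else raises TypeError), not snape's verbatim source:

# Hypothetical sketch of get_random_state, inferred from how it is used above.
import numpy as np


def get_random_state_sketch(random_state=None):
    if random_state is None:
        return np.random.RandomState()
    if isinstance(random_state, (int, np.integer)):
        return np.random.RandomState(random_state)
    if isinstance(random_state, np.random.RandomState):
        return random_state
    raise TypeError("cannot seed a RandomState with %r" % random_state)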