Esempio n. 1
0
def test_get_dummies(data):
    """Compare cudf.get_dummies with pandas for a one-column frame.

    The pandas result is computed once and compared against two cudf
    results: one without an explicit dtype and one with ``dtype=np.uint8``.
    Dtypes are ignored in both comparisons.
    """
    gdf = DataFrame({"x": data})
    pdf = pd.DataFrame({"x": data})

    encoded_expected = pd.get_dummies(pdf, prefix="test")

    # First pass: no dtype argument; second pass: explicit uint8.
    for extra_kwargs in ({}, {"dtype": np.uint8}):
        encoded_actual = cudf.get_dummies(gdf, prefix="test", **extra_kwargs)
        utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
Esempio n. 2
0
def test_get_dummies(data, index):
    """Compare cudf.get_dummies with pandas for an explicitly indexed frame.

    The pandas result is computed once and compared against two cudf
    results: one without an explicit dtype and one with ``dtype=np.uint8``.
    Dtypes are compared only when ``data`` is non-empty.
    """
    gdf = DataFrame({"x": data}, index=index)
    pdf = pd.DataFrame({"x": data}, index=index)

    # Skip the dtype comparison for empty input.
    compare_dtypes = len(data) != 0

    encoded_expected = pd.get_dummies(pdf, prefix="test")

    # First pass: no dtype argument; second pass: explicit uint8.
    for extra_kwargs in ({}, {"dtype": np.uint8}):
        encoded_actual = cudf.get_dummies(gdf, prefix="test", **extra_kwargs)
        utils.assert_eq(
            encoded_expected,
            encoded_actual,
            check_dtype=compare_dtypes,
        )
Esempio n. 3
0
def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na):
    """Compare the ``dummy_na`` handling of cudf.get_dummies with pandas.

    The input column holds ``[0, 1, NaN]``. When both ``dummy_na`` and
    ``nan_as_null`` are truthy, the cudf column ``a_null`` is renamed to
    ``a_nan`` and the columns are reordered to the pandas order before the
    comparison.
    """
    pdf = pd.DataFrame({"a": [0, 1, np.nan]})
    df = DataFrame.from_pandas(pdf, nan_as_null=nan_as_null)

    expected = pd.get_dummies(pdf, dummy_na=dummy_na, columns=["a"])
    got = cudf.get_dummies(df, dummy_na=dummy_na, columns=["a"])

    if dummy_na and nan_as_null:
        # Align the cudf label for the missing-value column with pandas,
        # then select the columns in the pandas order.
        relabelled = got.rename(columns={"a_null": "a_nan"})
        got = relabelled[expected.columns]

    utils.assert_eq(expected, got)
Esempio n. 4
0
def test_onehot_get_dummies_multicol(n_cols):
    """Compare cudf.get_dummies with pandas on a multi-column frame.

    Builds up to ``n_cols`` columns named after consecutive lowercase
    letters, each holding ``np.arange(5)``, and compares the encodings
    while ignoring dtypes.
    """
    n_categories = 5
    # zip stops at the shorter input, so at most 26 columns are produced.
    data = {
        letter: np.arange(n_categories)
        for letter, _ in zip(ascii_lowercase, range(n_cols))
    }

    gdf = cudf.DataFrame(data)
    pdf = pd.DataFrame(data)

    encoded_expected = pd.get_dummies(pdf, prefix="test")
    encoded_actual = cudf.get_dummies(gdf, prefix="test")

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
Esempio n. 5
0
def test_get_dummies_array_like(data, prefix_sep, prefix, dtype):
    """Compare cudf.get_dummies with pandas for array-like input.

    ``data`` is passed to cudf as-is. For the pandas call, cudf Series and
    index objects are first converted with ``to_pandas()``; any other input
    is passed through unchanged.
    """
    dummies_kwargs = {
        "prefix": prefix,
        "prefix_sep": prefix_sep,
        "dtype": dtype,
    }
    cudf_result = cudf.get_dummies(data, **dummies_kwargs)

    needs_conversion = isinstance(data, (cudf.Series, cudf.BaseIndex))
    pandas_input = data.to_pandas() if needs_conversion else data

    pandas_result = pd.get_dummies(pandas_input, **dummies_kwargs)
    utils.assert_eq(cudf_result, pandas_result)
Esempio n. 6
0
def test_get_dummies_with_nan():
    """Check that cudf.get_dummies gives NaN and null separate columns.

    The input column is built with ``nan_as_null=False`` from
    ``[1, 2, NaN, None]``; with ``dummy_na=True`` the expected output has
    one uint8 indicator column per value plus ``a_nan`` and ``a_null``.
    """
    source_column = cudf.Series([1, 2, np.nan, None], nan_as_null=False)
    df = cudf.DataFrame({"a": source_column})

    indicator_columns = {
        "a_1.0": [1, 0, 0, 0],
        "a_2.0": [0, 1, 0, 0],
        "a_nan": [0, 0, 1, 0],
        "a_null": [0, 0, 0, 1],
    }
    expected = cudf.DataFrame(indicator_columns, dtype="uint8")

    actual = cudf.get_dummies(df, dummy_na=True, columns=["a"])

    utils.assert_eq(expected, actual)
Esempio n. 7
0
def test_get_dummies_array_like_with_nan():
    """Check cudf.get_dummies on a Series holding both a null and a NaN.

    The Series is built with ``nan_as_null=False`` from
    ``[0.1, 2, 3, None, NaN]``; with ``dummy_na=True`` the expected uint8
    frame lists ``a_null`` first, then the value columns, then ``a_nan``.
    """
    ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False)

    indicator_columns = {
        "a_null": [0, 0, 0, 1, 0],
        "a_0.1": [1, 0, 0, 0, 0],
        "a_2.0": [0, 1, 0, 0, 0],
        "a_3.0": [0, 0, 1, 0, 0],
        "a_nan": [0, 0, 0, 0, 1],
    }
    expected = cudf.DataFrame(indicator_columns, dtype="uint8")

    actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")

    utils.assert_eq(expected, actual)
Esempio n. 8
0
def test_get_dummies_prefix_sep(prefix, prefix_sep):
    """Compare cudf.get_dummies with pandas for prefix / prefix_sep options.

    Uses a three-column frame of strings and forwards the same ``prefix``
    and ``prefix_sep`` to both libraries; dtypes are ignored.
    """
    data = {
        "first": ["1", "2", "3"],
        "second": ["abc", "def", "ghi"],
        "third": ["ji", "ji", "ji"],
    }

    gdf = DataFrame(data)
    pdf = pd.DataFrame(data)

    naming_kwargs = {"prefix": prefix, "prefix_sep": prefix_sep}
    encoded_expected = pd.get_dummies(pdf, **naming_kwargs)
    encoded_actual = cudf.get_dummies(gdf, **naming_kwargs)

    utils.assert_eq(encoded_expected, encoded_actual, check_dtype=False)
    def one_hot_encoder(self, dummy_nas=None):
        """
        One-hot encode the columns listed in ``self.ohe_feats``.

        ``self.output_df`` is replaced by the result of ``cudf.get_dummies``
        applied to it.

        Params:
        - dummy_nas = True/False (defaults to False), passed to get_dummies
          as ``dummy_na`` to indicate whether NAs get their own dummy column.
          ``None`` (the default) is treated as False.
        """
        # Honour the flag's truthiness. The previous implementation mapped
        # every non-None value to True, so an explicit ``False`` still
        # enabled NA dummies, contradicting the documented True/False flag.
        dummy_nas = bool(dummy_nas)

        self.output_df = cudf.get_dummies(self.output_df,
                                          columns=self.ohe_feats,
                                          dummy_na=dummy_nas)
Esempio n. 10
0
def basic_feature_engineering(train, test, gpu=False):
    """
    Process a train and test set as per the basic feature engineering example.

    Steps, in order:
      1. Drop columns whose value reported by ``see_percent_missing_values``
         for ``train`` is >= 40 (presumably a percentage; confirm against
         that helper).
      2. Split ``TARGET`` off the training frame.
      3. Concatenate train and test, dummy-encode the bool / object /
         category columns, and fill remaining missing values.
      4. Split the unified frame back into train and test by row position.

    Args:
        train (dataframe): the training dataframe (should include TARGET)
        test (dataframe): the testing dataframe
        gpu (boolean): whether to use cudf (True) or pandas (False)

    Returns:
        train (dataframe): the processed train frame
        test (dataframe): the processed test frame
        train_target (dataframe): The training target column

    """

    if gpu:
        import cudf as dd
    else:
        import pandas as dd

    app_train_mis_values = see_percent_missing_values(train)
    df_app_train_miss_values = dd.DataFrame({
        'columns':
        app_train_mis_values.index,
        'missing percent':
        app_train_mis_values.values
    })

    mostly_missing = df_app_train_miss_values[
        df_app_train_miss_values['missing percent'] >= 40]['columns']
    # Branch on the ``gpu`` flag rather than on a ``cudf`` type: this
    # function only binds the library as ``dd``, so the name ``cudf`` is not
    # guaranteed to exist here.
    if gpu:
        drop_columns = mostly_missing.to_arrow().to_pylist()
    else:
        drop_columns = mostly_missing.tolist()

    train = train.drop(drop_columns, axis=1)
    test = test.drop(drop_columns, axis=1)
    train_target = train['TARGET']
    train = train.drop('TARGET', axis=1)

    # Remember where train ends so the unified frame can be split back
    # without relying on a hard-coded row count.
    n_train_rows = len(train)

    # here we will use a basic dummy treatment
    # we merged the dataframes first because when we dummify
    # we could have some columns only in train or only in test. Merging first will prevent this
    unified = dd.concat([train, test])
    dummy_cols = unified.select_dtypes(['bool', 'O',
                                        'category']).columns.tolist()
    unified = dd.get_dummies(unified, columns=dummy_cols, dtype='int64')

    # XGB for pandas does not like Int64
    for col in unified.select_dtypes('Int64').columns.tolist():
        unified[col] = unified[col].fillna(int(unified[col].mean()))
        unified[col] = unified[col].astype('int64')

    # Fill whatever is still missing with 0. The list of affected columns is
    # extracted per backend: pandas indexes have no ``to_arrow``.
    has_missing = unified.isna().any()
    columns_with_missing = has_missing[has_missing].index
    if gpu:
        columns_with_missing = columns_with_missing.to_arrow().to_pylist()
    else:
        columns_with_missing = columns_with_missing.tolist()
    for col in columns_with_missing:
        unified[col] = unified[col].fillna(0)

    train = unified[0:n_train_rows]
    test = unified[n_train_rows:]

    return train, test, train_target
Esempio n. 11
0
def ohe_gpu():
    """One-hot encode the ``grade`` column of ``loan_pdf`` with cudf.

    ``loan_pdf`` is read from the enclosing scope; it is not defined in
    this function.
    """
    # Build a one-column cudf frame holding only the ``grade`` values.
    tmpdf = cudf.DataFrame()
    tmpdf['grade'] = loan_pdf['grade']
    # NOTE(review): the result is bound to both ``x`` and ``ohe_gpu_df``;
    # no use of either name is visible in this view - confirm whether a
    # return or later use is intended.
    x = ohe_gpu_df = cudf.get_dummies(tmpdf)