def test_label_encode_drop_one(): random.seed(0) np.random.seed(0) df = DataFrame() # initialize data frame df["cats"] = np.random.randint(7, size=10, dtype=np.int32) vals = list(df["cats"].unique()) # drop 1 randomly del vals[random.randrange(len(vals))] lab = dict(zip(vals, list(range(len(vals))))) # label encode series ncol = df["cats"].label_encoding(cats=vals, dtype="float32") arr = ncol.to_array() # verify labels of new column for i in range(arr.size): # assuming -1 is used for missing value np.testing.assert_equal(arr[i], lab.get(df.cats[i], -1)) # label encode data frame df2 = df.label_encoding(column="cats", prefix="cats", cats=vals, dtype="float32") assert df2.columns[0] == "cats" assert df2.columns[1] == "cats_labels"
def test_label_encode_float_output(): random.seed(0) np.random.seed(0) df = DataFrame() # initialize data frame df['cats'] = arr = np.random.randint(7, size=10, dtype=np.int32) cats = [1, 2, 3, 4] encoder = {c: i for i, c in enumerate(cats)} df2 = df.label_encoding(column='cats', prefix='cats', cats=cats, dtype=np.float32, na_sentinel=np.nan) got = df2['cats_labels'].to_array(fillna='pandas') handcoded = np.array([encoder.get(v, np.nan) for v in arr]) np.testing.assert_equal(got, handcoded)
def test_label_encode(nelem, dtype): df = DataFrame() np.random.seed(0) # initialize data frame df["cats"] = _random(nelem, dtype) vals = df["cats"].unique() lab = dict(zip(vals, range(len(vals)))) # label encode series ncol = df["cats"].label_encoding(cats=vals) arr = ncol.to_array() # verify labels of new column for i in range(arr.size): np.testing.assert_equal(arr[i], lab.get(df.cats[i], None)) # label encode data frame df2 = df.label_encoding(column="cats", prefix="cats", cats=vals) assert df2.columns[0] == "cats" assert df2.columns[1] == "cats_labels"