def test_dataframe_raises(self):
    df = pd.DataFrame({"A": ["a", "a", "b"]}, dtype="category")
    dpp.LabelEncoder().fit(df)  # OK
    df["other"] = ["a", "b", "c"]
    with pytest.raises(ValueError):
        dpp.LabelEncoder().fit(df)
def test_unseen_raises_array(self):
    enc = dpp.LabelEncoder().fit(y)
    new = da.from_array(np.array(["a", "a", "z"]), chunks=2)
    result = enc.transform(new)
    with pytest.raises(ValueError):
        result.compute()
def test_categorical(self, categories, transformed, daskify, ordered):
    cat = pd.Series(
        ["a", "b", "a"],
        dtype=pd.api.types.CategoricalDtype(categories=categories, ordered=ordered),
    )
    if daskify:
        cat = dd.from_pandas(cat, npartitions=2)
        transformed = da.from_array(transformed, chunks=(2, 1))
        if daskify == "unknown":
            cat = cat.cat.as_unknown()

    a = dpp.LabelEncoder().fit(cat)
    if daskify != "unknown":
        assert a.dtype_ == cat.dtype
    np.testing.assert_array_equal(a.classes_, categories)

    result = a.transform(cat)
    da.utils.assert_eq(result, transformed)

    inv_transformed = a.inverse_transform(result)
    if daskify:
        # manually set the divisions for the test
        inv_transformed.divisions = (0, 2)
    dd.utils.assert_eq(inv_transformed, cat)
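# The test above exercises dask-ml's categorical fast path: with a known
# categorical dtype, classes_ can come from the dtype's categories instead of
# a full pass over the data. A minimal sketch, assuming dask_ml is installed:
import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import LabelEncoder

s = dd.from_pandas(pd.Series(["a", "b", "a"], dtype="category"), npartitions=2)
enc = LabelEncoder().fit(s)
print(enc.classes_)  # taken from the categorical dtype: ['a' 'b']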
def build_bow_model(training, testing):
    vectorizer = CountVectorizer()
    encoder = preprocessing.LabelEncoder()

    print("Converting to Dask Bags...")
    X_train_db = db.from_sequence(training['X_trn'], npartitions=NUMBER_OF_CPU)
    X_test_db = db.from_sequence(testing['X_tst'], npartitions=NUMBER_OF_CPU)

    print("Building BoW...")
    X_model = vectorizer.fit(X_train_db)
    X_train = X_model.transform(X_train_db)
    X_test = X_model.transform(X_test_db)

    print("Indexing strings...")
    y_model = encoder.fit(training['y_trn'])
    y_train = y_model.transform(training['y_trn'])
    y_test = y_model.transform(testing['y_tst'])

    print("Computing chunks...")
    compute_chunks(X_train, y_train, X_test, y_test)

    print("Re-converting to Dask Array...")
    Xtrain, Xtest = convert_X_data(X_train, X_test)
    return Xtrain, y_train, Xtest, y_test
def main():
    # client = Client("tcp://127.0.0.1:64958")
    client = Client(processes=False, threads_per_worker=2,
                    n_workers=1, memory_limit='4GB')
    print(client)

    rs = RandomSearcher(get_space_num_cat_pipeline_complex,
                        optimize_direction=OptimizeDirection.Maximize)
    hk = HyperGBM(rs, task='classification', reward_metric='accuracy',
                  cache_dir=f'{test_output_dir}/hypergbm_cache',
                  callbacks=[SummaryCallback(),
                             FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs')])

    df = dsutils.load_bank_by_dask()
    df = df.drop(['id'], axis=1)  # drop() is not in-place; assign the result
    df['y'] = dm_pre.LabelEncoder().fit_transform(df['y'])
    # df = df.sample(frac=0.1)
    # object_columns = [i for i, v in df.dtypes.items() if v == 'object']
    # for c in object_columns:
    #     df[c] = df[c].astype('category')
    # df = df.categorize(object_columns)

    X_train, X_test = train_test_split(df, test_size=0.8, random_state=42)
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    hk.search(X_train, y_train, X_test, y_test, max_trails=50)
    print('-' * 30)

    best_trial = hk.get_best_trail()
    print(f'best_trial:{best_trial}')

    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    score = estimator.predict(X_test)
    result = estimator.evaluate(X_test, y_test, metrics=['accuracy', 'auc', 'logloss'])
    print(f'final result:{result}')
def compute_class_weight(class_weight, *, classes, y):
    if not DaskToolBox.is_dask_object(y):
        return sk_utils.class_weight.compute_class_weight(class_weight, classes=classes, y=y)

    y = DaskToolBox.make_chunk_size_known(y)
    if set(dask.compute(da.unique(y))[0]) - set(classes):
        raise ValueError("classes should include all valid labels that can be in y")

    if class_weight == 'balanced':
        # Find the weight of each class as present in y.
        le = dm_pre.LabelEncoder()
        y_ind = le.fit_transform(y)
        # if not all(np.in1d(classes, le.classes_)):
        #     raise ValueError("classes should have valid labels that are in y")
        # recip_freq = len(y) / (len(le.classes_) *
        #                        np.bincount(y_ind).astype(np.float64))
        # weight = recip_freq[le.transform(classes)]
        y_shape, y_ind_bincount, le_classes_ = dask.compute(
            y.shape, da.bincount(y_ind), le.classes_)
        if not all(np.in1d(classes, le_classes_)):
            raise ValueError("classes should have valid labels that are in y")
        recip_freq = y_shape[0] / (len(le_classes_) * y_ind_bincount.astype(np.float64))
        weight = recip_freq[np.searchsorted(le_classes_, classes)]
    else:
        raise ValueError("Only class_weight == 'balanced' is supported.")

    return weight
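# A quick, hypothetical usage of compute_class_weight with a dask label
# array; the expected values follow the len(y) / (n_classes * bincount)
# formula in the 'balanced' branch:
import numpy as np
import dask.array as da

y = da.from_array(np.array([0, 0, 0, 1]), chunks=2)
weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y)
print(weights)  # 4 / (2 * 3) = 0.667 for class 0, 4 / (2 * 1) = 2.0 for class 1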
def _fit_array(self, X, y=None):
    n_features = X.shape[1]
    for n in range(n_features):
        le = dm_pre.LabelEncoder()
        le.fit(X[:, n])
        self.encoders[n] = le
    return self
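# _fit_array stores one LabelEncoder per column. A plausible counterpart for
# the transform step could look like this sketch; the name _transform_array
# and the NumPy-only input are assumptions, not from the source:
import numpy as np

def _transform_array(self, X):
    columns = [self.encoders[n].transform(X[:, n]) for n in range(X.shape[1])]
    return np.stack(columns, axis=1)  # (n_samples, n_features) of int codes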
def test_basic(self):
    a = dpp.LabelEncoder()
    b = spp.LabelEncoder()
    a.fit(y)
    b.fit(y.compute())
    assert_estimator_equal(a, b)
def test_basic(self):
    a = dpp.LabelEncoder()
    b = spp.LabelEncoder()
    a.fit(y)
    b.fit(y.compute())
    exclude = {"dtype_"}
    assert_estimator_equal(a, b, exclude=exclude)
def test_transform(self, array):
    a = dpp.LabelEncoder()
    b = spp.LabelEncoder()
    a.fit(array)
    b.fit(array.compute())
    assert_eq_ar(a.transform(array).compute(), b.transform(array.compute()))
def test_input_types(self, dask_array, pandas_series):
    a = dpp.LabelEncoder()
    b = spp.LabelEncoder()
    assert_estimator_equal(a.fit(dask_array), b.fit(pandas_series))
    assert_estimator_equal(a.fit(pandas_series), b.fit(pandas_series))
    assert_estimator_equal(a.fit(pandas_series.values), b.fit(pandas_series))
    assert_estimator_equal(a.fit(dask_array), b.fit(pandas_series.values))
def to_parquet(sales_series, file_name, processed_dir, LOG):
    LOG.debug('Setting index')
    sales_series = sales_series.set_index(sales_series['id'])
    LOG.debug('Setting index - done')

    encoders = {}
    # TODO: dask supposedly does this on its own with sensible defaults
    # sales_series['parquet_partition'] = np.random.randint(0, 100, sales_series.shape[0])

    # this one is a dup of day_date_str which is harder to squeeze through
    # the rest of the pipeline (yay petastorm)
    if 'day_date' in sales_series.columns:
        LOG.debug(f"Dropping 'day_date' from {sales_series.columns}")
        sales_series = sales_series.drop(['day_date'], axis=1)

    for col in sales_series.columns:
        if col in encoders:
            LOG.debug(f'Skipping: {col} - already encoded')
            continue
        # petastorm can't read these
        if str(sales_series[col].dtype) == 'uint8':
            sales_series[col] = sales_series[col].astype('int')
        if str(sales_series[col].dtype) in ['category', 'object']:
            LOG.debug(f'Encoding: {col}')
            enc = dask_preprocessing.LabelEncoder()
            # enc = LabelEncoder()
            sales_series[col] = enc.fit_transform(sales_series[col])
            # TODO: update other transforms too!
            encoders[col] = enc

    for name, enc in encoders.items():
        LOG.debug(f"Saving encoder: {name}")
        np.save(f'{processed_dir}/{name}.npy', enc.classes_)

    # TODO: uint -> int, category/object -> int, day_date -> drop
    # TODO: this is being called on both dask and pandas data frames
    # and the args are rather not compatible :/
    parquet_file = f'{processed_dir}/{file_name}'
    LOG.debug(f"Saving {type(sales_series)} to {parquet_file}")
    kwargs = {}
    is_pandas_df = type(sales_series) == pd.DataFrame
    index_kwarg_name = 'index' if is_pandas_df else 'write_index'
    kwargs[index_kwarg_name] = False
    sales_series.to_parquet(
        parquet_file,
        **kwargs
        # partition_cols=['parquet_partition']
    )
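# Only classes_ is persisted above via np.save, so decoding later can reuse
# scikit-learn's LabelEncoder by assigning the loaded array back. A minimal
# sketch; the path and column name are illustrative assumptions:
import numpy as np
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.classes_ = np.load('processed/store_id.npy', allow_pickle=True)
# decoded = enc.inverse_transform(encoded_values)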
def test_use_categorical(self, daskify):
    data = pd.Series(["b", "c"],
                     dtype=pd.api.types.CategoricalDtype(["c", "a", "b"]))
    if daskify:
        data = dd.from_pandas(data, npartitions=2)
    a = dpp.LabelEncoder(use_categorical=False).fit(data)
    b = spp.LabelEncoder().fit(data)
    assert_estimator_equal(a, b, exclude={"dtype_"})
    assert a.dtype_ is None

    a_trn = a.transform(data)
    b_trn = b.transform(data)
    da.utils.assert_eq(a_trn, b_trn)
    da.utils.assert_eq(a.inverse_transform(a_trn), b.inverse_transform(b_trn))
def transform(data):
    for feature in cat_features:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    # note: only the encoder fitted for the last feature is returned
    return encoder, data
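# As flagged above, only the last feature's encoder survives the loop. A
# hedged sketch of a variant that keeps every fitted encoder, useful if
# inverse_transform is needed later (the name transform_keep_encoders is an
# assumption, not from the source):
def transform_keep_encoders(data):
    encoders = {}
    for feature in cat_features:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
        encoders[feature] = encoder
    return encoders, data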
def test_fit_transform_categorical(self):
    cat = dd.from_pandas(pd.Series(choices, dtype="category"), npartitions=4)
    result = dpp.LabelEncoder().fit_transform(cat)
    assert result.dtype == "int8"
    assert result.dtype == result.compute().dtype
def test_transform_dtypes(self, array):
    result = dpp.LabelEncoder().fit_transform(array)
    assert result.dtype == np.intp
    if dask.is_dask_collection(array):
        assert result.dtype == result.compute().dtype
import xgboost as xgb

cluster = LocalCluster(n_workers=16, threads_per_worker=1)
client = Client(cluster)

d_train = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/train-1m.csv")
d_test = pd.read_csv("https://s3.amazonaws.com/benchm-ml--main/test.csv")
d_all = pd.concat([d_train, d_test])
dx_all = dd.from_pandas(d_all, npartitions=16)

vars_cat = ["Month", "DayofMonth", "DayOfWeek", "UniqueCarrier", "Origin", "Dest"]
vars_num = ["DepTime", "Distance"]
for col in vars_cat:
    dx_all[col] = preprocessing.LabelEncoder().fit_transform(dx_all[col])

X_all = dx_all[vars_cat + vars_num].to_dask_array(lengths=True)
y_all = da.where((dx_all["dep_delayed_15min"] == "Y").to_dask_array(lengths=True), 1, 0)

X_train = X_all[0:d_train.shape[0]]
y_train = y_all[0:d_train.shape[0]]
X_test = X_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0])]
y_test = y_all[d_train.shape[0]:(d_train.shape[0] + d_test.shape[0])]

# persist() returns new collections; assign the results or nothing is kept
X_train = X_train.persist()
y_train = y_train.persist()
client.has_what()
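# The script imports xgboost but stops after persisting the data. A hedged
# continuation sketch using xgboost's built-in dask API (available since
# xgboost 1.0; the objective and num_boost_round here are assumptions):
dtrain = xgb.dask.DaskDMatrix(client, X_train, y_train)
output = xgb.dask.train(
    client,
    {"objective": "binary:logistic", "eval_metric": "auc"},
    dtrain,
    num_boost_round=100,
)
pred = xgb.dask.predict(client, output["booster"], X_test)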
def test_inverse_transform(self, array):
    a = dpp.LabelEncoder()
    assert_eq_ar(a.inverse_transform(a.fit_transform(array)), da.asarray(array))
def transform(data):
    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
           'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])
    # as in the earlier variant, only the last feature's encoder is returned
    return encoder, data