import pandas as pd
# NOTE(review): the original inline note here read "or `from unittest import ...`
# if on Python 3.4+", which presumably dates from a backport import — confirm
# whether it is still needed.
from unittest import TestCase
import tests.helpers as th
import numpy as np
import category_encoders as encoders

# Module-level fixtures built by the project's test helpers.
# The `*_t` variants are the 50-row counterparts created with `extras=True`.
np_X = th.create_array(n_rows=100)
np_X_t = th.create_array(n_rows=50, extras=True)

# Boolean labels: one random normal draw per row, thresholded at 0.5.
# No seed is set in the visible code, so the labels differ from run to run.
np_y = np.random.randn(np_X.shape[0]) > 0.5
np_y_t = np.random.randn(np_X_t.shape[0]) > 0.5

X = th.create_dataset(n_rows=100)
X_t = th.create_dataset(n_rows=50, extras=True)

# DataFrame versions of the label arrays above.
y = pd.DataFrame(np_y)
y_t = pd.DataFrame(np_y_t)


class TestWeightOfEvidenceEncoder(TestCase):
    """Tests for ``encoders.WOEEncoder``."""

    def test_woe(self):
        # Column names of interest.
        # NOTE(review): `cols` is not used in the visible part of this test —
        # presumably it is consumed further down; confirm before removing.
        cols = [
            'unique_str',
            'underscore',
            'extra',
            'none',
            'invariant',
            321,
            'categorical',
            'na_categorical',
            'categorical_int',
        ]

        # balanced label with balanced features:
        # three rows per category and three rows per label value
        X_balanced = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col1'])
        y_balanced = [True, False, True, False, True, False]

        # Fit an encoder with default settings and encode the training frame.
        enc = encoders.WOEEncoder()
        enc.fit(X_balanced, y_balanced)
        X1 = enc.transform(X_balanced)
        # NOTE(review): no assertion on `X1` is visible in this chunk — confirm
        # that the checks follow.
# loop times of benchmarking in every encoding, larger for more accurate but longer benchmarking time benchmark_repeat = 3 # sample num of data data_lines = 10000 # benchmarking result format result_cols = [ 'encoder', 'used_processes', 'X_shape', 'min_time(s)', 'average_time(s)', 'max_cpu_utilization(%)', 'average_cpu_utilization(%)' ] results = [] cpu_utilization = multiprocessing.Manager().Queue() # define data_set np_X = th.create_array(n_rows=data_lines) np_y = np.random.randn(np_X.shape[0]) > 0.5 X = th.create_dataset(n_rows=data_lines) X_t = th.create_dataset(n_rows=int(data_lines / 2), extras=True) cols = [ 'unique_str', 'underscore', 'extra', 'none', 'invariant', 321, 'categorical', 'na_categorical' ] def get_cpu_utilization(): """ new process for recording cpu utilization record cpu utilization every [cpu_sampling_rate] second & calculate its mean value the value is the cpu utilization during every encoding