def test_transform_1d_behavior():
    X = np.arange(4)
    est = KBinsDiscretizer(n_bins=2)
    assert_raises(ValueError, est.fit, X)

    est = KBinsDiscretizer(n_bins=2)
    est.fit(X.reshape(-1, 1))
    assert_raises(ValueError, est.transform, X)
def test_transform_outside_fit_range(strategy):
    X = np.array([0, 1, 2, 3])[:, None]
    kbd = KBinsDiscretizer(n_bins=4, strategy=strategy, encode='ordinal')
    kbd.fit(X)

    X2 = np.array([-2, 5])[:, None]
    X2t = kbd.transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.min(axis=0), [0])
def test_inverse_transform(strategy):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = kbd.fit_transform(X)
    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)

    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(Xt, X2t)
def test_fit_transform_n_bins_array(strategy, expected):
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='ordinal',
                           strategy=strategy).fit(X)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_
    n_features = np.array(X).shape[1]
    assert est.bin_edges_.shape == (n_features, )
    for bin_edges, n_bins in zip(est.bin_edges_, est.n_bins_):
        assert bin_edges.shape == (n_bins + 1, )
def test_percentile_numeric_stability():
    X = np.array([0.05, 0.05, 0.95]).reshape(-1, 1)
    bin_edges = np.array([0.05, 0.23, 0.41, 0.59, 0.77, 0.95])
    Xt = np.array([0, 0, 4]).reshape(-1, 1)

    kbd = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    msg = ("Bins whose width are too small (i.e., <= 1e-8) in feature 0 "
           "are removed. Consider decreasing the number of bins.")
    assert_warns_message(UserWarning, msg, kbd.fit, X)
    assert_array_almost_equal(kbd.bin_edges_[0], bin_edges)
    assert_array_almost_equal(kbd.transform(X), Xt)
def test_overwrite():
    X = np.array([0, 1, 2, 3])[:, None]
    X_before = X.copy()

    est = KBinsDiscretizer(n_bins=3, encode="ordinal")
    Xt = est.fit_transform(X)
    assert_array_equal(X, X_before)

    Xt_before = Xt.copy()
    Xinv = est.inverse_transform(Xt)
    assert_array_equal(Xt, Xt_before)
    assert_array_equal(Xinv, np.array([[0.5], [1.5], [2.5], [2.5]]))
def test_nonuniform_strategies(strategy, expected_2bins, expected_3bins):
    X = np.array([0, 1, 2, 3, 9, 10]).reshape(-1, 1)

    # with 2 bins
    est = KBinsDiscretizer(n_bins=2, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_2bins, Xt.ravel())

    # with 3 bins
    est = KBinsDiscretizer(n_bins=3, strategy=strategy, encode='ordinal')
    Xt = est.fit_transform(X)
    assert_array_equal(expected_3bins, Xt.ravel())
def test_same_min_max(strategy):
    warnings.simplefilter("always")
    X = np.array([[1, -2],
                  [1, -1],
                  [1, 0],
                  [1, 1]])
    est = KBinsDiscretizer(strategy=strategy, n_bins=3, encode='ordinal')
    assert_warns_message(UserWarning,
                         "Feature 0 is constant and will be replaced "
                         "with 0.", est.fit, X)
    assert est.n_bins_[0] == 1

    # replace the feature with zeros
    Xt = est.transform(X)
    assert_array_equal(Xt[:, 0], np.zeros(X.shape[0]))
def test_inverse_transform(strategy, encode):
    X = np.random.RandomState(0).randn(100, 3)
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    X2 = kbd.inverse_transform(Xt)
    X2t = kbd.fit_transform(X2)
    if encode == 'onehot':
        assert_array_equal(Xt.todense(), X2t.todense())
    else:
        assert_array_equal(Xt, X2t)
    if 'onehot' in encode:
        Xt = kbd._encoder.inverse_transform(Xt)
        X2t = kbd._encoder.inverse_transform(X2t)

    assert_array_equal(Xt.max(axis=0) + 1, kbd.n_bins_)
    assert_array_equal(X2t.max(axis=0) + 1, kbd.n_bins_)
def test_encode_options():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='ordinal').fit(X)
    Xt_1 = est.transform(X)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot-dense').fit(X)
    Xt_2 = est.transform(X)
    assert not sp.issparse(Xt_2)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=False)
                       .fit_transform(Xt_1), Xt_2)
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot-dense' "
                         "instead.", est.inverse_transform, Xt_2)
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3],
                           encode='onehot').fit(X)
    Xt_3 = est.transform(X)
    assert sp.issparse(Xt_3)
    assert_array_equal(OneHotEncoder(
                           categories=[np.arange(i) for i in [2, 3, 3, 3]],
                           sparse=True)
                       .fit_transform(Xt_1).toarray(),
                       Xt_3.toarray())
    assert_raise_message(ValueError, "inverse_transform only supports "
                         "'encode = ordinal'. Got encode='onehot' "
                         "instead.", est.inverse_transform, Xt_2)
train = train[train['c_charge_degree'] != "O"]
# We filtered the underlying data from Broward county to include only those
# rows representing people who had either recidivated in two years, or had at
# least two years outside of a correctional facility.
train = train[train['score_text'] != 'N/A']

train = train.replace('Medium', "Low")
test = test.replace('Medium', "Low")

train_labels = label_binarize(train['score_text'], classes=['High', 'Low'])
test_labels = label_binarize(test['score_text'], classes=['High', 'Low'])

impute_and_onehot = Pipeline([
    ('imputer1', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
impute_and_bin = Pipeline([
    ('imputer2', SimpleImputer(strategy='mean')),
    ('discretizer', KBinsDiscretizer(n_bins=4, encode='ordinal',
                                     strategy='uniform'))
])

compas_featurizer = ColumnTransformer(transformers=[
    ('impute1_and_onehot', impute_and_onehot, ['is_recid']),
    ('impute2_and_bin', impute_and_bin, ['age'])
])
compas_pipeline = Pipeline([
    ('features', compas_featurizer),
    ('classifier', LogisticRegression())
])

compas_pipeline.fit(train, train_labels.ravel())
print(compas_pipeline.score(test, test_labels.ravel()))
ax.set_title("Input data", size=14) xx, yy = np.meshgrid( np.linspace(X[:, 0].min(), X[:, 0].max(), 300), np.linspace(X[:, 1].min(), X[:, 1].max(), 300)) grid = np.c_[xx.ravel(), yy.ravel()] ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max()) ax.set_xticks(()) ax.set_yticks(()) i += 1 # transform the dataset with KBinsDiscretizer for strategy in strategies: enc = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy=strategy) enc.fit(X) grid_encoded = enc.transform(grid) ax = plt.subplot(len(X_list), len(strategies) + 1, i) # horizontal stripes horizontal = grid_encoded[:, 0].reshape(xx.shape) ax.contourf(xx, yy, horizontal, alpha=.5) # vertical stripes vertical = grid_encoded[:, 1].reshape(xx.shape) ax.contourf(xx, yy, vertical, alpha=.5) ax.scatter(X[:, 0], X[:, 1], edgecolors='k') ax.set_xlim(xx.min(), xx.max()) ax.set_ylim(yy.min(), yy.max())
def test_inverse_transform(strategy, encode, expected_inv):
    kbd = KBinsDiscretizer(n_bins=3, strategy=strategy, encode=encode)
    Xt = kbd.fit_transform(X)
    Xinv = kbd.inverse_transform(Xt)
    assert_array_almost_equal(expected_inv, Xinv)
def bucketize_y(y, num_buckets):
    binner = KBinsDiscretizer(n_bins=num_buckets, encode='ordinal')
    cols = y.columns
    y = binner.fit_transform(y)
    y = pd.DataFrame(data=y, columns=cols)
    return y
for cat in categorical_features:
    numerical_columns.remove(cat)
for tcat in time_columns:
    numerical_columns.remove(tcat)
for bcat in bin_features:
    numerical_columns.remove(bcat)

transformed_cols = [f"{col}_transformed" for col in time_columns]

timefeat = FunctionTransformer(func=make_time_features,
                               kw_args=dict(columns=time_columns),
                               validate=False)
# droptimefeat = FunctionTransformer(func=drop_time_columns,
#                                    kw_args=dict(columns=time_columns),
#                                    validate=False)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
bin_transformer = Pipeline(steps=[("bin_transformer", KBinsDiscretizer())])
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

print(time_columns)
print(time_columns + transformed_cols)

preprocessor = ColumnTransformer(transformers=[
    ('timefeat', timefeat, time_columns),
    ('num', numeric_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_features),
    ('bin', bin_transformer, bin_features),
])

train1 = preprocessor.fit_transform(train.iloc[:, :-1])
test1 = preprocessor.transform(test)
def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000,
                          test_size=0.2):
    """
    An example of creating the Criteo dataset.
    :param file: the dataset's path
    :param embed_dim: the embedding dimension of sparse features
    :param read_part: whether to read only part of the file
    :param sample_num: the number of instances if read_part is True
    :param test_size: ratio of the test dataset
    :return: feature columns, train, test
    """
    names = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
             'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
             'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15',
             'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24',
             'C25', 'C26']

    if read_part:
        data_df = pd.read_csv(file, sep='\t', iterator=True, header=None,
                              names=names)
        data_df = data_df.get_chunk(sample_num)
    else:
        data_df = pd.read_csv(file, sep='\t', header=None, names=names)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    features = sparse_features + dense_features

    data_df[sparse_features] = data_df[sparse_features].fillna('-1')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Bin continuous data into intervals.
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    for feat in sparse_features:
        le = LabelEncoder()
        data_df[feat] = le.fit_transform(data_df[feat])

    # ==============Feature Engineering===================
    # ====================================================
    feature_columns = [sparseFeature(feat, int(data_df[feat].max()) + 1,
                                     embed_dim=embed_dim)
                       for feat in features]

    train, test = train_test_split(data_df, test_size=test_size)

    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)
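# A hypothetical usage sketch of the function above; the file path
# 'criteo/train.txt' is an assumption and not part of the original code.
feature_columns, (train_X, train_y), (test_X, test_y) = create_criteo_dataset(
    'criteo/train.txt', embed_dim=8, read_part=True, sample_num=100000)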
def __init__(self, n_bins=15):
    self.n_bins = n_bins
    self.binarizer = KBinsDiscretizer(n_bins=self.n_bins,
                                      encode='onehot-dense')
def test_invalid_strategy_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], strategy='invalid-strategy')
    assert_raise_message(ValueError, "Valid options for 'strategy' are "
                         "('uniform', 'quantile', 'kmeans'). "
                         "Got strategy='invalid-strategy' instead.",
                         est.fit, X)
def test_fit_transform(strategy, expected):
    est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy=strategy)
    est.fit(X)
    assert_array_equal(expected, est.transform(X))
def get_new_base_enc():
    return [
        KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
        for _ in range(LatLongScalarEnc.cont_dim)
    ]
def test_invalid_encode_option():
    est = KBinsDiscretizer(n_bins=[2, 3, 3, 3], encode='invalid-encode')
    assert_raise_message(ValueError, "Valid options for 'encode' are "
                         "('onehot', 'onehot-dense', 'ordinal'). "
                         "Got encode='invalid-encode' instead.",
                         est.fit, X)
def get_new_base_enc():
    return KBinsDiscretizer(n_bins=8, encode='ordinal', strategy='quantile')
# Selecting the unique regions in alphabetical order
regioes = countries['Region'].sort_values().unique()
regioes = list(regioes)
regioes

# # Data Analysis: Question 2

# In[ ]:

from sklearn.preprocessing import KBinsDiscretizer

# In[31]:

# Applying the KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=10, encode="ordinal",
                               strategy="quantile")
discretizer.fit(countries[['Pop_density']])

# Getting an array with the transformed data
score_bins = discretizer.transform(countries[["Pop_density"]])
score_bins

# In[32]:

# Finding the 90th percentile
q_90 = np.quantile(score_bins, 0.9)
q_90

# In[33]:

# Counting how many values are above the 90th percentile
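# A minimal sketch of the count announced above (an assumption, since the
# original cell is cut off): with 10 quantile bins the top bin has label 9,
# so values at or above q_90 are the countries in the highest decile.
int((score_bins >= q_90).sum())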
def test_invalid_n_bins_array():
    # Bad shape
    n_bins = np.full((2, 4), 2.)
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Incorrect number of features
    n_bins = [1, 2, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = r"n_bins must be a scalar or array of shape \(n_features,\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Bad bin values
    n_bins = [1, 2, 2, 1]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 3. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)

    # Float bin values
    n_bins = [2.1, 2, 2.1, 2]
    est = KBinsDiscretizer(n_bins=n_bins)
    err_msg = ("KBinsDiscretizer received an invalid number of bins "
               "at indices 0, 2. Number of bins must be at least 2, "
               "and must be an int.")
    with pytest.raises(ValueError, match=err_msg):
        est.fit_transform(X)
print(bin_cols)

# Now we move on to preprocessing the dataset
si_cat_step = ('si1', SimpleImputer(strategy='constant', fill_value='MISSING'))
ohe_cat_step = ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))
cat_steps = [si_cat_step, ohe_cat_step]
cat_pipe = Pipeline(cat_steps)

si_num_step = ('si2', SimpleImputer(strategy='mean'))
ss_num_step = ('ss', StandardScaler())
num_steps = [si_num_step, ss_num_step]
num_pipe = Pipeline(num_steps)

si_bin_step = ('si3', SimpleImputer(strategy='median'))
kb_bin_step = ('kb', KBinsDiscretizer(encode='onehot-dense'))
bin_steps = [si_bin_step, kb_bin_step]
bin_pipe = Pipeline(bin_steps)

transformers = [('cat', cat_pipe, cat_cols),
                ('num', num_pipe, num_cols),
                ('bin', bin_pipe, bin_cols)]
ct = ColumnTransformer(transformers=transformers)

Z = ct.fit_transform(X)
print(Z.shape)
"""
Now that our dataset is preprocessed, we will run two ML algorithms for our
regression: 1) a neural network and 2) a random forest. In both cases the
hyperparameter tuning will be done with Grid Search and Cross Validation.
For the models with the best parameters, the R2 metric will be used to decide
which model best predicts employee salaries.
"""
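# A minimal sketch of the tuning step described above, under assumptions: the
# target vector is called y, RandomForestRegressor stands in for the models
# mentioned, and the grid values are illustrative, not from the original script.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=0),
    param_grid={'n_estimators': [100, 300], 'max_depth': [None, 10, 30]},
    scoring='r2',  # R2 is the metric used to compare the models
    cv=5)
rf_grid.fit(Z, y)  # Z is the preprocessed matrix built above
print(rf_grid.best_params_, rf_grid.best_score_)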
def test_valid_n_bins():
    KBinsDiscretizer(n_bins=2).fit_transform(X)
    KBinsDiscretizer(n_bins=np.array([2])[0]).fit_transform(X)
    assert KBinsDiscretizer(n_bins=2).fit(X).n_bins_.dtype == np.dtype(np.int)
def eval_cv(ims_save_name, cnn_save_name, features, load_ECG, ims_loaded_vars,
            cnn_loaded_vars, ims_save_dir, cnn_save_dir, device, conf_thresh,
            k, use_svm, norm, prune):
    params = ims_loaded_vars["params"]
    seed = params.seed
    np.random.seed(seed)
    n_splits = params.cv_splits
    n_repeats = params.cv_repeats

    cnn_params = cnn_loaded_vars["params"]
    use_norm = True if hasattr(cnn_params, "use_norm") and cnn_params.use_norm else False
    batch_size = cnn_params.batch_size

    print("{:>40} {:d}".format("Cross validation splits:", n_splits))
    print("{:>40} {:d}".format("Cross validation repeats:", n_repeats))

    ims_x = features[:, :13]
    ims_y = features[:, 13:15]
    raw_x = load_ECG['raw_x']
    target = torch.tensor(load_ECG['target'])

    fft_x0 = scipy.fftpack.fft(raw_x[:, 0].numpy())
    fft_x0 = np.abs(fft_x0[:, :raw_x.shape[2] // 2])
    fft_x1 = scipy.fftpack.fft(raw_x[:, 1].numpy())
    fft_x1 = np.abs(fft_x1[:, :raw_x.shape[2] // 2])
    nf1 = np.mean(fft_x0, axis=-1)
    nf2 = np.mean(fft_x1, axis=-1)
    nf3 = np.max(raw_x[:, 0].numpy(), axis=-1)  # / 11
    nf4 = np.max(raw_x[:, 1].numpy(), axis=-1)  # / 11
    nf5 = np.min(raw_x[:, 0].numpy(), axis=-1)  # / 11
    nf6 = np.min(raw_x[:, 1].numpy(), axis=-1)  # / 15
    ims_x = np.append(ims_x, np.transpose([nf1, nf2, nf3, nf4, nf5, nf6]), axis=1)
    # ims_x = ims_x[:,[0, 2, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18]]  # stable
    ims_x = ims_x[:, [0, 2, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17]]  # mid, last - 512

    # plt.plot(fft_x[802])
    # plt.scatter(np.mean(fft_x0, axis=-1), np.mean(fft_x1, axis=-1), c=target)
    # plt.show()
    # exit()

    assert (ims_y[:, 1] == target.numpy()).all()

    data_tag = load_ECG['data_tag']
    raw_feat = raw_x.shape[1]
    raw_size = raw_x.shape[2]
    num_classes = len(np.unique(target))

    # rel_y = predict_reliability(ims_x, ims_y[:,1], k)
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                   random_state=seed)

    ims_tp = np.zeros(n_splits * n_repeats)
    ims_fp = np.zeros(n_splits * n_repeats)
    ims_acc = np.zeros(n_splits * n_repeats)
    cnn_tp = np.zeros_like(ims_tp)
    cnn_fp = np.zeros_like(ims_fp)
    cnn_acc = np.zeros_like(ims_acc)
    nums_total = np.zeros_like(ims_tp)
    nums_pos = np.zeros_like(nums_total)
    nums_neg = np.zeros_like(nums_total)
    nums_cnn = np.zeros_like(nums_total)
    nums_pos_cnn = np.zeros_like(nums_total)
    nums_neg_cnn = np.zeros_like(nums_total)
    conf = np.ones_like(nums_cnn)

    three_class = True
    use_pca = False
    use_tree = True
    rf_size = 10
    rf_seed = 1
    rf_depth = np.empty(0)
    rf_params = np.empty(0)

    # ims_x = ims_x[:,12]
    # ims_x = ims_x.reshape(-1,1)
    # ims_x = np.log(ims_x + 1)
    # pca = PCA()
    # ims_x = pca.fit_transform(ims_x)
    # df = pd.DataFrame(ims_x)
    # scatter_matrix(df, diagonal="kde", alpha=0.2, c=ims_y[:,1])
    # plt.show()
    # # for feat in range(ims_x.shape[1]):
    # #     plt.subplot(7, 2, feat + 1)
    # #     plt.plot(ims_x[:,feat])
    # # plt.show()
    # exit()

    # selector = SelectFromModel(RandomForestClassifier(random_state=rf_seed, n_estimators=rf_size, ccp_alpha=1.1e-4), threshold=-np.inf, max_features=13)
    # feat_ctr = Counter()
    maxint = 12800  # np.iinfo(np.uint32).max
    quant = KBinsDiscretizer(n_bins=maxint, encode="ordinal", strategy="kmeans")

    for cv_idx, (trn_idx, tst_idx) in enumerate(rskf.split(ims_x, ims_y[:, 1])):
        # trn_idx, val_idx = train_test_split(trn_idx, test_size=len(tst_idx), stratify=target[trn_idx], random_state=seed)
        val_idx = None
        cv_save = "{}{}".format(ims_save_name[:-1], cv_idx)

        x_trn = ims_x[trn_idx]
        y_trn = ims_y[trn_idx]
        # rel_trn = predict_reliability(x_trn, y_trn[:,1], k)
        x_tst = ims_x[tst_idx]
        y_tst = ims_y[tst_idx]

        if use_pca:
            pca = PCA()
            pca.fit(x_trn)
            x_trn = pca.transform(x_trn)
            x_tst = pca.transform(x_tst)

        if norm:
            m_trn = x_trn.mean(axis=0)
            v_trn = x_trn.std(axis=0)
            x_trn = (x_trn - m_trn) / v_trn
            x_tst = (x_tst - m_trn) / v_trn

        # rel_trn = predict_reliability_simplified(x_trn, y_trn[:,1], x_tst, k)
        if three_class:
            if use_tree:
                rel_trn = predict_3_class(x_trn, y_trn[:, 1], k)
            else:
                rel_trn = predict_3_class_simplified(x_trn, y_trn[:, 1], x_tst, k)
        else:
            rel_trn = y_trn[:, 1]

        nums_total[cv_idx] = len(tst_idx)
        nums_pos[cv_idx] = (y_tst[:, 1] == 1).sum()
        nums_neg[cv_idx] = (y_tst[:, 1] == 0).sum()

        if len(rel_trn) > 0:
            if use_svm:
                svm = train_svm(x_trn, rel_trn)
                rel_mask = svm.predict(x_tst).astype(bool)
            elif use_tree:
                # dt = tree.DecisionTreeClassifier(random_state=rf_seed, max_depth=7)
                selected_trn = x_trn
                selected_tst = x_tst
                # selected_trn = selector.fit_transform(x_trn, rel_trn)
                # selected_tst = selector.transform(x_tst)
                # feat_ctr.update(np.where(selector.get_support())[0])
                # selected_trn = quant.fit_transform(selected_trn)
                # selected_tst = quant.transform(selected_tst)
                # selected_trn = (selected_trn * 100000000).astype(np.int32)
                # selected_tst = (selected_tst * 100000000).astype(np.int32)
                # print(selected_tst)
                # exit()
                # print(np.where(selector.get_support()))
                # print("before: {}, after: {}".format(x_tst.shape, selected_tst.shape))
                dt = RandomForestClassifier(random_state=rf_seed,
                                            n_estimators=rf_size,
                                            ccp_alpha=4.0e-4, max_depth=30)
                dt.fit(selected_trn, rel_trn)
                # temp = dt.estimators_[0].tree_.threshold.astype(np.int32)
                # dt.estimators_[0].tree_.threshold[:] = temp
                internal = [[estimator.tree_.feature,
                             estimator.tree_.threshold,
                             estimator.tree_.children_left,
                             estimator.tree_.children_right,
                             np.argmax(estimator.tree_.value[estimator.tree_.feature == -2][:, 0], axis=-1)]
                            for estimator in dt.estimators_]
                # tree_summary(dt.estimators_[0])
                # print(internal)
                # print(dt.estimators_[0].tree_.children_right)
                # print(np.argmax(dt.estimators_[0].tree_.value[dt.estimators_[0].tree_.feature == -2][:,0], axis=-1))
                # print(dt.estimators_[0].tree_.node_count)
                # print(help(sklearn.tree._tree.Tree))
                # exit()
                # dump(internal, open("000_rf/3c_rf{}_nf_norm_k2_cv{}.p".format(rf_size, cv_idx), "wb"))
                rf_params = np.append(rf_params, np.sum(
                    [estimator.tree_.node_count for estimator in dt.estimators_]))
                rf_depth = np.append(rf_depth,
                    [estimator.tree_.max_depth for estimator in dt.estimators_])
                # rf_depth = dt.tree_.max_depth
                order = get_rf_order(dt, selected_trn, rel_trn, "pred")
                pred = predict_rf_sorted(dt, selected_tst, order)
                # pred = dt.predict(selected_tst)
                rel_mask = pred != 2
                rel_trn = pred
            else:
                # rel_mask = predict_reliability(x_trn, rel_trn, k-1, x_tst=x_tst)
                if three_class:
                    rel_mask = rel_trn != 2
                else:
                    knn = KNeighborsClassifier(n_neighbors=k)
                    knn.fit(x_trn, rel_trn)
                    rel_trn = knn.predict(x_tst)
                    rel_mask = np.ones_like(rel_trn).astype(bool)

            x_rel = x_tst[rel_mask]
            y_rel = y_tst[rel_mask]

            if len(x_rel) > 0:
                # ims_tp[cv_idx], ims_fp[cv_idx], ims_acc[cv_idx], below_thresh = fraunhofer_test.evaluate(x_rel, y_rel, cv_save, ims_save_dir, conf_thresh=conf_thresh, print_results=False)
                ims_tp[cv_idx] = ((rel_trn == y_tst[:, 1]) & (rel_trn == 1)).sum().item()
                ims_fp[cv_idx] = ((rel_trn != y_tst[:, 1]) & (rel_trn == 1)).sum().item()
                ims_acc[cv_idx] = (rel_trn == y_tst[:, 1]).astype(int).sum().item()

            tst_idx = tst_idx[np.invert(rel_mask)]

        nums_cnn[cv_idx] = len(tst_idx)
        nums_pos_cnn[cv_idx] = (ims_y[tst_idx, 1] == 1).sum()
        nums_neg_cnn[cv_idx] = (ims_y[tst_idx, 1] == 0).sum()

        if len(tst_idx) == 0:
            continue

        jobs = 0
        orig_device = None
        if device.type == "cuda":
            gpu_mem = torch.cuda.get_device_properties(device).total_memory
            data_size = sys.getsizeof(raw_x.storage()) + sys.getsizeof(target.storage())
            if data_size >= gpu_mem * 0.85:  # 85% of total memory, just a guess
                jobs = os.cpu_count()
                orig_device = device
                device = torch.device("cpu")

        ecg_datasets = create_datasets_cv(raw_x, target, trn_idx, val_idx,
                                          tst_idx, use_norm, device)
        trn_dl, val_dl, tst_dl = create_loaders(ecg_datasets, bs=batch_size,
                                                jobs=jobs)
        if orig_device:
            device = orig_device

        cv_save = "{}{}".format(cnn_save_name[:-1], cv_idx)
        model = torch.load(os.path.join(cnn_save_dir,
                                        "train_" + cv_save + '_best.pth'),
                           map_location=device)
        if prune > 0:
            model = pruning.prune_fc(model, prune)

        (cnn_tp[cv_idx], _), (cnn_fp[cv_idx], _), cnn_acc[cv_idx], _, _ = \
            evaluation.evaluate(model, tst_dl, tst_idx, data_tag,
                                device=device, slide=False, print_results=False)

    # cnn_tp = nums_pos_cnn
    # cnn_fp = nums_neg_cnn
    # cnn_acc = nums_pos_cnn
    # flops, params = get_model_complexity_info(model, (raw_feat, raw_size), as_strings=False, print_per_layer_stat=False)
    # print("{:>40} {:.2f} seconds".format("Mean elapsed test time:", elapsed.mean()))

    nums_ims = nums_total - nums_cnn
    nums_pos_ims = nums_pos - nums_pos_cnn
    nums_neg_ims = nums_neg - nums_neg_cnn

    # # IMS-only
    # acc = ims_acc / nums_ims
    # tp = ims_tp / nums_pos_ims
    # fp = ims_fp / nums_neg_ims

    # # CNN-only
    # acc = cnn_acc / nums_cnn
    # tp = cnn_tp / nums_pos_cnn
    # fp = cnn_fp / nums_neg_cnn

    # Full
    acc = (ims_acc + cnn_acc) / nums_total
    tp = (ims_tp + cnn_tp) / nums_pos
    fp = (ims_fp + cnn_fp) / nums_neg
    conf = conf - (nums_cnn / nums_total)

    # print("{:>40} {:.2%}".format("Total data labeled as reliable:", rel_y.sum() / ims_y.shape[0]))
    # print("{:>40} {}".format("Best Features:", sorted([x[0] for x in feat_ctr.most_common(13)])))
    if (nums_ims != nums_total).any():
        print("{:>40} {:.2%}".format("Min IMS-net data:", conf.min()))
        print("{:>40} {:.2%}".format("Max IMS-net data:", conf.max()))
        print("{:>40} {:.2%}".format("Mean IMS-net data:", conf.mean()))
        print("{:>40} {:.2%}".format("IMS-net data standard deviation:", conf.std()))

    print("{:>40} {:.2%}".format("Min test accuracy:", acc.min()))
    print("{:>40} {:.2%}".format("Max test accuracy:", acc.max()))
    print("{:>40} {:.2%}".format("Mean test accuracy:", acc.mean()))
    print("{:>40} {:.2%}".format("Test accuracy standard deviation:", acc.std()))
    print("{:>40} {:.2%}".format("Min TP rate:", np.nanmin(tp)))
    print("{:>40} {:.2%}".format("Max TP rate:", np.nanmax(tp)))
    print("{:>40} {:.2%}".format("Mean TP rate:", np.nanmean(tp)))
    print("{:>40} {:.2%}".format("TP rate standard deviation:", np.nanstd(tp)))
    print("{:>40} {:.2%}".format("Min FP rate:", fp.min()))
    print("{:>40} {:.2%}".format("Max FP rate:", fp.max()))
    print("{:>40} {:.2%}".format("Mean FP rate:", fp.mean()))
    print("{:>40} {:.2%}".format("FP rate standard deviation:", fp.std()))
    if use_tree:
        print("{:>40} {:.0f}".format("Min RF params:", rf_params.min()))
        print("{:>40} {:.0f}".format("Max RF params:", rf_params.max()))
        print("{:>40} {:.2f}".format("Mean RF params:", rf_params.mean()))
        print("{:>40} {:.0f}".format("Min RF max_depth:", rf_depth.min()))
        print("{:>40} {:.0f}".format("Max RF max_depth:", rf_depth.max()))
        print("{:>40} {:.2f}".format("Mean RF max_depth:", rf_depth.mean()))
    print("{:>40} {}".format("Min TP > 90+std:", tp.min() > 0.9 + tp.std()))
    print("{:>40} {}".format("Mean TP > 90+4*std:", tp.mean() > 0.9 + (4 * tp.std())))
    print("{:>40} {}".format("Max FP < 20-std:", fp.max() < 0.2 - tp.std()))
    print("{:>40} {}".format("Mean FP < 20-4*std:", fp.mean() < 0.2 - (4 * fp.std())))
    # print('{:>40} {:d}'.format('Number of parameters:', params))
    # print('{:>40} {:.0f}'.format('Computational complexity:', flops))

    df = pd.DataFrame({"Total-Acc": acc * 100,
                       "Total-TP": tp * 100,
                       "Total-FP": fp * 100,
                       "IMS-Acc": ims_acc / nums_ims * 100,
                       "IMS-TP": ims_tp / nums_pos_ims * 100,
                       "IMS-FP": ims_fp / nums_neg_ims * 100,
                       "CNN-Acc": cnn_acc / nums_cnn * 100,
                       "CNN-TP": cnn_tp / nums_pos_cnn * 100,
                       "CNN-FP": cnn_fp / nums_neg_cnn * 100})
# %% [markdown]
# We recall that a way of accelerating the gradient boosting is to reduce the
# number of splits considered within the tree building. One way is to bin the
# data before giving them to the gradient boosting. A transformer called
# `KBinsDiscretizer` performs such a transformation. Thus, we can pipeline
# this preprocessing with the gradient boosting.
#
# We can first demonstrate the transformation done by the `KBinsDiscretizer`.

# %%
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=256, encode="ordinal",
                               strategy="quantile")
data_trans = discretizer.fit_transform(data)
data_trans

# %% [markdown]
# ```{note}
# The code cell above will generate a couple of warnings. Indeed, for some of
# the features, we requested too many bins with regard to the data dispersion
# for those features. The smallest bins will be removed.
# ```
# We see that the discretizer transforms the original data into integers.
# Each integer represents the bin index when the partitioning by quantile is
# performed. We can check the number of bins per feature.

# %%
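# A minimal sketch of that check (the original notebook cell is not shown, so
# this is an assumption): count the distinct values per transformed column,
# i.e. the number of bins actually kept for each feature.
[len(np.unique(col)) for col in data_trans.T]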
def q2():
    kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    pop_density_discrete = kbins.fit_transform(
        countries['Pop_density'].values.reshape(-1, 1))
    return int((pop_density_discrete == 9).sum())
def __init__(self, **hyperparams):
    self._hyperparams = hyperparams
    self._wrapped_model = Op(**self._hyperparams)
df.drop_duplicates(inplace=True)
df.replace(class_labels, [0, 1], inplace=True)

negative_examples, positive_examples = np.bincount(df["income"])

split_ratio = args.train_test_split_ratio
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("income", axis=1), df["income"],
    test_size=split_ratio, random_state=0)

preprocess = make_column_transformer(
    (["age", "num persons worked for employer"],
     KBinsDiscretizer(encode="onehot-dense", n_bins=10)),
    (["capital gains", "capital losses", "dividends from stocks"],
     StandardScaler()),
    (["education", "major industry code", "class of worker"],
     OneHotEncoder(sparse=False)),
)

train_features = preprocess.fit_transform(X_train)
test_features = preprocess.transform(X_test)

train_features_output_path = os.path.join("/opt/ml/processing/train",
                                          "train_features.csv")
train_labels_output_path = os.path.join("/opt/ml/processing/train",
                                        "train_labels.csv")
test_features_output_path = os.path.join("/opt/ml/processing/test",
# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with
# `KBinsDiscretizer`, using the `ordinal` encoding and the `quantile`
# strategy, how many countries are above the 90th percentile? Answer as a
# single integer scalar.

# In[93]:

from sklearn.preprocessing import (OneHotEncoder, Binarizer, KBinsDiscretizer,
                                   MinMaxScaler, StandardScaler,
                                   PolynomialFeatures)

# In[94]:

discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal',
                               strategy='quantile')
discretizer.fit(countries[["Pop_density"]])

# In[95]:

discretizer.bin_edges_[0]

# In[102]:

def q2():
    return countries["Pop_density"][
        countries.Pop_density >= discretizer.bin_edges_[0][9]].count()
q1()

# ## Question 2
#
# Discretizing the `Pop_density` variable into 10 intervals with
# `KBinsDiscretizer`, using the `ordinal` encoding and the `quantile`
# strategy, how many countries are above the 90th percentile? Answer as a
# single integer scalar.

# In[16]:

countries.head()

# In[17]:

from sklearn.preprocessing import KBinsDiscretizer

discretizador = KBinsDiscretizer(n_bins=10, encode='ordinal',
                                 strategy='quantile')
discretizador.fit(countries[['Pop_density']])
popDensity_disc = discretizador.transform(countries[['Pop_density']])
popDensity_disc

# In[18]:

np.unique(popDensity_disc.flatten())

# In[19]:

pd.Series(popDensity_disc.flatten()).value_counts()[9.0]
def preprocess_dataset(df, pipeline_path, load=False) -> (np.ndarray, np.ndarray):
    if load:
        with open(pipeline_path, "rb") as file:
            components = pickle.load(file)

    # 1) target columns: engagement flags
    reply = df["reply_timestamp"].notnull().astype(int).to_numpy()
    retweet = df["retweet_timestamp"].notnull().astype(int).to_numpy()
    retweet_with_comment = df["retweet_with_comment_timestamp"].notnull().astype(int).to_numpy()
    like = df["like_timestamp"].notnull().astype(int).to_numpy()
    response = np.column_stack((reply, retweet, retweet_with_comment, like))

    # 2) categorical tweet features
    if load:
        language = components['language_encoder'].transform(
            df["language"].to_numpy().reshape(-1, 1))
        tweet_type = components["tweet_type_encoder"].transform(
            df["tweet_type"].to_numpy().reshape(-1, 1))
        present_media = components["present_media_encoder"].transform(
            df["present_media"])
    else:
        language_encoder = OneHotEncoder()
        language = language_encoder.fit_transform(
            df["language"].to_numpy().reshape(-1, 1))
        tweet_type_encoder = OneHotEncoder()
        tweet_type = tweet_type_encoder.fit_transform(
            df["tweet_type"].to_numpy().reshape(-1, 1))
        present_media_encoder = MultiLabelBinarizer(sparse_output=False)
        present_media = present_media_encoder.fit_transform(df["present_media"])
    tweet_features = sp.hstack([language, tweet_type, present_media])

    # 3) text tokens
    if load:
        text_tokens = components["text_tfidf"].transform(df['text_tokens'])
    else:
        text_tfidf = TfidfVectorizer()
        text_tokens = text_tfidf.fit_transform(df['text_tokens'])

    # 4) hashtags
    if load:
        hashtags = components["hashtags_tfidf"].transform(df['hashtags'])
    else:
        hashtags_tfidf = TfidfVectorizer()
        hashtags = hashtags_tfidf.fit_transform(df['hashtags'])
    # NOTE: this overwrites the encoded features built in step 2.
    tweet_features = sp.hstack((text_tokens, hashtags))  # NOT np.vstack

    # 5) hashed ids, discretized into 50 bins
    if load:
        df['tweet_id'] = df["tweet_id"].map(hash)
        df['engaged_with_user_id'] = df["engaged_with_user_id"].map(hash)
        df['engaging_user_id'] = df["engaging_user_id"].map(hash)
        tweet_id = components["tweet_discretizer"].transform(
            df['tweet_id'].to_numpy().reshape(-1, 1))
        engaged_with_user_id = components["engaged_with_user_discretizer"].transform(
            df['engaged_with_user_id'].to_numpy().reshape(-1, 1))
        engaging_user_id = components["engaging_user_discretizer"].transform(
            df['engaging_user_id'].to_numpy().reshape(-1, 1))
    else:
        df['tweet_id'] = df["tweet_id"].map(hash)
        df['engaged_with_user_id'] = df["engaged_with_user_id"].map(hash)
        df['engaging_user_id'] = df["engaging_user_id"].map(hash)
        tweet_discretizer = KBinsDiscretizer(n_bins=50)
        tweet_id = tweet_discretizer.fit_transform(
            df['tweet_id'].to_numpy().reshape(-1, 1))
        engaged_with_user_discretizer = KBinsDiscretizer(n_bins=50)
        engaged_with_user_id = engaged_with_user_discretizer.fit_transform(
            df['engaged_with_user_id'].to_numpy().reshape(-1, 1))
        engaging_user_discretizer = KBinsDiscretizer(n_bins=50)
        engaging_user_id = engaging_user_discretizer.fit_transform(
            df['engaging_user_id'].to_numpy().reshape(-1, 1))
    id_features = sp.hstack([tweet_id, engaged_with_user_id, engaging_user_id])

    # 6) boolean user features
    engaged_with_user_is_verified = df["engaged_with_user_is_verified"].astype(int).to_numpy()
    engaging_user_is_verified = df["engaging_user_is_verified"].astype(int).to_numpy()
    engaged_follows_engaging = df["engaged_follows_engaging"].astype(int).to_numpy()
    boolean_features = np.column_stack([engaged_with_user_is_verified,
                                        engaging_user_is_verified,
                                        engaged_follows_engaging])

    # 7) presence of links and domains
    present_links = df["present_links"].notnull().astype(int).to_numpy()
    present_domains = df["present_domains"].notnull().astype(int).to_numpy()
    present_features = np.column_stack([present_links, present_domains])

    X_train = sp.hstack(
        [tweet_features, id_features, boolean_features, present_features])

    if not load:
        components = {
            "language_encoder": language_encoder,
            "tweet_type_encoder": tweet_type_encoder,
            "present_media_encoder": present_media_encoder,
            "text_tfidf": text_tfidf,
            "hashtags_tfidf": hashtags_tfidf,
            "tweet_discretizer": tweet_discretizer,
            "engaged_with_user_discretizer": engaged_with_user_discretizer,
            "engaging_user_discretizer": engaging_user_discretizer
        }
        with open(pipeline_path, "wb") as file:
            pickle.dump(components, file)

    return X_train, response
# Preprocess data
# ##########################################
if norm_target == 1:
    # Target normalization for continuous values
    target_np = scale(target_np)

if norm_features == 1:
    # Feature normalization for continuous values
    data_np = scale(data_np)

if binning == 1:
    # Discretize the target variable with KBinsDiscretizer. The strategy here
    # is important: 'quantile' creates equally populated bins, while 'kmeans'
    # probably gives more valid "clusters".
    enc = KBinsDiscretizer(n_bins=[bin_cnt], encode='ordinal',
                           strategy='quantile')
    target_np_bin = enc.fit_transform(target_np.reshape(-1, 1))

    # Get bin min/max
    temp = [[] for x in range(bin_cnt + 1)]
    for i in range(len(target_np)):
        for j in range(bin_cnt):
            if target_np_bin[i] == j:
                temp[j].append(target_np[i])

    for j in range(bin_cnt):
        print('Bin', j, ':', min(temp[j]), max(temp[j]), len(temp[j]))
    print('\n')

    # Convert the target array back to the correct shape
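# A hedged illustration of the strategy comment above (not from the original
# script): compare the bin edges produced by 'quantile' and 'kmeans' on the
# same continuous target.
enc_q = KBinsDiscretizer(n_bins=[bin_cnt], encode='ordinal', strategy='quantile')
enc_k = KBinsDiscretizer(n_bins=[bin_cnt], encode='ordinal', strategy='kmeans')
enc_q.fit(target_np.reshape(-1, 1))
enc_k.fit(target_np.reshape(-1, 1))
print('quantile edges:', enc_q.bin_edges_[0])
print('kmeans edges:  ', enc_k.bin_edges_[0])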
def transform_amounts(amounts: List[float],
                      discretizer: KBinsDiscretizer) -> List[str]:
    amounts = discretizer.transform([[x] for x in amounts])
    # unpack and convert float -> int -> str
    amounts = list(map(str, (map(int, chain(*amounts)))))
    return amounts
    name = estimator.__class__.__name__
    if name == 'Pipeline':
        name = [get_name(est[1]) for est in estimator.steps]
        name = ' + '.join(name)
    return name


# list of (estimator, param_grid), where param_grid is used in GridSearchCV
classifiers = [
    (LogisticRegression(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (LinearSVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LogisticRegression(random_state=0)), {
        'kbinsdiscretizer__n_bins': np.arange(2, 10),
        'logisticregression__C': np.logspace(-2, 7, 10),
    }),
    (make_pipeline(KBinsDiscretizer(encode='onehot'),
                   LinearSVC(random_state=0)), {
        'kbinsdiscretizer__n_bins': np.arange(2, 10),
        'linearsvc__C': np.logspace(-2, 7, 10),
    }),
    (GradientBoostingClassifier(n_estimators=50, random_state=0), {
        'learning_rate': np.logspace(-4, 0, 10)
    }),
    (SVC(random_state=0), {
        'C': np.logspace(-2, 7, 10)
    }),
def test_invalid_n_features():
    est = KBinsDiscretizer(n_bins=3).fit(X)
    bad_X = np.arange(25).reshape(5, -1)
    assert_raise_message(ValueError,
                         "Incorrect number of features. Expecting 4, "
                         "received 5", est.transform, bad_X)
data_2 = data.copy()

from sklearn.preprocessing import Binarizer
X = data_2.iloc[:, 0].values.reshape(-1, 1)  # the class works on features only, so a 1-D array cannot be used
transformer = Binarizer(threshold=30).fit_transform(X)
data_2.iloc[:, 0] = transformer
data_2.head()

# In[]:
# Binning and encoding done in a single step:
from sklearn.preprocessing import KBinsDiscretizer

X = data.iloc[:, 0].values.reshape(-1, 1)

# ordinal encoding, equal width
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')  # equal-width binning
res = est.fit_transform(X)
# inspect the bins after the transformation: the column is now split into three bins
print(set(res.ravel()))
unique_label, counts_label = np.unique(res, return_counts=True)
print(counts_label / len(res))

# ordinal encoding, equal frequency/depth
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')  # equal-frequency (equal-depth) binning
res = est.fit_transform(X)
# inspect the bins after the transformation: the column is now split into three bins
print(set(res.ravel()))
unique_label, counts_label = np.unique(res, return_counts=True)
print(counts_label / len(res))

# one-hot encoding (the default)
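# A minimal sketch of the one-hot case announced above (an assumption, the
# original cell is cut off). With encode='onehot' the transformer returns a
# sparse matrix, so it is densified for printing.
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
print(est.fit_transform(X).toarray())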
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

data = pd.read_csv(
    r"E:\python_workspace\python_scripts\data\feature\Narrativedata.csv",
    index_col=0)
data.loc[:, "Age"] = data.loc[:, "Age"].fillna(data.loc[:, "Age"].median())

data2 = data.copy()
data2.iloc[:, 0].fillna(0)
print(data2.iloc[:, 0])

X = data2.iloc[:, 0].values.reshape(-1, 1)

est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
res = est.fit_transform(X)
print(res)

est2 = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
print(est2.fit_transform(X).toarray())
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")