Example no. 1
def test_one_hot_encoder_sparse():
    """Test OneHotEncoder's fit and transform."""
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder()
    # discover max values automatically
    X_trans = enc.fit_transform(X).toarray()
    assert_equal(X_trans.shape, (2, 5))
    assert_array_equal(enc.active_features_,
                       np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
    assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       [[0., 1., 0., 1., 1.],
                        [1., 0., 1., 0., 1.]])

    # maximum value specified explicitly (n_values=4, i.e. values 0..3 for every feature)
    enc = OneHotEncoder(n_values=4)
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 4 * 3))
    assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])

    # number of values specified per feature
    enc = OneHotEncoder(n_values=[3, 2, 2])
    X = [[1, 0, 1], [0, 1, 1]]
    X_trans = enc.fit_transform(X)
    assert_equal(X_trans.shape, (2, 3 + 2 + 2))
    assert_array_equal(enc.n_values_, [3, 2, 2])
    # check that transform works with a feature value larger than any seen
    # during fit (but still within n_values):
    X = np.array([[2, 0, 1], [0, 1, 1]])
    enc.transform(X)

    # test that an error is raised when out of bounds:
    X_too_large = [[0, 2, 1], [0, 1, 1]]
    assert_raises(ValueError, enc.transform, X_too_large)
    assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)

    # test that an error is raised for the wrong number of features
    assert_raises(ValueError, enc.transform, X[:, :-1])
    # test that an error is raised for the wrong number of features in fit
    # with prespecified n_values
    assert_raises(ValueError, enc.fit, X[:, :-1])
    # test exception on wrong init param
    assert_raises(TypeError, OneHotEncoder(n_values=np.int).fit, X)

    enc = OneHotEncoder()
    # test negative input to fit
    assert_raises(ValueError, enc.fit, [[0], [-1]])

    # test negative input to transform
    enc.fit([[0], [1]])
    assert_raises(ValueError, enc.transform, [[0], [-1]])
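
This test exercises the legacy count-based API: n_values, active_features_ and feature_indices_ were deprecated in scikit-learn 0.20 and removed in later releases. A minimal sketch of the same check against the current API (assuming scikit-learn 0.22 or newer; not part of the original test), where the fitted categories_ attribute replaces the legacy index attributes:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = [[3, 2, 1], [0, 1, 1]]
enc = OneHotEncoder()                      # categories are inferred per feature
X_trans = enc.fit_transform(X).toarray()   # sparse output by default

assert X_trans.shape == (2, 5)             # 2 + 2 + 1 categories over the 3 features
assert [c.tolist() for c in enc.categories_] == [[0, 3], [1, 2], [1]]
np.testing.assert_array_equal(X_trans, [[0., 1., 0., 1., 1.],
                                        [1., 0., 1., 0., 1.]])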
Example no. 2
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):
    """Load the UCI Credit Card default dataset from a CSV file.

    The first column is the sample id, the last column is the label
    (0 is remapped to -1); the columns in between are used as features.
    """
    X = []
    y = []
    sids = []

    with open(infile, "r") as fi:
        fi.readline()  # skip the header row
        reader = csv.reader(fi)
        for row in reader:
            sids.append(row[0])
            X.append(row[1:-1])
            y0 = int(row[-1])
            if y0 == 0:
                y0 = -1  # use {-1, 1} labels instead of {0, 1}
            y.append(y0)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

    # one-hot encode the categorical columns (columns 1-3 of X, i.e. SEX,
    # EDUCATION and MARRIAGE in the UCI layout)
    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
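
The categorical_features argument used above was deprecated in scikit-learn 0.20 and removed in later releases. A minimal sketch of the equivalent column-wise encoding with ColumnTransformer, reusing the column indices [1, 2, 3]; the toy data and variable names are assumptions, not taken from the loader above (assuming scikit-learn 0.22 or newer):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# toy rows in the same layout: column 0 is numeric, columns 1-3 hold categorical codes
X_toy = np.array([[20000.0, 2, 1, 1, 24.0],
                  [120000.0, 1, 2, 2, 26.0]])

encoder = ColumnTransformer(
    [("onehot", OneHotEncoder(), [1, 2, 3])],  # one-hot encode only these columns
    remainder="passthrough")                   # keep the other columns unchanged
X_enc = encoder.fit_transform(X_toy)           # shape (2, 8): 6 one-hot + 2 passthrough columns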
Example no. 3
    def train(self, X, Y, class_number=-1):
        class_count = max(np.unique(Y).size, class_number)
        feature_count = X.shape[1]
        self.__hpelm = ELM(feature_count, class_count, 'wc')
        self.__hpelm.add_neurons(feature_count, "sigm")

        Y_arr = Y.reshape(-1, 1)
        enc = OneHotEncoder()
        enc.fit(Y_arr)
        Y_OHE = enc.transform(Y_arr).toarray()

        out_fd = sys.stdout
        sys.stdout = open(os.devnull, 'w')  # hpelm prints progress to stdout; silence it
        self.__hpelm.train(X, Y_OHE)
        sys.stdout = out_fd
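
The reshape to a single column is what lets OneHotEncoder treat the label vector as one categorical feature. A minimal self-contained sketch of just that step, with toy labels that are an assumption rather than part of the snippet above:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

Y = np.array([0, 2, 1, 2])       # 1-D label vector with 3 classes
Y_arr = Y.reshape(-1, 1)         # (n_samples,) -> (n_samples, 1)
Y_OHE = OneHotEncoder().fit_transform(Y_arr).toarray()
assert Y_OHE.shape == (4, 3)     # one column per class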
Example no. 4
	def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
		'''
		Build the model.
		Args:
			X_train_d: training data, discrete features
			X_train_c: training data, continuous features
			X_test_d: test data, discrete features
			X_test_c: test data, continuous features
			y_train: training labels {-1, 1}
			y_test: test labels {-1, 1}
		Returns:
			gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
			gbc: the GBDT model
			comb_model: the trained combined model
			threshold: decision threshold; Pred_Prob >= threshold is a positive sample, Pred_Prob < threshold is a negative sample
			comb_model_auc: model AUC
			precision: model precision
			recall: model recall
		'''
		if self._random_state is not None:
			gbc = GradientBoostingClassifier(n_estimators=self._n_estimators, learning_rate=self._gbdt_learning_rate, max_depth=self._max_depth, random_state=self._random_state).fit(X_train_c, y_train)
		else:
			gbc = GradientBoostingClassifier(n_estimators=self._n_estimators, learning_rate=self._gbdt_learning_rate, max_depth=self._max_depth).fit(X_train_c, y_train)
		# gbc.apply() returns the index of the leaf each sample falls into for
		# every tree (shape: n_samples x n_estimators x 1 for binary classification).
		X_train_leaves = gbc.apply(X_train_c)[:, :, 0]
		X_test_leaves = gbc.apply(X_test_c)[:, :, 0]
		(X_train_rows, cols) = X_train_leaves.shape
		# One-hot encode the leaf indices, then append the discrete features.
		gbc_enc = OneHotEncoder().fit(np.concatenate([X_train_leaves, X_test_leaves], axis=0))
		X_trans = gbc_enc.transform(np.concatenate([X_train_leaves, X_test_leaves], axis=0))
		X_train_ext = hstack([X_trans[:X_train_rows, :], X_train_d])
		X_test_ext = hstack([X_trans[X_train_rows:, :], X_test_d])
		log.debug("Combine features done.")
		comb_model = LogisticRegression().fit(X_train_ext, y_train)
		log.debug("Training done.")
		comb_model_pred = comb_model.predict_proba(X_test_ext)[:,1]
		precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
		ap = average_precision_score(y_test, comb_model_pred)
		# recall from precision_recall_curve is non-increasing, so the first
		# recall_meet_min entries satisfy the target recall rate; pick the largest
		# threshold that still meets it.
		recall_meet = recall >= self._recall_rate
		recall_meet_min = int(np.sum(recall_meet))
		threshold = thresholds[recall_meet_min-1]
		log.debug("threshold: %f - precision: %f - recall: %f", threshold, precision[recall_meet_min-1], recall[recall_meet_min-1])
		comb_model_auc = roc_auc_score(y_test, comb_model_pred)
		log.debug("AUC score is: %f", comb_model_auc)
		return gbc_enc, gbc, comb_model, threshold, comb_model_auc, precision[recall_meet_min-1], recall[recall_meet_min-1]
Example no. 5
def test_one_hot_encoder_unknown_transform():
    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    y = np.array([[4, 1, 1]])

    # Test that the one-hot encoder raises an error for feature values
    # unseen during fit when handle_unknown='error'.
    oh = OneHotEncoder(handle_unknown='error')
    oh.fit(X)
    assert_raises(ValueError, oh.transform, y)

    # Test the 'ignore' option: unknown feature values are encoded as all zeros.
    oh = OneHotEncoder(handle_unknown='ignore')
    oh.fit(X)
    assert_array_equal(
        oh.transform(y).toarray(), np.array([[0., 0., 0., 0., 1., 0., 0.]]))

    # Raise an error if handle_unknown is neither 'ignore' nor 'error'.
    oh = OneHotEncoder(handle_unknown='42')
    oh.fit(X)
    assert_raises(ValueError, oh.transform, y)
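
For reference, the current categorical API gives the same result for the 'ignore' case: every value unseen during fit maps to an all-zero block for its feature. A minimal sketch (assuming scikit-learn 0.22 or newer; not part of the original test):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
y = np.array([[4, 1, 1]])        # 4 and 1 are unseen in columns 0 and 1

oh = OneHotEncoder(handle_unknown='ignore').fit(X)
np.testing.assert_array_equal(
    oh.transform(y).toarray(),
    [[0., 0., 0., 0., 1., 0., 0.]])  # 2 + 2 + 3 category columns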
Example no. 6
def _run_one_hot(X, X2, cat):
    enc = OneHotEncoder(categorical_features=cat)
    Xtr = enc.fit_transform(X)
    X2tr = enc.transform(X2)
    return Xtr, X2tr