def check_transformer_pickle(name, Transformer):
    """Check that a fitted transformer survives a pickle round-trip.

    The transformer is fitted on a small blob dataset, pickled and
    unpickled; the unpickled copy must transform the training data to
    (almost) the same values as the original instance.

    Parameters
    ----------
    name : str
        Estimator name.  Names listed in ``CROSS_DECOMPOSITION`` are fitted
        on a two-column target instead of the 1-D blob labels.
    Transformer : class
        Estimator class, instantiated without arguments.  If the instance
        has no ``transform`` attribute the check returns without testing.
    """
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # Shift so that the smallest entry is exactly zero.
    X -= X.min()

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        # Build a second, noisy target column and stack it next to y.
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # NOTE(review): the estimator is fitted twice in a row; the first fit
    # looks redundant -- confirm it is not intentional before removing it.
    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)

    # The pickled payload is produced right here, so loading it is safe.
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
def check_classifiers_classes(name, Classifier):
    """Check that a classifier copes with string-valued class labels.

    The classifier is fitted twice on the same blobs: once with a unicode
    string label array and once with the same labels as an object array.
    Every class must appear among the training-set predictions; a mismatch
    of ``classes_`` is only reported on stdout.

    Parameters
    ----------
    name : str
        Estimator name, used for the per-estimator special cases.
    Classifier : class
        Classifier class, instantiated without arguments.
    """
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # Shift the data so that it is non-negative, for things like NMF.
    X -= X.min() - .1

    string_labels = np.array(["one", "two", "three"])[y]
    keeps_integer_labels = name in ["LabelPropagation", "LabelSpreading"]

    for label_variant in (string_labels, string_labels.astype('O')):
        # TODO some complication with -1 label
        y_ = y if keeps_integer_labels else label_variant
        expected_classes = np.unique(y_)

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)

        classifier.fit(X, y_)
        y_pred = classifier.predict(X)

        # training set performance
        assert_array_equal(expected_classes, np.unique(y_pred))
        if np.any(classifier.classes_ != expected_classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, expected_classes, classifier.classes_))
def test_thresholded_scorers():
    """Check threshold-based scorers against direct metric computations."""
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    auc_scorer = SCORERS['roc_auc']

    # Estimator exposing both decision_function and predict_proba.
    logreg = LogisticRegression(random_state=0)
    logreg.fit(X_train, y_train)
    scored_auc = auc_scorer(logreg, X_test, y_test)
    auc_from_decision = roc_auc_score(y_test,
                                      logreg.decision_function(X_test))
    auc_from_proba = roc_auc_score(y_test,
                                   logreg.predict_proba(X_test)[:, 1])
    assert_almost_equal(scored_auc, auc_from_decision)
    assert_almost_equal(scored_auc, auc_from_proba)

    # The log-loss scorer returns the negated loss.
    scored_logloss = SCORERS['log_loss'](logreg, X_test, y_test)
    reference_logloss = log_loss(y_test, logreg.predict_proba(X_test))
    assert_almost_equal(-scored_logloss, reference_logloss)

    # same for an estimator without decision_function
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    scored_auc = auc_scorer(tree, X_test, y_test)
    auc_from_proba = roc_auc_score(y_test, tree.predict_proba(X_test)[:, 1])
    assert_almost_equal(scored_auc, auc_from_proba)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    tree.fit(X_train, y_train)
    assert_raises(ValueError, auc_scorer, tree, X_test, y_test)
def test_fit_uuu():
    """Fit a 'UUU' PGMM on a 2:1 two-blob mixture and check the result.

    Two blobs are generated at ``centers1`` (10000 samples in total) and
    another 5000 samples are added at the first centre, so the expected
    mixture weights are roughly 1/3 and 2/3.  The test checks the fitted
    means and weights and the shape of every fitted attribute.
    """
    n_samples1 = 10000
    n_features = 5
    centers1 = np.array([[10, 5, 1, -5, -10],
                         [-10, -5, -1, 5, 10]])
    cluster_std1 = np.array([[1.0, 2.0, 3.0, 4.0, 5.0],
                             [5.0, 4.0, 3.0, 2.0, 1.0]])
    X1, y1 = make_blobs(n_features=n_features, n_samples=n_samples1,
                        centers=centers1, cluster_std=cluster_std1)

    n_samples2 = 5000
    centers2 = np.array([[10, 5, 1, -5, -10]])
    cluster_std2 = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]])
    X2, y2 = make_blobs(n_features=n_features, n_samples=n_samples2,
                        centers=centers2, cluster_std=cluster_std2)

    X = np.vstack((X1, X2))

    model = mixture.PGMM(covariance_type='UUU', n_components=2, n_pc=3)
    model.fit(X)

    # Component order is arbitrary, so compare the summed means and the
    # sorted weights.
    assert_array_almost_equal(np.sum(model.means_, 0),
                              np.sum(centers1, 0), decimal=0)
    assert_array_almost_equal(np.sort(model.weights_),
                              np.array([0.333, 0.666]), decimal=1)

    # ``.shape`` is a tuple: compare against tuples so the check does not
    # depend on how the assert helper treats tuple-vs-ndarray equality.
    assert_equal(model.means_.shape, (2, n_features))
    assert_equal(model.weights_.shape, (2,))
    assert_equal(model.noise_.shape, (2, n_features))
    assert_equal(model.principal_subspace_.shape, (2, n_features, 3))
    assert_equal(model.covars_.shape, (2, n_features, n_features))
    logging.info('TestFitUUU: OK')
def test_check_is_fitted():
    """Check the errors raised by ``check_is_fitted`` and its success path.

    Covers: non-estimator arguments, unfitted estimators (default and custom
    messages) and fitted estimators, for which ``None`` is returned.
    """
    # Check is ValueError raised when non estimator instance passed
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard = ARDRegression()
    svr = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError.
    # The ``else`` branches make the message checks fail loudly instead of
    # passing vacuously when nothing is raised.
    try:
        check_is_fitted(ard, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as e:
        assert_equal(str(e), "Random message ARDRegression, ARDRegression")
    else:
        raise AssertionError(
            "check_is_fitted did not raise for an unfitted ARDRegression")

    try:
        check_is_fitted(svr, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as e:
        assert_equal(str(e), "Another message SVR, SVR")
    else:
        raise AssertionError(
            "check_is_fitted did not raise for an unfitted SVR")

    ard.fit(*make_blobs())
    svr.fit(*make_blobs())

    # Once fitted, the check returns None.
    assert_equal(None, check_is_fitted(ard, "coef_"))
    assert_equal(None, check_is_fitted(svr, "support_"))
def show_dbscan():
    """
    simulate 1 month of normal hourly room percentage data followed by an
    anomalous percentage

    the normal data is bimodal following most peoples activity patterns in
    which there is routinely a weekday percentage and a weekend percentage.
    1 day in which the person spends a large amount of time in the bathroom
    is simulated

    The points are clustered with ``HourlyRoomPercentageAnomalyDetection``
    and plotted; labels 0 and 1 are drawn as clusters, -1 as outliers and
    any other label is not drawn.
    """
    # simulate normal hourly data: (centre as [bath, bed], spread)
    weekday = ([0.05, 0.95], 0.05)
    weekend = ([0.3, 0.7], 0.1)

    roomperwd, _ = make_blobs(n_samples=23, centers=weekday[0],
                              cluster_std=weekday[1], random_state=0)
    roomperwe, _ = make_blobs(n_samples=8, centers=weekend[0],
                              cluster_std=weekend[1], random_state=0)

    # combine modes
    roompers = np.vstack((roomperwd, roomperwe))

    # make positive and sum to one to simulate valid distribution
    roompers[roompers < 0] = 0
    roompersnorm = normalize(roompers, norm='l1')

    # simulate anomaly on most recent day: most of the time is spent in the
    # bathroom (columns are [bath, bed])
    roompersnorm[-1, :] = np.array([0.8, 0.2])

    # detect outliers
    roompersdetector = HourlyRoomPercentageAnomalyDetection(
        roompersnorm, eps=0.3, min_samples=3)
    labels = roompersdetector.scale_and_proximity_cluster(eps=0.3,
                                                          min_samples=3)

    # plot results; each group gets a legend entry only for its first point
    plot_styles = {0: ('ro', 'Cluster 1'),
                   1: ('kx', 'Cluster 2'),
                   -1: ('b^', 'Outlier')}
    already_in_legend = set()

    plt.figure()
    for i, label in enumerate(labels):
        if label not in plot_styles:
            continue
        marker, legend_text = plot_styles[label]
        if label in already_in_legend:
            plt.plot(roompersnorm[i][0], roompersnorm[i][1], marker)
        else:
            plt.plot(roompersnorm[i][0], roompersnorm[i][1], marker,
                     label=legend_text)
            already_in_legend.add(label)

    plt.legend(loc='lower left')
    plt.axis([0, 1, 0, 1])
    plt.show()
def test_calibration_multiclass():
    """Test calibration for multiclass """

    def softmax(y_pred):
        # Naive conversion of decision values to row-normalised scores.
        e = np.exp(-y_pred)
        return e / e.sum(axis=1, keepdims=True)

    calibration_methods = ['isotonic', 'sigmoid']

    # test multi-class setting with classifier that implements
    # only decision function
    clf = LinearSVC()
    X, y_idx = make_blobs(n_samples=100, n_features=2, random_state=42,
                          centers=3, cluster_std=3.0)

    # Use categorical labels to check that CalibratedClassifierCV supports
    # them correctly
    y = np.array(['a', 'b', 'c'])[y_idx]

    # Even rows train, odd rows test.
    X_train, X_test = X[::2], X[1::2]
    y_train, y_test = y[::2], y[1::2]

    clf.fit(X_train, y_train)
    for method in calibration_methods:
        calibrated = CalibratedClassifierCV(clf, method=method, cv=2)
        calibrated.fit(X_train, y_train)
        probas = calibrated.predict_proba(X_test)
        # Each row of probabilities must sum to one.
        assert_array_almost_equal(np.sum(probas, axis=1),
                                  np.ones(len(X_test)))

        # Check that log-loss of calibrated classifier is smaller than
        # log-loss of naively turned OvR decision function to probabilities
        # via softmax
        raw_scores = clf.decision_function(X_test)
        uncalibrated_log_loss = log_loss(y_test, softmax(raw_scores))
        calibrated_log_loss = log_loss(y_test, probas)
        assert_greater_equal(uncalibrated_log_loss, calibrated_log_loss)

    # Test that calibration of a multiclass classifier decreases log-loss
    # for RandomForestClassifier
    X, y = make_blobs(n_samples=100, n_features=2, random_state=42,
                      cluster_std=3.0)
    X_train, X_test = X[::2], X[1::2]
    y_train, y_test = y[::2], y[1::2]

    clf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf.fit(X_train, y_train)
    loss = log_loss(y_test, clf.predict_proba(X_test))

    for method in calibration_methods:
        calibrated = CalibratedClassifierCV(clf, method=method, cv=3)
        calibrated.fit(X_train, y_train)
        cal_loss = log_loss(y_test, calibrated.predict_proba(X_test))
        assert_greater(loss, cal_loss)
def test_thresholded_scorers():
    """Check threshold-based scorers against direct metric computations.

    Also checks the ``ValueError`` raised by the ``roc_auc`` scorer on
    multiclass targets, and by the ``roc_auc`` and ``neg_log_loss`` scorers
    when the classifier was trained on a single class.
    """
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    # The scorer returns the negated loss.
    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
def check_clustering(name, Alg):
    """Check a clustering estimator on a small, well separated blob dataset.

    The estimator is fitted on an array and on the equivalent nested list;
    ``labels_`` must have one entry per sample and reach an adjusted Rand
    score above 0.4.  Except for ``SpectralClustering``, a subsequent
    ``fit_predict`` must reproduce the same labels.

    Parameters
    ----------
    name : str
        Estimator name, used for the per-estimator special cases.
    Alg : class
        Clustering estimator class, instantiated without arguments.
    """
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples = X.shape[0]

    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)

    # fit another time with ``fit_predict`` and compare results
    # Compare by value: ``is`` on a string literal tests object identity,
    # which only works by accident of interning.
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def test_compute_class_weight_invariance():
    """Check that class_weight="balanced" is invariant to class imbalance.

    Starting from a balanced two-class dataset of 100 points, three
    200-point datasets are built: one with class 1 repeated (150 vs 50),
    one with class 0 repeated (50 vs 150) and one with everything
    duplicated (100 vs 100).  With balanced class weights all three should
    yield the same logistic-regression coefficients.
    """
    X, y = make_blobs(centers=2, random_state=0)
    is_one = (y == 1)
    is_zero = (y == 0)

    # dataset where class 1 is duplicated twice
    X_1 = np.vstack([X, X[is_one], X[is_one]])
    y_1 = np.hstack([y, y[is_one], y[is_one]])

    # dataset where class 0 is duplicated twice
    X_0 = np.vstack([X, X[is_zero], X[is_zero]])
    y_0 = np.hstack([y, y[is_zero], y[is_zero]])

    # duplicate everything
    X_ = np.vstack([X, X])
    y_ = np.hstack([y, y])

    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)

    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
def check_pipeline_consistency(name, Estimator):
    """Check that ``make_pipeline(est)`` gives the same results as ``est``.

    Both the bare estimator and a one-step pipeline wrapping it are fitted
    on the same data; whichever of ``score`` and ``fit_transform`` the
    estimator offers must then return (almost) equal results through both.

    Parameters
    ----------
    name : str
        Estimator name.
    Estimator : class
        Estimator class, instantiated without arguments.

    Raises
    ------
    SkipTest
        For CCA, LocallyLinearEmbedding and KernelPCA when ``_is_32bit()``
        is true.
    """
    unstable_on_32bit = ('CCA', 'LocallyLinearEmbedding', 'KernelPCA')
    if name in unstable_on_32bit and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only
        # depending scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        raise SkipTest(name + ' is non deterministic on 32bit Python')

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)

    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    for method_name in ("score", "fit_transform"):
        direct = getattr(estimator, method_name, None)
        if direct is None:
            # The estimator does not offer this method.
            continue
        through_pipeline = getattr(pipeline, method_name)
        result = direct(X, y)
        result_pipe = through_pipeline(X, y)
        assert_array_almost_equal(result, result_pipe)
def check_estimators_overwrite_params(name, Estimator):
    """Check that ``fit`` leaves the estimator's parameters unchanged.

    The values returned by ``get_params()`` before fitting are compared
    with ``!=`` against those returned after fitting.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure message and for special cases.
    Estimator : class
        Estimator class, instantiated without arguments.
    """
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()

    if name in ('MiniBatchDictLearning', 'MiniBatchSparsePCA'):
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)
    set_random_state(estimator)

    params_before = estimator.get_params()
    estimator.fit(X, y)
    params_after = estimator.get_params()

    for key, old_value in params_before.items():
        changed = np.any(params_after[key] != old_value)
        assert_false(changed,
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, key, old_value, params_after[key]))
def separable_demo():
    """
    Generate a linearly-separable dataset D, train a linear SVM on D,
    then output the resulting decision boundary on a figure.

    ``C`` is chosen by a two-fold stratified grid search on a 75% training
    split; the chosen parameters, the test accuracy and a classification
    report are printed.  The ``print`` calls use a single pre-formatted
    argument so they produce the same output on Python 2 and Python 3.
    """
    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=200, n_features=2,
                      centers=((0, 0), (4, 4)), cluster_std=1.0)
    plot_data(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'], 'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)

    print(" Parameters to be chosen through cross validation:")
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` is 2-only).
    for name, vals in param_grid.items():
        if name != 'kernel':
            print(" {0}: {1}".format(name, vals))

    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print("== Best Parameters: {0}".format(clf.best_params_))

    y_pred = clf.predict(X_test)
    # Fraction of test samples predicted correctly.
    acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred))
    print("== Accuracy: {0}".format(acc))
    print(classification_report(y_test, y_pred))

    plot_svm(clf.best_estimator_, X, y, X_test, y_test,
             title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
def check_class_weight_classifiers(name, Classifier):
    """Check that a heavily skewed ``class_weight`` dominates predictions.

    On very noisy two- and three-blob data, class 0 gets weight 1000 and
    the other classes 0.0001; more than 89% of the test predictions must
    then be class 0.

    Parameters
    ----------
    name : str
        Estimator name; ``NuSVC`` and names ending in ``NB`` are skipped.
    Classifier : class
        Classifier class accepting a ``class_weight`` constructor argument.

    Raises
    ------
    SkipTest
        For the skipped estimators.
    """
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_blobs in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_blobs, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.5, random_state=0)

        # Weight only the classes actually present in the training half.
        n_train_classes = len(np.unique(y_train))
        class_weight = {0: 1000, 1: 0.0001}
        if n_train_classes != 2:
            class_weight[2] = 0.0001

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        share_of_class_zero = np.mean(y_pred == 0)
        assert_greater(share_of_class_zero, 0.89)
def test_transformers_data_not_an_array():
    """Yield ``check_transformer`` cases fed with ``NotAnArray`` inputs.

    Tests whether transformers do something sensible on the training set
    (and all shapes / shape errors) when X and y are wrapped in
    ``NotAnArray`` rather than passed as plain arrays.
    """
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    # these don't actually fit the data
    stateless = ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']
    # and these want multivariate output
    multivariate_output = ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD')

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        wrapped_X = NotAnArray(X.copy())
        wrapped_y = NotAnArray(np.asarray(y))

        if name in dont_test:
            continue
        if name in stateless:
            continue
        if name in multivariate_output:
            continue
        yield check_transformer, name, Transformer, wrapped_X, wrapped_y
def test_class_weight_classifiers():
    """Test that class_weight works and that the semantics are consistent.

    Every classifier exposing a ``class_weight`` parameter is fitted on
    very noisy two- and three-blob data with class 0 weighted 1000 and the
    others 0.0001; more than 90% of the test predictions must be class 0.
    """
    candidates = all_estimators(type_filter="classifier")
    with warnings.catch_warnings(record=True):
        classifiers = []
        for entry in candidates:
            if "class_weight" in entry[1]().get_params():
                classifiers.append(entry)

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, random_state=0)

        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue

            # A fresh dict per classifier, as each gets its own instance.
            class_weight = {0: 1000, 1: 0.0001}
            if n_centers != 2:
                class_weight[2] = 0.0001

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
def test_decision_function_shape_two_class():
    """Check OvR-wrapped SVC / NuSVC with ``decision_function_shape="ovr"``.

    For two and three classes, the wrapped classifier must fit and return
    one prediction per sample.
    """
    svm_classes = [svm.SVC, svm.NuSVC]
    for n_classes in [2, 3]:
        X, y = make_blobs(centers=n_classes, random_state=0)
        for svm_class in svm_classes:
            base = svm_class(decision_function_shape="ovr")
            clf = OneVsRestClassifier(base).fit(X, y)
            predictions = clf.predict(X)
            assert_equal(len(predictions), len(y))
def test_classifiers_classes():
    """Test that classifiers can cope with non-consecutive classes.

    Labels are remapped to 1, 3, 5.  Each classifier must predict every
    class on the training set, exceed 0.78 ``zero_one_score`` and expose
    the sorted labels as ``classes_``.
    """
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    #y = y.astype(np.str)

    for name, Clf in classifiers:
        # TODO also test MultinomialNB and BernoulliNB!
        if Clf in dont_test or Clf in [MultinomialNB, BernoulliNB]:
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()

        clf.fit(X, y)
        y_pred = clf.predict(X)

        # training set performance
        assert_array_equal(classes, np.unique(y_pred))
        accuracy = zero_one_score(y, y_pred)
        assert_greater(accuracy, 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
def check_estimators_overwrite_params(name, Estimator):
    """Check that ``fit`` neither replaces nor mutates constructor params.

    A deep copy of ``get_params()`` is taken before fitting; afterwards the
    ``hash`` of every parameter value must equal that of its copy.

    Parameters
    ----------
    name : str
        Estimator name, used in the failure message.
    Estimator : class
        Estimator class, instantiated without arguments.
    """
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before
    # fitting.
    snapshot = deepcopy(estimator.get_params())

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    params_after_fit = estimator.get_params()
    for param_name in snapshot:
        original_value = snapshot[param_name]
        new_value = params_after_fit[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash
        # function that introspects recursively any subobjects to compute a
        # checksum. The only exception to this rule of immutable constructor
        # parameters is possible RandomState instance but in this check we
        # explicitly fixed the random_state params recursively to be integer
        # seeds.
        # NOTE(review): assumes ``hash`` here is joblib's, imported at file
        # level, not the builtin -- confirm.
        failure_message = ("Estimator %s should not change or mutate "
                           " the parameter %s from %s to %s during fit."
                           % (name, param_name, original_value, new_value))
        assert_equal(hash(new_value), hash(original_value), failure_message)
def test_sag_pobj_matches_logistic_regression():
    """tests if the sag pobj matches log reg"""
    n_samples = 100
    alpha = 1.0
    max_iter = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)

    # Settings shared by the SAG and default-solver models.
    shared_settings = dict(fit_intercept=False, tol=.0000001,
                           C=1. / alpha / n_samples, max_iter=max_iter,
                           random_state=10)
    sag_dense = LogisticRegression(solver='sag', **shared_settings)
    sag_sparse_input = clone(sag_dense)
    default_solver = LogisticRegression(**shared_settings)

    sag_dense.fit(X, y)
    sag_sparse_input.fit(sp.csr_matrix(X), y)
    default_solver.fit(X, y)

    # Objective value reached by each model, always evaluated on dense X.
    pobj1, pobj2, pobj3 = [
        get_pobj(model.coef_, alpha, X, y, log_loss)
        for model in (sag_dense, sag_sparse_input, default_solver)
    ]

    assert_array_almost_equal(pobj1, pobj2, decimal=4)
    assert_array_almost_equal(pobj2, pobj3, decimal=4)
    assert_array_almost_equal(pobj3, pobj1, decimal=4)
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True
    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    # Model under test: once on dense input, once on CSR input.
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    # Expand the per-class weights into per-sample weights.
    le = LabelEncoder()
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[le.fit_transform(y)]

    # One-vs-rest reference solutions from the reference ``sag_sparse``.
    dense_coefs, dense_intercepts = [], []
    sparse_coefs, sparse_intercepts = [], []
    for cl in classes:
        # +1 for the current class, -1 for all the others
        y_encoded = np.where(y == cl, 1., -1.)

        weights, intercept = sag_sparse(X, y_encoded, step_size, alpha,
                                        n_iter=max_iter, dloss=log_dloss,
                                        sample_weight=sample_weight)
        dense_coefs.append(weights)
        dense_intercepts.append(intercept)

        weights, intercept = sag_sparse(X, y_encoded, step_size, alpha,
                                        n_iter=max_iter, dloss=log_dloss,
                                        sample_weight=sample_weight,
                                        sparse=True)
        sparse_coefs.append(weights)
        sparse_intercepts.append(intercept)

    coef1 = np.vstack(dense_coefs)
    intercept1 = np.array(dense_intercepts)
    coef2 = np.vstack(sparse_coefs)
    intercept2 = np.array(sparse_intercepts)

    for i in range(len(classes)):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(), decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(), decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_grid_search_correct_score_results():
    """Test that GridSearchCV uses the requested scorer.

    For the 'f1' and 'roc_auc' scorings, the per-split test scores stored
    in ``cv_results_`` must match scores recomputed by hand with the same
    stratified splits.
    """
    n_splits = 3
    clf = LinearSVC(random_state=0)
    X, y = make_blobs(random_state=0, centers=2)
    Cs = [.1, 1, 10]

    for score in ['f1', 'roc_auc']:
        grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score, cv=n_splits)
        results = grid_search.fit(X, y).cv_results_

        # Test scorer names
        result_keys = list(results.keys())
        split_keys = tuple("split%d_test_score" % cv_i
                           for cv_i in range(n_splits))
        expected_keys = ("mean_test_score", "rank_test_score") + split_keys
        assert_true(all(in1d(expected_keys, result_keys)))

        cv = StratifiedKFold(n_splits=n_splits)
        n_splits = grid_search.n_splits_
        for candidate_i, C in enumerate(Cs):
            clf.set_params(C=C)
            # Stored scores of this candidate, one entry per split.
            stored = [
                grid_search.cv_results_['split%d_test_score' % s][candidate_i]
                for s in range(n_splits)
            ]
            cv_scores = np.array(stored)

            for i, (train, test) in enumerate(cv.split(X, y)):
                clf.fit(X[train], y[train])
                if score == "f1":
                    correct_score = f1_score(y[test], clf.predict(X[test]))
                elif score == "roc_auc":
                    dec = clf.decision_function(X[test])
                    correct_score = roc_auc_score(y[test], dec)
                assert_almost_equal(correct_score, cv_scores[i])
def plot_scaling():
    """Plot a two-blob dataset next to four rescaled versions of it.

    The left half of the figure shows the original data; the 2x2 grid on
    the right shows it after StandardScaler, RobustScaler, MinMaxScaler and
    Normalizer(norm='l2').  All axes are drawn with centred spines.
    """
    X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1)
    X += 3

    plt.figure(figsize=(15, 8))
    main_ax = plt.subplot2grid((2, 4), (0, 0), rowspan=2, colspan=2)
    main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm2, s=60)

    # Largest absolute coordinate per feature sets the axis limits.
    maxx, maxy = np.abs(X).max(axis=0)
    main_ax.set_xlim(-maxx + 1, maxx + 1)
    main_ax.set_ylim(-maxy + 1, maxy + 1)
    main_ax.set_title("Original Data")

    # Small axes are created column by column: (0, 2), (1, 2), (0, 3), (1, 3).
    other_axes = []
    for col in range(2, 4):
        for row in range(2):
            other_axes.append(plt.subplot2grid((2, 4), (row, col)))

    scalers = [StandardScaler(), RobustScaler(),
               MinMaxScaler(), Normalizer(norm='l2')]
    for ax, scaler in zip(other_axes, scalers):
        X_scaled = scaler.fit_transform(X)
        ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=cm2, s=60)
        ax.set_xlim(-2, 2)
        ax.set_ylim(-2, 2)
        ax.set_title(type(scaler).__name__)

    # Same spine styling for every axis, the main one included.
    other_axes.append(main_ax)
    for ax in other_axes:
        spines = ax.spines
        spines['left'].set_position('center')
        spines['right'].set_color('none')
        spines['bottom'].set_position('center')
        spines['top'].set_color('none')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
def test_verbose_boolean():
    """Check that ``verbose=True`` and ``verbose=1`` print the same output.

    For DPGMM and VBGMM, the first line written to stdout while fitting
    must be identical for both flag values.  Each fit is captured in its
    own buffer: with a single shared buffer, rewinding and reading the
    first line returns the first model's output both times, so the
    comparison could never fail.
    """
    # simple 3 cluster dataset
    X, y = make_blobs(random_state=1)

    def first_line_of_fit_output(model):
        # Fit ``model`` with stdout redirected; return the first line.
        captured = StringIO()
        old_stdout = sys.stdout
        sys.stdout = captured
        try:
            model.fit(X)
        finally:
            sys.stdout = old_stdout
        captured.seek(0)
        return captured.readline()

    for Model in [DPGMM, VBGMM]:
        dpgmm_bool = Model(n_components=10, random_state=1, alpha=20,
                           n_iter=50, verbose=True)
        dpgmm_int = Model(n_components=10, random_state=1, alpha=20,
                          n_iter=50, verbose=1)

        # generate output with the boolean flag
        bool_output = first_line_of_fit_output(dpgmm_bool)
        # generate output with the int flag
        int_output = first_line_of_fit_output(dpgmm_int)

        assert_equal(bool_output, int_output)
def test_decision_function_shape():
    """Check SVC ``decision_function`` shapes for 'ovr', 'ovo' and default.

    With 'ovr' the output has one column per class and its argmax matches
    ``predict``; with 'ovo' on five classes it has 10 columns; with the
    default setting a ``ChangedBehaviorWarning`` is expected and the output
    also has 10 columns.
    """
    # check that decision_function_shape='ovr' gives
    # correct shape and is consistent with predict
    ovr_svc = svm.SVC(kernel='linear', C=0.1, decision_function_shape='ovr')
    clf = ovr_svc.fit(iris.data, iris.target)
    dec = clf.decision_function(iris.data)
    assert_equal(dec.shape, (len(iris.data), 3))
    assert_array_equal(clf.predict(iris.data), np.argmax(dec, axis=1))

    # with five classes:
    X, y = make_blobs(n_samples=80, centers=5, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ovr_svc = svm.SVC(kernel='linear', C=0.1, decision_function_shape='ovr')
    clf = ovr_svc.fit(X_train, y_train)
    dec = clf.decision_function(X_test)
    assert_equal(dec.shape, (len(X_test), 5))
    assert_array_equal(clf.predict(X_test), np.argmax(dec, axis=1))

    # check shape with decision_function_shape='ovo'
    ovo_svc = svm.SVC(kernel='linear', C=0.1, decision_function_shape='ovo')
    clf = ovo_svc.fit(X_train, y_train)
    dec = clf.decision_function(X_train)
    assert_equal(dec.shape, (len(X_train), 10))

    # check deprecation warning
    clf = svm.SVC(kernel='linear', C=0.1).fit(X_train, y_train)
    msg = "change the shape of the decision function"
    dec = assert_warns_message(ChangedBehaviorWarning, msg,
                               clf.decision_function, X_train)
    assert_equal(dec.shape, (len(X_train), 10))
def single_calc(n_sample):
    """Time ``Tri`` and ``Autoclust`` construction on a random blob dataset.

    Generates ``n_sample`` 2-D points around two centres, measures how long
    constructing each object takes, prints both durations and appends a
    summary dict to the file ``times`` in the working directory.

    Parameters
    ----------
    n_sample : int
        Number of points passed to ``make_blobs`` as ``n_samples``.
    """
    n_feature = 2
    cluster_std = 0.5
    center = 2
    pts, labels = datasets.make_blobs(n_samples=n_sample,
                                      n_features=n_feature,
                                      cluster_std=cluster_std,
                                      centers=center)

    # Time the Tri construction.
    t_begin = timer()
    tri = Tri(pts)
    tri_time = timer() - t_begin
    print(tri_time)

    # Time the Autoclust construction.
    t_begin = timer()
    auto = Autoclust(pts)
    auto_time = timer() - t_begin
    print(auto_time)

    res_dict = {'tri': tri_time, 'auto': auto_time, 'samples': n_sample}
    # Append, so that successive runs accumulate in the same file.
    with open('times', 'a') as f:
        print(res_dict, file=f)
def test_vbgmm_no_modify_alpha():
    """Check that VBGMM keeps ``alpha`` and derives ``alpha_`` on fit.

    The constructor argument must stay untouched, and the fitted
    ``alpha_`` must equal ``alpha / n_components``.
    """
    alpha = 2.
    n_components = 3
    X, _ = make_blobs(random_state=1)

    vbgmm = VBGMM(n_components=n_components, alpha=alpha, n_iter=1)
    assert_equal(vbgmm.alpha, alpha)

    fitted = vbgmm.fit(X)
    expected_alpha_ = float(alpha) / n_components
    assert_equal(fitted.alpha_, expected_alpha_)
def test_grid_search_iid():
    """Test the ``iid`` parameter of GridSearchCV.

    Two deliberately non-iid folds are built so that one gives a perfect
    score and the other 1/3.  With ``iid=True`` (the default) the mean
    score is weighted by test-fold size; with ``iid=False`` it is the plain
    mean of the fold scores.
    """
    # noise-free simple 2d-data
    X, y = make_blobs(centers=[[0, 0], [1, 0], [0, 1], [1, 1]],
                      random_state=0, cluster_std=0.1, shuffle=False,
                      n_samples=80)

    # split dataset into two folds that are not iid
    # first one contains data of all 4 blobs, second only from two.
    # Builtin ``bool`` is used as dtype: the ``np.bool`` alias is deprecated
    # since NumPy 1.20 and removed in 1.24.
    mask = np.ones(X.shape[0], dtype=bool)
    mask[np.where(y == 1)[0][::2]] = 0
    mask[np.where(y == 2)[0][::2]] = 0
    # this leads to perfect classification on one fold and a score of 1/3 on
    # the other
    svm = SVC(kernel="linear")
    # create "cv" for splits
    cv = [[mask, ~mask], [~mask, mask]]

    # once with iid=True (default)
    grid_search = GridSearchCV(svm, param_grid={"C": [1, 10]}, cv=cv)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters["C"], 1)
    assert_array_almost_equal(first.cv_validation_scores, [1, 1.0 / 3.0])
    # for first split, 1/4 of dataset is in test, for second 3/4.
    # take weighted average
    assert_almost_equal(first.mean_validation_score,
                        1 * 1.0 / 4.0 + 1.0 / 3.0 * 3.0 / 4.0)

    # once with iid=False
    grid_search = GridSearchCV(svm, param_grid={"C": [1, 10]}, cv=cv,
                               iid=False)
    grid_search.fit(X, y)
    first = grid_search.grid_scores_[0]
    assert_equal(first.parameters["C"], 1)
    # scores are the same as above
    assert_array_almost_equal(first.cv_validation_scores, [1, 1.0 / 3.0])
    # averaged score is just mean of scores
    assert_almost_equal(first.mean_validation_score,
                        np.mean(first.cv_validation_scores))
def check_transformer_general(name, Transformer):
    """Run ``_check_transformer`` on array input and on plain-list input.

    Parameters
    ----------
    name : str
        Estimator name, forwarded to ``_check_transformer``.
    Transformer : class
        Transformer class, forwarded to ``_check_transformer``.
    """
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # Shift so that the smallest entry is exactly zero.
    X -= X.min()

    # First with arrays, then with the equivalent nested lists.
    _check_transformer(name, Transformer, X, y)
    X_as_list, y_as_list = X.tolist(), y.tolist()
    _check_transformer(name, Transformer, X_as_list, y_as_list)
def test_search_cv_results_rank_tie_breaking():
    """Check that tied candidates share the 'min' rank in ``cv_results_``.

    Candidates 0 and 1 (C=1 and C=1.001) are expected to tie on mean test
    score while candidate 2 (C=0.001) differs, giving ranks [1, 1, 3] for
    both grid search and randomized search.
    """
    X, y = make_blobs(n_samples=50, random_state=42)

    # The two C values are close enough to give similar models
    # which would result in a tie of their mean cv-scores
    param_grid = {'C': [1, 1.001, 0.001]}

    grid_search = GridSearchCV(SVC(), param_grid=param_grid)
    random_search = RandomizedSearchCV(SVC(), n_iter=3,
                                       param_distributions=param_grid)

    for search in (grid_search, random_search):
        search.fit(X, y)
        results = search.cv_results_
        # Check tie breaking strategy -
        # Check that there is a tie in the mean scores between
        # candidates 1 and 2 alone
        assert_almost_equal(results['mean_test_score'][0],
                            results['mean_test_score'][1])
        # The third candidate must NOT tie with the second.  Without the
        # ``else`` branch this block could never fail.
        try:
            assert_almost_equal(results['mean_test_score'][1],
                                results['mean_test_score'][2])
        except AssertionError:
            pass
        else:
            raise AssertionError(
                "mean test scores of the second and third candidates "
                "unexpectedly tie")
        # 'min' rank should be assigned to the tied candidates
        assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3])
#output1 = np.empty(len(x_input))
#output2 = np.empty(len(x_input))
#for i in range(len(x_input)):
#    a = A[1,1]
#    b = (A[1,0] + A[0,1])*x[i] + alpha[1]
#    c = A[0,0] * x[i]**2 + alpha[0] * x[i] + alpha0
#    output1[i], output2[i] = solve_quadratic_eq(a,b,c)
#
#plt.figure()
#plt.scatter(train["Height"], train["Weight"], c=y)
#plt.plot(x,output1, label = "first sol")
#plt.plot(x,output2, label = "second sol")
#plt.legend()
#########################################3

# Two blobs; class 0 is shifted and linearly distorted, then the whole
# data set is centred on the origin.
X, y = make_blobs(300, 2, centers=2, random_state=100)
X[y == 0] += 2
X[y == 0] = np.dot(np.array([[0.5, 0.1], [1.2, 1.5]]), X[y == 0].T).T
X = X - X.mean(axis=0)

# Per-class samples and their share of the data set.
class1, class2 = X[y == 0], X[y == 1]
prior1 = len(class1) / len(X)
prior2 = len(class2) / len(X)

# Per-class covariance (rows are samples, hence rowvar=False) and mean.
cov1 = np.cov(class1, rowvar=False)
cov2 = np.cov(class2, rowvar=False)
#cov1[1,0] = cov1[0,1] = cov2[0,1] = cov2[1,0] = 0
mean1 = class1.mean(axis=0)
mean2 = class2.mean(axis=0)

# Half the difference of the two inverse covariance matrices.
A = (np.linalg.inv(cov1) - np.linalg.inv(cov2)) / 2
**kwargs) if __name__ == '__main__': n_samples = 1000 random_state = 170 transformation = [[0.6, -0.6], [-0.4, 0.8]] models = [ { 'name': 'Far Blobs', 'X': datasets.make_blobs(n_samples=n_samples, centers=25, random_state=0, center_box=(-10000, 10000), cluster_std=50)[0], }, { 'name': 'Noisy Circles', 'X': datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)[0], }, { 'name': 'Noisy Moons', 'X': datasets.make_moons(n_samples=n_samples, noise=.05)[0], }, {
# # Grouping objects by similarity using k-means

# ## K-means clustering using scikit-learn

# 150 two-dimensional samples around three centres (fixed seed).
X, y = make_blobs(
    n_samples=150,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

# Plot the raw, unlabelled points.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c='white',
    marker='o',
    edgecolor='black',
    s=50,
)
plt.grid()
plt.tight_layout()
#plt.savefig('images/11_01.png', dpi=300)
plt.title("some random data")
plt.show()
plt.show()

# Random data for a classification model.
# X1 holds the sample features and Y1 the class labels: 400 samples with
# 2 features each, 3 classes, no redundant features, one cluster per class.
X1, Y1 = make_classification(
    n_samples=400,
    n_features=2,
    n_redundant=0,
    n_clusters_per_class=1,
    n_classes=3,
)
plt.scatter(X1[:, 0], X1[:, 1], c=Y1, marker='o')
plt.show()

# Isotropic Gaussian blobs for clustering.
# X holds the sample features and y the cluster label: 1000 samples with
# 2 features each, 3 clusters centred at [-1, -1], [1, 1] and [2, 2], with
# per-cluster standard deviations 0.4, 0.5 and 0.2.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=[[-1, -1], [1, 1], [2, 2]],
    cluster_std=[0.4, 0.5, 0.2],
)
plt.scatter(X[:, 0], X[:, 1], c=y, marker='o')
plt.show()

# Gaussian data grouped by quantile.
# Samples from a 2-D normal distribution are split into 3 classes by
# quantile: 1000 samples, feature means 1 and 2, covariance factor 2.
X1, Y1 = make_gaussian_quantiles(
    n_samples=1000,
    n_features=2,
    n_classes=3,
    mean=[1, 2],
    cov=2,
)
plt.scatter(X1[:, 0], X1[:, 1], c=Y1, marker='o')
plt.show()
def mejora_semiboost(n=20, clf=SVC(probability=True), n_features=5,
                     n_samples=1000, ratio_unsampled=0.5,
                     data_simulation='make_classification',
                     similarity_kernel='rbf'):
    """Measure the ROC AUC gain of SemiBoost over its base classifier.

    Runs ``n`` trials.  Each trial simulates a two-class data set, hides a
    random share of the training labels, fits a ``SemiBoostClassifier`` on
    the partially labelled training set and ``clf`` alone on the rows that
    kept their label, and scores both on the held-out 20% test split.

    Parameters
    ----------
    n : int
        Number of simulated trials.
    clf : estimator
        Base classifier.  It is used both as SemiBoost's ``base_model`` and
        as the baseline, and it is fitted in place.  NOTE: the default
        instance is created once at definition time and shared by all calls.
    n_features : int
        Number of features (ignored by ``make_moons``/``make_circles``).
    n_samples : int
        Number of simulated samples per trial.
    ratio_unsampled : float
        Probability with which each training label is hidden.
    data_simulation : str
        One of ``'make_classification'``, ``'make_blobs'``,
        ``'make_gaussian_quantiles'``, ``'make_moons'``, ``'make_circles'``.
    similarity_kernel : str
        Forwarded to ``SemiBoostClassifier.fit``.

    Returns
    -------
    tuple of float
        ``(mean, std)`` of ``AUC(SemiBoost) - AUC(base)`` over the trials.

    Raises
    ------
    ValueError
        If ``data_simulation`` is not one of the supported names.
    """
    ROC_semiboost = []
    ROC_clf = []

    for _ in range(n):
        # --- Simulate a semi-supervised data set -------------------------
        if data_simulation == 'make_classification':
            X, y = make_classification(n_features=n_features,
                                       n_samples=n_samples,
                                       n_redundant=0,
                                       n_clusters_per_class=1)
        elif data_simulation == 'make_blobs':
            X, y = make_blobs(n_features=n_features, centers=2,
                              n_samples=n_samples)
        elif data_simulation == 'make_gaussian_quantiles':
            X, y = make_gaussian_quantiles(n_features=n_features,
                                           n_classes=2,
                                           n_samples=n_samples)
        elif data_simulation == 'make_moons':
            X, y = make_moons(n_samples=n_samples)
        elif data_simulation == 'make_circles':
            X, y = make_circles(n_samples=n_samples)
        else:
            # Previously this only printed and then failed on an undefined
            # X (or silently reused the data of the previous iteration).
            raise ValueError('Unknown data simulation method: %r'
                             % (data_simulation,))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Relabel class 0 as -1 so that 0 can mark "unlabeled".
        labels = np.copy(y_train)
        labels[labels == 0] = -1

        # create some unlabeled data
        random_unlabeled_points = (np.random.rand(len(y_train))
                                   < ratio_unsampled)
        labels[random_unlabeled_points] = 0
        y_train = labels

        # --- SemiBoost, sklearn style ------------------------------------
        model = SemiBoost.SemiBoostClassifier(base_model=clf)
        # similarity_kernel is now forwarded instead of a hard-coded 'rbf'.
        model.fit(X_train, y_train, n_neighbors=3, n_jobs=10, max_models=15,
                  similarity_kernel=similarity_kernel, verbose=False)
        # roc_auc_score expects (y_true, y_score); the arguments used to be
        # passed the other way round.
        ROC_semiboost.append(roc_auc_score(y_test, model.predict(X_test)))

        # --- Base classifier on the labelled rows only -------------------
        model = clf
        XX = X_train[~random_unlabeled_points, ]
        yy = y_train[~random_unlabeled_points]
        model.fit(XX, yy)
        ROC_clf.append(roc_auc_score(y_test, model.predict(X_test)))

    improvement = np.array(ROC_semiboost) - np.array(ROC_clf)
    return (np.mean(improvement), np.std(improvement))
def simulate_normal_clusters(N, ndim, centers=4, center_box=(-8, 8), **kwds):
    """Return ``make_blobs`` output for ``N`` samples with ``ndim`` features.

    ``centers`` and ``center_box`` are passed through with this function's
    own defaults; any further keyword arguments are forwarded unchanged.
    """
    blobs = make_blobs(
        n_samples=N,
        n_features=ndim,
        centers=centers,
        center_box=center_box,
        **kwds
    )
    return blobs
# K Means Outlier Detection On Make_Blobs DataSet # Generate a single blob of 100 points # Identify the five points that are furthest from the centroid from sklearn.datasets import make_blobs X, labels = make_blobs(100, centers=1) # The k means should have a single center for most occassions from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=1) kmeans.fit(X) KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=1, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0) # Visualize the blobs with a scatter plot to see the centroid import matplotlib.pyplot as plt f, ax = plt.subplots(figsize=(8, 5)) ax.set_title("Blob") ax.scatter(X[:, 0], X[:, 1], label='Points') ax.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], label='Centroid',
import matplotlib.pyplot as plt


def plot_data(X, y, figsize=None):
    """Plot the samples labelled 0 (red) and 1 (blue) of a 2-D data set.

    Only labels 0 and 1 are drawn; the axis limits span all rows of ``X``
    plus a margin of 0.1.  ``figsize`` falls back to ``(8, 6)`` when falsy.
    """
    plt.figure(figsize=figsize if figsize else (8, 6))

    for label, fmt in ((0, 'or'), (1, 'ob')):
        plt.plot(X[y == label, 0], X[y == label, 1], fmt,
                 alpha=0.5, label=label)

    first_col, second_col = X[:, 0], X[:, 1]
    plt.xlim((min(first_col) - 0.1, max(first_col) + 0.1))
    plt.ylim((min(second_col) - 0.1, max(second_col) + 0.1))
    plt.legend()


# Four blobs are generated, but plot_data only draws labels 0 and 1.
X, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=4,
    cluster_std=1,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=1,
)
plot_data(X, y)

# Fit k-means with two clusters.
kmeans_model = cluster.KMeans(n_clusters=2, random_state=1)
kmeans_model.fit(X)

# Bare attribute look-ups (they only display a value in an interactive
# session).
kmeans_model.cluster_centers_
kmeans_model.labels_

#metrics when target labels are not known
silhouette_avg = metrics.silhouette_score(
    X, kmeans_model.labels_, metric='euclidean')
print(silhouette_avg)

silhouette_samples = metrics.silhouette_samples(
    X, kmeans_model.labels_, metric='euclidean')
print(silhouette_samples)
ax.set_xlabel('X') ax.set_ylabel('Y') markers = ['o', 'd', '^', 'x', '1', '2', '3', 's'] colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', '#cccfff'] for i in range(nb_samples): ax.scatter(X[i, 0], X[i, 1], marker=markers[Y[i]], color=colors[Y[i]]) plt.show() if __name__ == '__main__': # Create the dataset X, Y = make_blobs(n_samples=nb_samples, n_features=2, centers=8, cluster_std=2.0) # Show the dataset fig, ax = plt.subplots(1, 1, figsize=(10, 8)) ax.grid() ax.set_xlabel('X') ax.set_ylabel('Y') ax.scatter(X[:, 0], X[:, 1], marker='o', color='b') plt.show() # Complete linkage print('Complete linkage') ac = AgglomerativeClustering(n_clusters=8, linkage='complete')
rowSums = np.sum(self.affMat, axis=1) dmax = np.max(rowSums) D = np.diag(rowSums) L = (self.affMat + dmax * np.eye(D.shape[0]) - D) / dmax values, vectors = np.linalg.eig(L) assert np.all(np.isreal(values)) bigEigInd = np.argsort(-values) return vectors[:, bigEigInd[:self.n_clusters]] def apply_constraints(self): self.affMat[self.ML[:, 0], self.ML[:, 1]] = 1 self.affMat[self.CL[:, 0], self.CL[:, 1]] = 0 if __name__ == '__main__': Nclusters, N, Nconstraints = (3, 100, 40) data, labels = ds.make_blobs(n_samples=N, n_features=2, centers=Nclusters) constraintMat = ConstrainedClustering.make_constraints( labels, data=data, method='mmffqs', Nconstraints=Nconstraints, errRate=0) plt.figure() ConstrainedClustering.plot_constraints(data, constraintMat) plt.show()
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from scipy.cluster.hierarchy import dendrogram, ward

# Twelve blob samples are enough for a readable dendrogram.
X, y = make_blobs(random_state=0, n_samples=12)

# Ward linkage on the samples, drawn as a dendrogram.
linkage_array = ward(X)
dendrogram(linkage_array)

# Mark the two cut heights with dashed horizontal lines across the plot.
ax = plt.gca()
bounds = ax.get_xbound()
for cut_height in (7.25, 4):
    ax.plot(bounds, [cut_height, cut_height], '--', c='k')

# Label each cut at the right-hand edge.
for cut_height, caption in ((7.25, ' two clusters'), (4, ' three clusters')):
    ax.text(bounds[1], cut_height, caption, va='center')

ax.set_xlabel('Sample index')
ax.set_ylabel('Cluster distance')
plt.show()
import matplotlib.pyplot as plt from sklearn import cluster, datasets from sklearn.neighbors import kneighbors_graph from sklearn.preprocessing import StandardScaler np.random.seed(0) # Generate datasets. We choose the size big enough to see the scalability # of the algorithms, but not too big to avoid too long running times n_samples = 1500 noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05) noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) no_structure = np.random.rand(n_samples, 2), None colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk']) colors = np.hstack([colors] * 20) clustering_names = [ 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'Ward', 'AgglomerativeClustering', 'DBSCAN', 'Birch' ] plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96,
if __name__ == '__main__': from sklearn.datasets import make_blobs import matplotlib.pyplot as plt import numpy as np import argparse # argparse.ArgumentParser(prog=None, usage=None, description=None, epilog=None, parents=[], formatter_class=argparse.HelpFormatter, prefix_chars='-', fromfile_prefix_chars=None, argument_default=None, conflict_handler='error', add_help=True, allow_abbrev=True, exit_on_error=True) parser = argparse.ArgumentParser(description='') # parser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest]) parser.add_argument('-a', '--arg1') args = parser.parse_args() # ############################################################################# # Generate sample data centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0) # ############################################################################# n_clusters, labels, core_samples = DBscan(X) core_samples_mask = np.zeros_like(labels, dtype=bool) core_samples_mask[core_samples] = True # Plot result unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 1] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask]
''' @Author: Runsen @微信公众号: 润森笔记 @博客: https://blog.csdn.net/weixin_44510615 @Date: 2020/5/2 '''
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score, calinski_harabasz_score

# 400 two-dimensional samples around four centres (fixed seed).
x, y = datasets.make_blobs(400, centers=4, n_features=2, random_state=0)

# Cluster into four groups and label every sample.
model = KMeans(n_clusters=4)
model.fit(x)
y_pred = model.predict(x)

# Adjusted Rand index compares against the true labels; the silhouette and
# Calinski-Harabasz scores are computed from the data and predicted labels.
ari = adjusted_rand_score(y, y_pred)
print(" 调整兰德系数: " + str(ari))
silhouette = silhouette_score(x, y_pred)
print(" 轮廓系数: " + str(silhouette))
ch_score = calinski_harabasz_score(x, y_pred)
print(" CH分数: " + str(ch_score))
neptune.stop()

## Explore Results

# Scikit-learn KMeans clustering

## Step 1: Create KMeans object and example data

parameters = dict(n_init=11, max_iter=270)

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

km = KMeans(**parameters)
X, y = make_blobs(
    n_samples=579,
    n_features=17,
    centers=7,
    random_state=28743,
)

## Step 2: Initialize Neptune

import neptune

neptune.init('shared/sklearn-integration', api_token='ANONYMOUS')

## Step 3: Create an Experiment

neptune.create_experiment(
    params=parameters,
    name='clustering-example',
    tags=['KMeans', 'clustering'],
)

## Step 4: Log KMeans clustering summary
from main import mglearn, train_test_split, plt, np
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, ward

# Twelve blob samples, clustered with Ward linkage and drawn as a dendrogram.
X, y = make_blobs(n_samples=12, random_state=0)
linkage_array = ward(X)
dendrogram(linkage_array)

# Dashed horizontal lines at the two cut heights.
ax = plt.gca()
bounds = ax.get_xbound()
for cut_height in (7.25, 4):
    ax.plot(bounds, [cut_height, cut_height], '--', c='k')

# Captions at the right-hand edge of each cut.
for cut_height, caption in ((7.25, 'two clusters'), (4, 'three clusters')):
    ax.text(bounds[1], cut_height, caption,
            va='center', fontdict={"size": 15})

# agg = AgglomerativeClustering(n_clusters=3)
# assignment = agg.fit_predict(X)
# mglearn.discrete_scatter(X[:, 0], X[:, 1], assignment)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()
# plt.show() RS = 11 name = 'wq_random_project' target = 'quality' train = pd.read_csv(f'wine_train.csv') test = pd.read_csv(f'wine_test.csv') full = pd.concat([train, test]) y = np.array(train.loc[:, target]) X = np.array(train.drop(target, axis=1)) name = 'gb_random_project' X, y = make_blobs(centers=6, n_features=2, n_samples=1000, random_state=11) n_pairs = 100 np.random.seed(RS) sample_idxs = np.random.choice(range(X.shape[0]), size=2 * n_pairs, replace=False) x_vals = np.array(range(1, X.shape[1] + 1)) y_vals = [] for n_components in x_vals: print(X.shape[1], '->', n_components) np.random.seed(RS) best_mean = np.inf for rs in np.random.choice(range(1000), size=5, replace=False): transformer = GaussianRandomProjection(random_state=rs, n_components=n_components)
ap.add_argument(
    "-a", "--alpha",
    type=float,
    default=0.01,
    help="learning rate",
)
ap.add_argument(
    "-b", "--batch-size",
    type=int,
    default=32,
    help="size of SGD mini-batches",
)
args = vars(ap.parse_args())

# Two-class problem: 1000 points, each a 2-D feature vector.  The labels
# are reshaped into a single column.
X, y = make_blobs(n_samples=1000, n_features=2, centers=2,
                  cluster_std=1.5, random_state=1)
y = y.reshape(-1, 1)

# Append a column of ones as the last feature so the bias can be treated
# as a trainable entry of the weight matrix.
X = np.hstack([X, np.ones((X.shape[0], 1))])

# 50/50 train/test partition.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.5, random_state=42)
self.means = np.zeros([self.kk, self.mm]) for n in range(self.nn): self.means[self.assign[n]] += self.data[n] for k in range(self.kk): self.means[k] /= self.count[k] if __name__ == "__main__": features, target = make_blobs(n_samples=1000, n_features=2, centers=3, cluster_std=0.5, shuffle=True, random_state=1) #plt.scatter(features[:, 0], features[:, 1], c = target) #plt.show() centers = np.array([[2.0, 2.0], [-1.0, -5.0], [-5.0, -1.0]]) model = Kmeans(data=features, means=centers) plt.scatter(features[:, 0], features[:, 1], c=target) plt.show() plt.scatter(features[:, 0], features[:, 1], c=model.assign)
return bool(random.getrandbits(1)) # A cluster generator creates a cluster in the range [-1, 1]. It return a tuple (points, cluster_indices) cluster_generators = [] cluster_generators.append( (2, lambda samples: make_circles(n_samples=samples, noise=random.uniform(0, 0.08), factor=random.uniform(0.1, 0.4)))) circles = make_circles(n_samples=15, noise=random.uniform(0, 0.08), factor=random.uniform(0.1, 0.4)) for i in range(10): data = make_blobs(50, centers=5) data = make_moons(n_samples=101, noise=random.uniform(0, 0.08)) data_points = data[0] if rand_bool(): data_points = -data_points data_points = rotate_2d(data_points, random.uniform(0, np.pi * 2)) data_points = rescale_data(data_points, [0, 1], [0, 1]) data = (data_points, data[1]) plot_data(data) # for i in range(10): # plot_data(cluster_generators[0](40))
# Gaussian naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
# Plotting
import matplotlib.pyplot as plt
# Data set generator
from sklearn.datasets import make_blobs
# Train/test splitting helper
from sklearn.model_selection import train_test_split
import numpy as np

# Generate 500 samples spread over 5 classes.
X, y = make_blobs(centers=5, n_samples=500, random_state=8)

# Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# Fit Gaussian naive Bayes and report the score on both sets.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
train_score = gnb.score(X_train, y_train)
test_score = gnb.score(X_test, y_test)
print('\n代码运行结果: ')
print('训练集数据得分:{:.3f}'.format(train_score))
print('测试及数据得分:{:.3f}'.format(test_score))

# Axis limits: the data range padded by 0.5 on each side.
x_min = X[:, 0].min() - 0.5
x_max = X[:, 0].max() + 0.5
y_min = X[:, 1].min() - 0.5
y_max = X[:, 1].max() + 0.5

# Colour the background by predicted class on a grid with step 0.02.
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
grid_points = np.c_[(xx.ravel(), yy.ravel())]
z = gnb.predict(grid_points).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap=plt.cm.Spectral)
def plot_classification(clf=SVC(probability=True), n_features=2,
                        n_samples=1000, ratio_unsampled=0.99,
                        data_simulation='make_classification'):
    """Plot the decision regions of SemiBoost next to its base classifier.

    Simulates a two-class data set, hides a random share of the training
    labels, fits a ``SemiBoostClassifier`` on the partially labelled
    training set and ``clf`` alone on the rows that kept their label, then
    draws both decision regions over the test split side by side.

    Parameters
    ----------
    clf : estimator
        Base classifier; used as SemiBoost's ``base_model`` and as the
        baseline, and fitted in place.  NOTE: the default instance is
        created once at definition time and shared by all calls.
    n_features : int
        Number of features (ignored by ``make_moons``/``make_circles``).
    n_samples : int
        Number of simulated samples.
    ratio_unsampled : float
        Probability with which each training label is hidden.
    data_simulation : str
        One of ``'make_classification'``, ``'make_blobs'``,
        ``'make_gaussian_quantiles'``, ``'make_moons'``, ``'make_circles'``.

    Raises
    ------
    ValueError
        If ``data_simulation`` is not one of the supported names.
    """
    # --- Simulate a semi-supervised data set -----------------------------
    if data_simulation == 'make_classification':
        X, y = make_classification(n_features=n_features,
                                   n_samples=n_samples,
                                   n_redundant=0,
                                   n_clusters_per_class=1)
    elif data_simulation == 'make_blobs':
        X, y = make_blobs(n_features=n_features, centers=2,
                          n_samples=n_samples)
    elif data_simulation == 'make_gaussian_quantiles':
        X, y = make_gaussian_quantiles(n_features=n_features, n_classes=2,
                                       n_samples=n_samples)
    elif data_simulation == 'make_moons':
        X, y = make_moons(n_samples=n_samples)
    elif data_simulation == 'make_circles':
        X, y = make_circles(n_samples=n_samples)
    else:
        # Previously this only printed a message and then crashed on the
        # undefined X a line later.
        raise ValueError('Unknown data simulation method: %r'
                         % (data_simulation,))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Relabel class 0 as -1 so that 0 can mark "unlabeled".
    labels = np.copy(y_train)
    labels[labels == 0] = -1

    # create some unlabeled data
    random_unlabeled_points = np.random.rand(len(y_train)) < ratio_unsampled
    labels[random_unlabeled_points] = 0
    y_train = labels

    # --- SemiBoost, sklearn style ----------------------------------------
    model = SemiBoost.SemiBoostClassifier(base_model=clf)
    model.fit(X_train, y_train, n_jobs=10, max_models=10,
              similarity_kernel='rbf', verbose=False)

    # --- Left panel: SemiBoost -------------------------------------------
    gs = gridspec.GridSpec(1, 2)
    plt.figure(figsize=(10, 8))
    plt.subplot(gs[0, 0])
    plot_decision_regions(X=X_test, y=y_test, clf=model, legend=2)
    plt.title('SemiBoost')

    # --- Base classifier on the labelled rows only -----------------------
    basemodel = clf
    XX = X_train[~random_unlabeled_points, ]
    yy = y_train[~random_unlabeled_points]
    basemodel.fit(XX, yy)

    # --- Right panel: base model -----------------------------------------
    plt.subplot(gs[0, 1])
    plot_decision_regions(X=X_test, y=y_test, clf=basemodel, legend=2)
    plt.title('BaseModel')
    plt.show()
def __init__(self, model, population_size, n_blobs, n_features,
             home_district_in_position, iseed=None):
    """Generate blob feature vectors and a similarity-based roulette table.

    ``population_size`` vectors with ``n_features`` features are drawn from
    ``n_blobs`` blobs (seeded by ``iseed``) and grouped per blob in
    ``self.blob_dict``.  For every vector, ``self.roulette_distribution``
    maps it (as a tuple) to a dict of the other vectors and their weights,
    ordered by decreasing weight.  A weight is the cosine similarity
    shifted by the row minimum and divided by the sum of the shifted row.
    """
    self.model = model
    self.roulette_distribution = {}
    self.feature_vector = {}
    self.vector_to_human = {}
    self.vector_to_home = {}
    self.vector_to_classroom = {}
    self.vector_to_office = {}
    self.vector_to_restaurant = {}
    # Replaces the bound method with the value it returns.
    self.unit_info_map = self.unit_info_map()

    n_vec = population_size
    blobs, assignments = make_blobs(
        n_samples=n_vec,
        n_features=n_features,
        centers=n_blobs,
        cluster_std=0.1,  #1.0
        center_box=(-10.0, 10.0),
        shuffle=False,
        random_state=iseed)
    self.n_blobs = n_blobs
    self.home_district_in_position = home_district_in_position

    # Group the vectors by the blob they were drawn from.
    self.blob_dict = {}
    for vec, assignment in zip(blobs, assignments):
        if assignment not in self.blob_dict:
            self.blob_dict[assignment] = []
        self.blob_dict[assignment].append(vec)

    self.vectors = blobs

    for i in range(n_vec):
        vector1 = self.vectors[i]
        tuple_vec1 = tuple(vector1)
        similarities = KeyedVectors.cosine_similarities(
            vector1, self.vectors)

        # Shift once per row; the minimum used to be recomputed for every
        # pair inside the inner loop, which made the loop cubic.
        shifted = similarities - similarities.min()
        # NOTE(review): the normaliser also includes the vector's own
        # shifted similarity, although that entry is left out below, so the
        # weights need not sum to 1 - confirm this is intended.
        sum_similarities = shifted.sum()

        temp = {}
        for j in range(n_vec):
            if i != j:
                temp[tuple(self.vectors[j])] = shifted[j] / sum_similarities

        # Most similar candidates first.
        self.roulette_distribution[tuple_vec1] = dict(
            sorted(temp.items(), key=lambda item: -item[1]))
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

np.random.seed(123)

# Five overlapping blobs in ten dimensions.
X, y = make_blobs(n_samples=1000, centers=5, n_features=10, cluster_std=3)

# Random forest with out-of-bag scoring.
RFC = RandomForestClassifier(oob_score=True, n_estimators=80)
RFC.fit(X, y)
print("oob_score_", RFC.oob_score_)

# Starting point shared by both chains below.
_x0 = np.random.randn(10)

# Sampler with an explicitly supplied class error probability.
sample_gen = GenerativeSampler(
    model=RFC,
    target_class=0,
    class_err_prob=1 - RFC.oob_score_,
    use_empirical=False,
)
test = sample_gen.run_chain(n=10, x0=_x0)

# Test that class_err_prob self populates correctly
sample_gen = GenerativeSampler(
    model=RFC,
    X=X,
    y=y,
    target_class=0,
    use_empirical=False,
)
#assert sample_gen.class_err_prob == 0
# For RFC this will always be 0 because it's calculated against the
# training data.
print("calculated class_err_prob", sample_gen.class_err_prob)
test = sample_gen.run_chain(n=10, x0=_x0)

# test that x0 self populates correctly
#!/usr/bin/python
# -*- coding: utf-8 -*-
#[email protected]
""" 层次聚类 自低向上,初始中,每个点作为一类。 """
print(__doc__)

from sklearn.datasets import make_moons, make_circles, make_blobs
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import matplotlib.pyplot as plt

# NOTE: `centers` is defined but not passed to make_blobs below.
centers = [[0, 1], [-1, -1], [1, -1]]

# 1500 blob samples, then a fixed linear map applied to them.
X, y = make_blobs(random_state=170, n_samples=1500)
trs = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X = np.dot(X, trs)

""" 层次聚类 =============== 参数: n_clusters:一个整数,指定分类簇的数量 linkage:一个字符串,用于指定链接算法 ‘ward’:单链接single-linkage,采用dmindmin ‘complete’:全链接complete-linkage算法,采用dmaxdmax ‘average’:均连接average-linkage算法,采用davgdavg affinity:一个字符串或者可调用对象,用于计算距离 """

# Agglomerative clustering with Ward linkage (other settings left at their
# defaults).
clt = AgglomerativeClustering(linkage="ward")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs

sns.set()  # for plot styling

# 300 samples around four centres (fixed seed); the labels are not plotted.
features, labels = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=0,
)

first_feature = features[:, 0]
second_feature = features[:, 1]
plt.scatter(first_feature, second_feature, s=50)
plt.show()
# scatter plot of blobs dataset
from sklearn.datasets import make_blobs
from matplotlib import pyplot
from numpy import where

# generate 2d classification dataset: 500 samples in three classes
X, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    cluster_std=2,
    random_state=2,
)

# One scatter call per class label, so each class gets its own colour.
for class_value in range(3):
    # indices of the points carrying this class label
    row_ix = where(y == class_value)
    first_feature = X[row_ix, 0]
    second_feature = X[row_ix, 1]
    pyplot.scatter(first_feature, second_feature)

# show plot
pyplot.show()
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

n_samples = 50000
n_bins = 3  # use 3 bins for calibration_curve as we have 3 clusters here

# Generate 3 blobs with 2 classes where the second blob contains
# half positive samples and half negative samples. Probability in this
# blob is therefore 0.5.
centers = [(-5, -5), (0, 0), (5, 5)]
X, y = make_blobs(n_samples=n_samples, centers=centers,
                  shuffle=False, random_state=42)

# Overwrite the blob labels: first half of the (unshuffled) rows is class 0,
# the second half class 1.
half = n_samples // 2
y[:half] = 0
y[half:] = 1

# One random weight per sample, drawn from a fixed seed.
sample_weight = np.random.RandomState(42).rand(y.shape[0])

# split train, test for calibration
(X_train, X_test,
 y_train, y_test,
 sw_train, sw_test) = train_test_split(X, y, sample_weight,
                                       test_size=0.9, random_state=42)

# Gaussian Naive-Bayes with no calibration
clf = GaussianNB()
clf.fit(X_train, y_train)  # GaussianNB itself does not support sample-weights
prob_pos_clf = clf.predict_proba(X_test)[:, 1]

# Gaussian Naive-Bayes with isotonic calibration
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    """Return ``(a, b)`` widened on each side by 10% of ``b - a``."""
    margin = (b - a) * 0.1
    lower = a - margin
    upper = b + margin
    return lower, upper


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    # Four blobs with different spreads, standardised afterwards.
    data, y = ds.make_blobs(
        N,
        n_features=2,
        centers=centers,
        cluster_std=[0.5, 0.25, 0.7, 0.5],
        random_state=0,
    )
    data = StandardScaler().fit_transform(data)
    # Parameters for dataset 1: (epsilon, min_sample)
    params = (
        (0.2, 5), (0.2, 10), (0.2, 15),
        (0.3, 5), (0.3, 10), (0.3, 15),
    )

    # Dataset 2
    # t = np.arange(0, 2*np.pi, 0.1)
    # data1 = np.vstack((np.cos(t), np.sin(t))).T
    # data2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T
    # data3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T
    # data = np.vstack((data1, data2, data3))
    # # # Parameters for dataset 2: (epsilon, min_sample)
    # params = ((0.5, 3), (0.5, 5), (0.5, 10), (1., 3), (1., 10), (1., 20))
from sklearn.datasets import make_blobs
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

# Two tight blobs, split 80/20 into training and test data.
X, y = make_blobs(
    n_samples=125,
    centers=2,
    cluster_std=0.60,
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

# plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap="winter")
# plt.show()

# Linear SVM; `history` holds whatever `fit` returns.
model = SVC(kernel='linear')
history = model.fit(X_train, y_train)

# ax = plt.gca()
# xlim = ax.get_xlim()
# ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap="winter", marker='s')
# w = model.coef_[0]
# a = -w[0] / w[1]
# xx = np.linspace(xlim[0], xlim[1])
# yy = a * xx - (model.intercept_[0] / w[1])
# plt.plot(xx, yy)
plt.show()