def test_timings(self):
    """
    Test that k-elbow fit timings are drawn on a twinned axes.
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, timings=True, locate_elbow=False
    )
    oz.fit(self.clusters.X)

    # A positive fit time must be recorded for each of the four values of k
    assert len(oz.k_timers_) == 4
    assert all([t > 0 for t in oz.k_timers_])

    # The timing curve is drawn on a second (twinx) axes
    assert hasattr(oz, "axes")
    assert len(oz.axes) == 2

    # Drop the twinned axes and replace the recorded timings/k values with
    # fixed data so the image similarity comparison is deterministic
    oz.axes[1].remove()
    oz.k_timers_ = [
        0.01084589958190918,
        0.011144161224365234,
        0.017028093338012695,
        0.010634183883666992,
    ]
    oz.k_values_ = [2, 3, 4, 5]

    # draw() is normally invoked by fit(); redraw with the fixed data
    oz.draw()
    oz.finalize()

    self.assert_images_similar(oz)
def test_invalid_k(self):
    """
    Assert that invalid values of K raise exceptions
    """
    # The constructor validates k, so instantiation alone must raise.
    # FIX: previously the result was bound to an unused local `model`.
    with self.assertRaises(YellowbrickValueError):
        KElbowVisualizer(KMeans(), k=(1, 2, 3, 4, 5))

    with self.assertRaises(YellowbrickValueError):
        KElbowVisualizer(KMeans(), k="foo")
def test_invalid_k(self):
    """
    Assert that invalid values of K raise exceptions
    """
    # a non-numeric, non-iterable k is rejected at construction time
    with pytest.raises(YellowbrickValueError):
        KElbowVisualizer(KMeans(), k="foo")

    # an iterable k containing a non-integer element is also rejected
    with pytest.raises(YellowbrickValueError):
        KElbowVisualizer(KMeans(), k=(1, 2, 3, "foo", 5))
def test_timings(self):
    """
    Test the twinx double axes with k-elbow timings
    """
    oz = KElbowVisualizer(KMeans(random_state=0), k=5, timings=True)
    oz.fit(X)

    # Each of the four candidate k values must have a positive fit time
    assert len(oz.k_timers_) == 4
    assert all([t > 0 for t in oz.k_timers_])

    # The timings are plotted on a second, twinned axes
    assert hasattr(oz, "axes")
    assert len(oz.axes) == 2

    # Remove the timings axes and overwrite k_timers_/k_values_ with fixed
    # values so that the image similarity test is reproducible
    oz.axes[1].remove()
    oz.k_timers_ = [
        0.01084589958190918,
        0.011144161224365234,
        0.017028093338012695,
        0.010634183883666992,
    ]
    oz.k_values_ = [2, 3, 4, 5]

    # redraw (draw is normally called during fit) then render
    oz.draw()
    oz.poof()

    self.assert_images_similar(oz)
def test_calinski_harabaz_metric(self):
    """
    Test the calinski-harabaz metric of the k-elbow visualizer
    """
    visualizer = KElbowVisualizer(KMeans(), k=5, metric="calinski_harabaz")
    visualizer.fit(X)

    # Expected scores for k=2..5 (values are sklearn-version dependent;
    # loosen `places` if a library upgrade shifts them slightly)
    expected = [
        81.662726256035683,
        50.992378259195554,
        40.952179227847012,
        37.068658049555459,
    ]

    self.assertEqual(len(visualizer.k_scores_), 4)

    # FIX: `expected` was previously computed but never asserted against
    for actual, score in zip(visualizer.k_scores_, expected):
        self.assertAlmostEqual(actual, score, places=6)
def test_silhouette_metric(self):
    """
    Test the silhouette metric of the k-elbow visualizer
    """
    visualizer = KElbowVisualizer(KMeans(), k=5, metric="silhouette")
    visualizer.fit(X)

    # Expected scores for k=2..5 (values are sklearn-version dependent;
    # loosen `places` if a library upgrade shifts them slightly)
    expected = [
        0.69163638040000031,
        0.4534779796676191,
        0.24802958481973392,
        0.21792458448172247,
    ]

    self.assertEqual(len(visualizer.k_scores_), 4)

    # FIX: `expected` was previously computed but never asserted against
    for actual, score in zip(visualizer.k_scores_, expected):
        self.assertAlmostEqual(actual, score, places=6)
def test_distortion_metric(self):
    """
    Test the distortion metric of the k-elbow visualizer
    """
    visualizer = KElbowVisualizer(KMeans(), k=5, metric="distortion")
    visualizer.fit(X)

    # Expected scores for k=2..5 (values are sklearn-version dependent;
    # loosen `places` if a library upgrade shifts them slightly)
    expected = [
        7.6777850157143783,
        8.3643185158057669,
        9.5203330222217666,
        8.9777589843618912,
    ]

    self.assertEqual(len(visualizer.k_scores_), 4)

    # FIX: `expected` was previously computed but never asserted against
    for actual, score in zip(visualizer.k_scores_, expected):
        self.assertAlmostEqual(actual, score, places=6)
def test_timings(self):
    """
    Test the twinx double axes with k-elbow timings
    """
    oz = KElbowVisualizer(KMeans(), k=5, timings=True)
    oz.fit(X)

    # each of the four candidate k values must record a positive fit time
    self.assertEqual(len(oz.k_timers_), 4)
    self.assertTrue(all([t > 0 for t in oz.k_timers_]))

    # the timing curve must be drawn on a second, twinned axes
    self.assertTrue(hasattr(oz, "axes"))
    self.assertEqual(len(oz.axes), 2)
def test_topic_modeling_k_means(self):
    """
    Test topic modeling k-means on the hobbies corpus
    """
    # vectorize the corpus into a TF-IDF document-term matrix
    corpus = self.load_corpus("hobbies")
    docs = TfidfVectorizer().fit_transform(corpus.data)

    oz = KElbowVisualizer(KMeans(), k=(4, 8))
    oz.fit(docs)
    oz.poof()

    self.assert_images_similar(oz)
def test_silhouette_metric(self):
    """
    Test the silhouette metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="silhouette", timings=False
    )
    oz.fit(X)

    # one silhouette score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array([0.691636, 0.456646, 0.255174, 0.239842])
    assert_array_almost_equal(oz.k_scores_, expected)
def test_distortion_metric(self):
    """
    Test the distortion metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="distortion", timings=False
    )
    oz.fit(X)

    # one distortion score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array([7.677785, 8.364319, 8.893634, 8.013021])
    assert_array_almost_equal(oz.k_scores_, expected)
def test_no_knee(self):
    """
    Assert that a warning is issued if there is no knee detected
    """
    X, y = make_blobs(n_samples=1000, centers=3, n_features=12, random_state=12)

    message = (
        "No 'knee' or 'elbow point' detected "
        "This could be due to bad clustering, no "
        "actual clusters being formed etc."
    )

    # fitting over this range must trigger the "no elbow found" warning
    with pytest.warns(YellowbrickWarning, match=message):
        oz = KElbowVisualizer(
            KMeans(random_state=12), k=(4, 12), locate_elbow=True
        )
        oz.fit(X)
def test_valid_k(self):
    """
    Assert that valid values of K generate correct k_values_:

    if k is an int, k_values_ = range(2, k+1)
    if k is a tuple of 2 ints, k_values = range(k[0], k[1])
    if k is an iterable, k_values_ = list(k)
    """
    # integer k: values run from 2 through k inclusive
    oz = KElbowVisualizer(KMeans(), k=8)
    assert oz.k_values_ == list(np.arange(2, 8 + 1))

    # tuple k: values form the half-open range [k[0], k[1])
    oz = KElbowVisualizer(KMeans(), k=(4, 12))
    assert oz.k_values_ == list(np.arange(4, 12))

    # numpy array k: values are taken verbatim
    oz = KElbowVisualizer(KMeans(), k=np.arange(10, 100, 10))
    assert oz.k_values_ == list(np.arange(10, 100, 10))

    # list k: values are taken verbatim
    oz = KElbowVisualizer(KMeans(), k=[10, 20, 30, 40, 50, 60, 70, 80, 90])
    assert oz.k_values_ == list(np.arange(10, 100, 10))
def test_calinski_harabaz_metric(self):
    """
    Test the calinski-harabaz metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="calinski_harabaz", timings=False
    )
    oz.fit(X)

    # one score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array(
        [81.662726256035683, 50.992378259195554, 40.952179227847012, 35.939494]
    )
    assert_array_almost_equal(oz.k_scores_, expected)
def test_locate_elbow(self):
    """
    Test the addition of locate_elbow to an image
    """
    X, y = make_blobs(
        n_samples=1000, n_features=5, centers=3, shuffle=True, random_state=42
    )

    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=6,
        metric="calinski_harabasz",
        timings=False,
        locate_elbow=True,
    )
    oz.fit(X)

    # five candidate values of k were scored and the elbow located at k=3
    assert len(oz.k_scores_) == 5
    assert oz.elbow_value_ == 3

    oz.finalize()
    self.assert_images_similar(oz, windows_tol=2.2)

    expected = np.array(
        [4286.479848, 12463.383743, 8766.999551, 6950.08391, 5865.79722]
    )
    assert_array_almost_equal(oz.k_scores_, expected)
def test_integrated_mini_batch_kmeans_elbow(self):
    """
    Test no exceptions for mini-batch kmeans k-elbow visualizer
    """
    # NOTE #182: cannot use occupancy dataset because of memory usage
    # Generate a blobs data set
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
    )

    try:
        _, ax = plt.subplots()
        oz = KElbowVisualizer(MiniBatchKMeans(random_state=42), k=4, ax=ax)
        oz.fit(X)
        oz.finalize()
        self.assert_images_similar(oz)
    except Exception as e:
        pytest.fail("error during k-elbow: {}".format(e))
def test_sample_weights(self):
    """
    Test that passing in sample weights correctly influences the
    clusterer's fit
    """
    seed = 1234

    # original data has 5 clusters
    X, y = make_blobs(
        n_samples=[5, 30, 30, 30, 30],
        n_features=5,
        random_state=seed,
        shuffle=False,
    )

    oz = KElbowVisualizer(KMeans(random_state=seed), k=(2, 12), timings=False)
    oz.fit(X)
    assert oz.elbow_value_ == 5

    # weights should push elbow down to 4
    weights = np.concatenate([np.ones(5) * 0.0001, np.ones(120)])
    oz.fit(X, sample_weight=weights)
    assert oz.elbow_value_ == 4
def test_calinski_harabasz_metric(self):
    """
    Test the calinski-harabasz metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=5,
        metric="calinski_harabasz",
        timings=False,
        locate_elbow=False,
    )
    oz.fit(self.clusters.X)

    # four scores recorded; no elbow search was performed
    assert len(oz.k_scores_) == 4
    assert oz.elbow_value_ is None

    expected = np.array(
        [
            81.66272625603568,
            50.992378259195554,
            39.573201061900455,
            37.06865804955547,
        ]
    )

    oz.finalize()
    self.assert_images_similar(oz)
    assert_array_almost_equal(oz.k_scores_, expected)
def TrainModel(self):
    """
    Train a KMeans model on the loaded training data and persist it.

    Cleans the data (drops NA rows and duplicates), one-hot encodes it,
    scales features to [0, 1], picks the number of clusters via a
    silhouette-based elbow search (falling back to 3 when no elbow is
    found), fits the final model, and saves it with joblib to a
    user-selected location.
    """
    print(self.data_train.columns)
    self.listWidget_data_train.clear()
    self.columnsRemove.clear()

    save_location = self.GetLocation(
        operation='save', caption="Save as", filter="JobLib Files(*.joblib)"
    )
    if save_location != '':
        print(save_location, 'model train start')

        # train model
        self.data_train.dropna(inplace=True)
        self.data_train.drop_duplicates(inplace=True)
        X = pd.get_dummies(self.data_train)

        kmeans = KMeans(init='k-means++', max_iter=300, n_init=10, random_state=4)
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(X)

        # BUGFIX: the elbow search previously ran on the *unscaled* dummies
        # while the final model was fit on the scaled features; search on
        # the same scaled data the model will actually be trained on
        visualizer = KElbowVisualizer(
            kmeans, k=(4, 12), metric='silhouette', timings=False
        )
        visualizer.fit(scaled_features)

        # fall back to 3 clusters when no elbow was detected
        if not visualizer.elbow_value_:
            clusterValue = 3
        else:
            clusterValue = visualizer.elbow_value_

        kmeans = KMeans(
            max_iter=300, n_init=10, random_state=4, n_clusters=clusterValue
        )
        print(clusterValue)
        kmeans.fit(scaled_features)

        # save model
        dump(kmeans, save_location + '.joblib')
        print('model train done')
def test_integrated_kmeans_elbow(self):
    """
    Test no exceptions for kmeans k-elbow visualizer on blobs dataset
    """
    # NOTE #182: cannot use occupancy dataset because of memory usage
    # Generate a blobs data set
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
    )

    try:
        _, ax = plt.subplots()
        oz = KElbowVisualizer(KMeans(random_state=42), k=4, ax=ax)
        oz.fit(X)
        oz.poof()
        self.assert_images_similar(oz)
    except Exception as e:
        pytest.fail("error during k-elbow: {}".format(e))
def test_topic_modeling_k_means(self):
    """
    Test topic modeling k-means on the hobbies corpus
    """
    # vectorize the corpus into a TF-IDF document-term matrix
    corpus = load_hobbies()
    docs = TfidfVectorizer().fit_transform(corpus.data)

    oz = KElbowVisualizer(KMeans(), k=(4, 8))
    oz.fit(docs)
    oz.finalize()

    self.assert_images_similar(oz)
def test_distortion_metric(self):
    """
    Test the distortion metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="distortion", timings=False
    )
    oz.fit(X)

    # one distortion score per candidate k (2 through 5)
    self.assertEqual(len(oz.k_scores_), 4)

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array([7.677785, 8.364319, 8.893634, 8.013021])
    assert_array_almost_equal(oz.k_scores_, expected)
def test_silhouette_metric(self):
    """
    Test the silhouette metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="silhouette", timings=False
    )
    oz.fit(X)

    # one silhouette score per candidate k (2 through 5)
    self.assertEqual(len(oz.k_scores_), 4)

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array([0.691636, 0.456646, 0.255174, 0.239842])
    assert_array_almost_equal(oz.k_scores_, expected)
def test_calinski_harabaz_metric(self):
    """
    Test the calinski-harabaz metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5, metric="calinski_harabaz", timings=False
    )
    oz.fit(X)

    # one score per candidate k (2 through 5)
    self.assertEqual(len(oz.k_scores_), 4)

    oz.poof()
    self.assert_images_similar(oz)

    expected = np.array(
        [81.662726256035683, 50.992378259195554, 40.952179227847012, 35.939494]
    )
    assert_array_almost_equal(oz.k_scores_, expected)
def test_distortion_metric(self):
    """
    Test the distortion metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=5,
        metric="distortion",
        timings=False,
        locate_elbow=False,
    )
    oz.fit(self.clusters.X)

    # one distortion score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.finalize()
    self.assert_images_similar(oz, tol=0.03)

    expected = np.array([69.100065, 54.081571, 43.146921, 34.978487])
    assert_array_almost_equal(oz.k_scores_, expected)
def test_integrated_mini_batch_kmeans_elbow(self):
    """
    Test no exceptions for mini-batch kmeans k-elbow visualizer

    See #182: cannot use occupancy dataset because of memory usage
    """
    # Generate a blobs data set
    # FIX: seed both the dataset and the estimator so the test is
    # deterministic across runs (matches the sibling kmeans elbow test)
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
    )

    try:
        visualizer = KElbowVisualizer(MiniBatchKMeans(random_state=42), k=4)
        visualizer.fit(X)
        visualizer.poof()
    except Exception as e:
        self.fail("error during k-elbow: {}".format(e))
def test_calinski_harabasz_metric(self):
    """
    Test the calinski-harabasz metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=5,
        metric="calinski_harabasz",
        timings=False,
        locate_elbow=False,
    )
    oz.fit(self.clusters.X)

    # four scores recorded; no elbow search was performed
    assert len(oz.k_scores_) == 4
    assert oz.elbow_value_ is None

    expected = np.array([81.662726, 50.992378, 40.952179, 35.939494])

    oz.finalize()
    self.assert_images_similar(oz)
    assert_array_almost_equal(oz.k_scores_, expected)
def test_silhouette_metric(self):
    """
    Test the silhouette metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=5,
        metric="silhouette",
        timings=False,
        locate_elbow=False,
    )
    oz.fit(self.clusters.X)

    # one silhouette score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.finalize()
    self.assert_images_similar(oz)

    expected = np.array(
        [
            0.6916363804000003,
            0.456645663683503,
            0.26918583373704463,
            0.25523298106687914,
        ]
    )
    assert_array_almost_equal(oz.k_scores_, expected)
def test_integrated_kmeans_elbow(self):
    """
    Test no exceptions for kmeans k-elbow visualizer on blobs dataset
    """
    # NOTE #182: cannot use occupancy dataset because of memory usage
    # Generate a blobs data set
    X, y = make_blobs(
        n_samples=1000, n_features=12, centers=6, shuffle=True, random_state=42
    )

    try:
        fig = plt.figure()
        ax = fig.add_subplot()
        oz = KElbowVisualizer(KMeans(random_state=42), k=4, ax=ax)
        oz.fit(X)
        oz.poof()
        self.assert_images_similar(oz)
    except Exception as e:
        self.fail("error during k-elbow: {}".format(e))
def test_distortion_metric(self):
    """
    Test the distortion metric of the k-elbow visualizer
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0),
        k=5,
        metric="distortion",
        timings=False,
        locate_elbow=False,
    )
    oz.fit(self.clusters.X)

    # one distortion score per candidate k (2 through 5)
    assert len(oz.k_scores_) == 4

    oz.finalize()
    self.assert_images_similar(oz, tol=0.03)

    expected = np.array(
        [
            69.10006514142941,
            54.081571290449936,
            44.491830981793605,
            33.99887993254433,
        ]
    )
    assert_array_almost_equal(oz.k_scores_, expected)
def test_set_colors_manually(self):
    """
    Test that the metric, timing, and vline colors can be set manually
    on the k-elbow visualizer.
    """
    oz = KElbowVisualizer(
        KMeans(random_state=0), k=5,
    )
    oz.metric_color = "r"
    oz.timing_color = "y"
    oz.vline_color = "c"

    # Create artificial "fit" data for testing purposes
    oz.k_values_ = [1, 2, 3, 4, 5, 6, 7, 8]
    oz.k_timers_ = [6.2, 8.3, 10.1, 15.8, 21.2, 27.9, 38.2, 44.9]
    oz.k_scores_ = [0.8, 0.7, 0.55, 0.48, 0.40, 0.38, 0.35, 0.30]
    oz.elbow_value_ = 5
    oz.elbow_score_ = 0.40

    # Execute drawing
    oz.draw()
    oz.finalize()

    self.assert_images_similar(oz, tol=3.2)
def test_bad_metric(self):
    """
    Assert KElbow raises an exception when a bad metric is supplied
    """
    # an unknown scoring metric must be rejected at construction time
    with pytest.raises(YellowbrickValueError):
        KElbowVisualizer(KMeans(), k=5, metric="foo")