def test_defaults(self):
    """Run check_cv on a OneHotVectorizer + KMeansPlusPlus pipeline.

    The data is streamed from ``infert_file`` using a schema read from
    the same file with ``numeric_dtype=np.float32``.
    """
    infert_schema = DataSchema.read_schema(
        infert_file, numeric_dtype=np.float32)
    infert_stream = FileDataStream.read_csv(
        infert_file, schema=infert_schema)

    # 'education' is one-hot encoded into 'edu', which is then one of the
    # five columns handed to the clusterer as features.
    encoder = OneHotVectorizer(columns={'edu': 'education'})
    clusterer = KMeansPlusPlus(
        n_clusters=5,
        feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])

    check_cv([encoder, clusterer], infert_stream)
def test_get_fit_info_clustering(self):
    """Check the outputs reported by get_fit_info for a fitted clusterer.

    A 3-cluster KMeansPlusPlus pipeline is fitted on nine points whose
    x, y and z coordinates are identical; the last entry of the first
    element returned by ``get_fit_info`` must list the predicted label
    plus one score column per cluster, and ``predict`` must return nine
    rows.
    """
    # The same values are used for all three coordinate columns.
    coords = [0, 1, 2, 10, 11, 12, -10, -11, -12]
    train_features = pandas.DataFrame(
        data={'x': coords, 'y': coords, 'z': coords})
    train_clusters = pandas.DataFrame(
        data={'clusterid': [0, 0, 0, 1, 1, 1, 2, 2, 2]})

    pipeline = Pipeline([KMeansPlusPlus(n_clusters=3)])
    pipeline.fit(train_features, train_clusters, verbose=0)

    predictions = pipeline.predict(train_features)
    fit_info = pipeline.get_fit_info(train_features, train_clusters)

    final_entry = fit_info[0][-1]
    assert final_entry['outputs'] == [
        'PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
    assert len(predictions) == 9
def test_score_clusterer(self):
    """Check Pipeline.score for a 2-cluster KMeansPlusPlus on iris.

    The label is binarised (1 stays 1, everything else becomes 0), the
    data is split with ``train_test_split`` after seeding NumPy's RNG,
    and the score on the held-out part is compared to a pinned value.
    """
    expected_nmi = 0.36840763005544264

    # Seed before anything else so the split below is reproducible.
    np.random.seed(0)

    df = get_dataset("iris").as_df()
    df.drop(['Species'], axis=1, inplace=True)
    df.Label = [1 if value == 1 else 0 for value in df.Label]

    features = df.loc[:, df.columns != 'Label']
    labels = df['Label']
    X_train, X_test, y_train, y_test = train_test_split(features, labels)

    clusterer = KMeansPlusPlus(
        n_clusters=2, init_algorithm="Random", train_threads=1)
    pipeline = Pipeline([clusterer])
    pipeline.fit(X_train, y_train.to_frame())

    metrics = pipeline.score(X_test, y_test)
    print(metrics)
    assert_almost_equal(
        metrics,
        expected_nmi,
        decimal=5,
        err_msg="NMI loss should be %s" % expected_nmi)
def test_non_label_based_predictor_does_not_have_label_column_automatically_removed(
        self):
    """Check that 'Label' stays among the features of a clusterer.

    A dry-run fit of a KMeansPlusPlus pipeline returns a JSON graph; its
    first node must be the feature combiner fed with every input column
    (including 'Label') and its second node must be the KMeans trainer
    reading the combined 'Features' column.
    """
    train_df = pd.DataFrame({
        'c1': [2, 3, 4, 5],
        'c2': [3, 4, 5, 6],
        'c3': [4, 5, 6, 7],
        'Label': [0, 1, 2, 1]
    })

    pipeline = Pipeline([KMeansPlusPlus(n_clusters=5)])
    graph = json.loads(pipeline.fit(train_df, dry_run=True))
    combiner_node, trainer_node = graph['nodes'][0], graph['nodes'][1]

    # The order-insensitive list assertion has a different name on
    # Python 2 than on Python 3.
    assert_same_items = (
        self.assertItemsEqual if six.PY2 else self.assertCountEqual)

    self.assertEqual(combiner_node["Name"], "Transforms.FeatureCombiner")
    assert_same_items(combiner_node["Inputs"]["Features"],
                      ['c1', 'c2', 'c3', 'Label'])

    self.assertEqual(trainer_node["Name"],
                     "Trainers.KMeansPlusPlusClusterer")
    self.assertEqual(trainer_node["Inputs"]["FeatureColumnName"],
                     "Features")
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Build the training pipeline: one-hot encode 'education' into 'edu',
# then cluster on three columns.
encoder = OneHotVectorizer(columns={'edu': 'education'})
clusterer = KMeansPlusPlus(n_clusters=5,
                           feature=['induced', 'edu', 'parity'])
pipeline = Pipeline([encoder, clusterer])

# Train, predict, and evaluate on the same stream.
# TODO: Replace with CV
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data, 'induced', output_scores=True)

# Show the first predictions.
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2   Score.3   Score.4
# 0               4  2.732253  2.667988  2.353899  2.339244  0.092014
# 1               4  2.269290  2.120064  2.102576  2.222578  0.300347
# 2               4  3.482253  3.253153  2.425328  2.269245  0.258680
# 3               4  3.130401  2.867317  2.158132  2.055911  0.175347
'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'EnsembleClassifier': EnsembleClassifier(num_models=3), 'EnsembleRegressor': EnsembleRegressor(num_models=3), 'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False), 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=2), 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SgdBinaryClassifier':
FromKey(columns=['Sepal_Length']) ]), # GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'}) ]), 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}), 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']), 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']), 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}), 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']), 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'], label='rank', group_id='group'), 'Loader': Loader(columns={'ImgPath': 'Path'}), 'LpScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, LpScaler(columns={'normed_columns': 'concated_columns'}) ]), 'MutualInformationSelector': Pipeline([ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), MutualInformationSelector(
FastLinearBinaryClassifier(), FastLinearClassifier(), FastLinearRegressor(), LogisticRegressionBinaryClassifier(), LogisticRegressionClassifier(), OnlineGradientDescentRegressor(), SgdBinaryClassifier(), # SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [ NaiveBayesClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView KMeansPlusPlus(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView FactorizationMachineBinaryClassifier(), PcaAnomalyDetector(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # PcaTransformer(), # REVIEW: crashes GamBinaryClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView GamRegressor( ), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView LightGbmClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # LightGbmRanker(), # REVIEW: crashes # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView OneVsRestClassifier(FastLinearBinaryClassifier()),
###############################################################################
# KMeansPlusPlus
import pandas
from nimbusml import Pipeline
from nimbusml.cluster import KMeansPlusPlus

# Training points form 3 clusters with centroids (1,1,1), (11,11,11)
# and (-11,-11,-11); x, y and z share the same values.
train_values = [0, 1, 2, 10, 11, 12, -10, -11, -12]
X_train = pandas.DataFrame(
    data={'x': train_values, 'y': train_values, 'z': train_values})

# Each test point should clearly belong to just 1 of the 3 clusters.
test_values = [-1, 3, 9, 13, -13, -20]
X_test = pandas.DataFrame(
    data={'x': test_values, 'y': test_values, 'z': test_values})
y_test = pandas.DataFrame(data={'clusterid': [2, 2, 1, 1, 0, 0]})

# Fit on the training points only, then evaluate on the test points.
clustering = Pipeline([KMeansPlusPlus(n_clusters=3)])
pipe = clustering.fit(X_train)
metrics, predictions = pipe.test(X_test, y_test, output_scores=True)

# Show the first predictions.
print(predictions.head())

# Show the evaluation metrics.
print(metrics)