def test_naivebayesclassifier(self):
    np.random.seed(0)
    train_file = get_dataset("wiki_detox_train").as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t')
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    texttransform = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = texttransform.fit_transform(X_train)
    X_test = texttransform.transform(X_test)

    mymodel = NaiveBayesClassifier()
    mymodel.fit(X_train, y_train)

    scores = mymodel.predict(X_test)

    # evaluate the model: fraction of predictions matching the true labels
    accuracy = np.mean(y_test.values.ravel() == [i for i in scores])
    assert_greater(
        accuracy, 0.5,
        "accuracy should be greater than %s" % 0.5)
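# Note: `<<` is nimbusml's shorthand for restricting a transform to given
# input columns; the explicit keyword form (used in the estimator
# dictionary below) would look like, for example:
#   NGramFeaturizer(word_feature_extractor=Ngram(),
#                   vector_normalizer='None',
#                   columns=['SentimentText'])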
def test_pass_decision_function_multiclass_with_pipeline(self):
    assert_almost_equal(
        decfun_sum(Pipeline([NaiveBayesClassifier()])),
        -96.87325,
        decimal=4,
        err_msg=invalid_decision_function_output)
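# `decfun_sum` and `invalid_decision_function_output` are shared test
# helpers not shown here; presumably the helper fits the pipeline on the
# module-level split and sums the raw decision_function outputs, so the
# hard-coded constant pins the scores against regressions. A minimal
# sketch under that assumption:
#
#   def decfun_sum_sketch(pipeline):
#       pipeline.fit(X_train, y_train)
#       # one raw-score column per class; sum twice to get a scalar
#       return pipeline.decision_function(X_test).sum().sum()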
def test_fail_predict_proba_multiclass_with_pipeline(self):
    check_unsupported_predict_proba(
        self,
        Pipeline([NaiveBayesClassifier()]),
        X_train, y_train, X_test)
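# `check_unsupported_predict_proba` is likewise a shared helper; the
# expectation is that multiclass NaiveBayes exposes raw per-class scores
# but no calibrated probabilities, so predict_proba should raise. A hedged
# sketch of such a check (the real helper's body may differ):
#
#   def check_unsupported_predict_proba_sketch(testcase, ppl, X, y, X_test):
#       ppl.fit(X, y)
#       testcase.assertRaises(Exception, ppl.predict_proba, X_test)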
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use the 'wiki_detox_train' data set to create test and train data
# Sentiment  SentimentText
# 1          ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1          == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)

train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

scores = ppl.predict(X_test)['PredictedLabel']

# evaluate the model
print('Accuracy:', np.mean(y_test.values.ravel() == [i for i in scores]))
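# As a sketch of an alternative, Pipeline.test can compute standard
# evaluation metrics directly instead of deriving accuracy by hand
# (same API as the FileDataStream example further below):
metrics, _ = ppl.test(X_test, y_test, output_scores=True)
print(metrics)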
'LpScaler': Pipeline([
    ColumnConcatenator() << {
        'concated_columns': [
            'Petal_Length',
            'Sepal_Width',
            'Sepal_Length']},
    LpScaler(columns={'normed_columns': 'concated_columns'})
]),
'MutualInformationSelector': Pipeline([
    ColumnConcatenator(columns={'Features': ['Sepal_Width',
                                             'Sepal_Length',
                                             'Petal_Width']}),
    # the selector accepts only one input column, hence the concatenation
    MutualInformationSelector(
        columns='Features',
        label='Label',
        slots_in_output=2)
]),
'NaiveBayesClassifier': NaiveBayesClassifier(
    feature=['Sepal_Width', 'Sepal_Length']),
'NGramFeaturizer': NGramFeaturizer(
    word_feature_extractor=Ngram(),
    char_feature_extractor=Ngram(),
    keep_diacritics=True,
    columns={'features': ['SentimentText']}),
'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)':
    OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                        use_probabilities=True,
                        feature=['age',
                                 'education_str.0-5yrs',
                                 'education_str.6-11yrs',
                                 'education_str.12+ yrs'],
                        label='induced'),
'OneVsRestClassifier(LinearSvmBinaryClassifier)':
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

# fix in nimbusml: each of these needs to implement ICanGetSummaryAsIDataView
learners_not_supported = [
    NaiveBayesClassifier(),
    KMeansPlusPlus(),
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    # PcaTransformer(),  # REVIEW: crashes
    GamBinaryClassifier(),
    GamRegressor(),
    LightGbmClassifier(),
    # LightGbmRanker(),  # REVIEW: crashes
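    # ICanGetSummaryAsIDataView is the ML.NET interface that surfaces a
    # trained model's parameters as a data view; until the wrapped learners
    # implement it, asking for a summary is expected to fail. A hedged
    # sketch of the corresponding check (data and names illustrative):
    #
    #   ppl = Pipeline([NaiveBayesClassifier()]).fit(X, y)
    #   ppl.summary()  # expected to raise for learners in this list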
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.naive_bayes import NaiveBayesClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ...  row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...        1            2  ...
# 1   42     1    0-5yrs        1       1  ...        2            0  ...
# 2   39     1    0-5yrs        2       6  ...        3            0  ...
# 3   34     1    0-5yrs        2       4  ...        4            0  ...
# 4   35     1   6-11yrs        1       3  ...        5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    NaiveBayesClassifier(feature=['age', 'edu'], label='induced')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#    PredictedLabel   Score.0   Score.1   Score.2
# 0               2 -5.297264 -5.873055 -4.847996
# 1               2 -5.297264 -5.873055 -4.847996
# 2               2 -5.297264 -5.873055 -4.847996
# 3               2 -5.297264 -5.873055 -4.847996
# 4               0 -1.785266 -3.172440 -3.691075
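# The Score.* columns hold the raw per-class scores and PredictedLabel is
# their argmax: in row 0, Score.2 (-4.847996) is the largest of the three,
# giving label 2. A quick cross-check (valid here because the 'induced'
# classes are 0, 1, 2, matching the score-column order):
score_cols = [c for c in predictions.columns if c.startswith('Score.')]
assert (predictions[score_cols].values.argmax(axis=1) ==
        predictions['PredictedLabel']).all()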