def test_syntax9_slots_label(self):
    """NGramFeaturizer + MutualInformationSelector compose inside a
    Pipeline without explicit {input, output} role arguments."""
    reviews = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"]
    likes = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True]
    train_reviews = pandas.DataFrame(data=dict(review=reviews, like=likes))
    X = train_reviews.loc[:, train_reviews.columns != 'like']
    y = train_reviews[['like']]

    # Selector with default slot count.
    exp = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        MutualInformationSelector(),
    ])
    res = exp.fit_transform(X, y)
    assert res is not None

    # Scikit compatibility (Compose transforms inside Scikit Pipeline).
    # In this scenario, we do not provide {input, output} arguments
    pipe = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        MutualInformationSelector(slots_in_output=2),
    ])
    res = pipe.fit_transform(X, y)
    assert res is not None
def test_lightgbmclassifier(self):
    """LightGbmClassifier over n-gram text features should exceed
    0.58 accuracy on the wiki-detox sentiment split."""
    np.random.seed(0)
    train_file = get_dataset('wiki_detox_train').as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t', encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = featurizer.fit_transform(X_train, max_slots=5000)
    X_test = featurizer.transform(X_test, max_slots=5000)

    classifier = LightGbmClassifier().fit(X_train, y_train, verbose=0)
    scores = classifier.predict(X_test)
    accuracy = np.mean(y_test.values.ravel() == scores.values)
    assert_greater(accuracy, 0.58,
                   "accuracy should be greater than %s" % 0.58)
def test_pipeline_with_no_columns(self):
    """A pipeline with no explicit column roles fits successfully,
    both with a pandas label and a numpy label (Bug 147697)."""
    trainData = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"]})

    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1),
    ])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(trainData[["SentimentText"]],
                            trainData["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3
    ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

    # Same pipeline again, this time with the label as a numpy array.
    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1),
    ])
    assert ppl is not None
    ppl.fit(trainData[["SentimentText"]],
            np.array(trainData["Sentiment"]))
def test_naivebayesclassifier(self):
    """NaiveBayesClassifier over n-gram features must beat 0.5 accuracy."""
    np.random.seed(0)
    train_file = get_dataset("wiki_detox_train").as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t')
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = featurizer.fit_transform(X_train)
    X_test = featurizer.transform(X_test)

    model = NaiveBayesClassifier()
    model.fit(X_train, y_train)
    scores = model.predict(X_test)
    accuracy = np.mean(y_test == list(scores))[0]
    assert_greater(accuracy, 0.5,
                   "accuracy should be greater than %s" % 0.5)
def train_data_type_single(fit_X_type="dataframe", fit_Y_type=None,
                           predict_X_type=None):
    """Fit an NGramFeaturizer on the sample sentences converted to the
    requested container type, then transform the same sentences converted
    to ``predict_X_type`` and return the result."""
    sentences = [
        "This is sentence 1",
        "Talk about second",
        "Thrid one",
        "Final example."]
    featurizer = NGramFeaturizer()
    featurizer.fit(transform_data(sentences, fit_X_type))
    return featurizer.transform(transform_data(sentences, predict_X_type))
def test_ngramfeaturizer_single(self):
    """A single output column built from two text inputs of infert
    yields the expected feature-matrix shape."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns={'features': ['id', 'education']})
    out = featurizer.fit_transform(data)
    assert out.shape == (248, 652)
def test_syntax9_multiple_inputs(self):
    """The << operator maps several input columns to one output column."""
    frame = pandas.DataFrame(
        dict(education1=['A', 'B', 'A', 'B', 'A'],
             education2=['c', 'd', 'c', 'd', 'c'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        'out1': ['education1', 'education2']}
    transformed = featurizer.fit_transform(X)
    assert transformed.shape == (5, 13)
def test_pipeline_with_no_columns_raise(self):
    """LightGBM with default min-data settings must fail on this tiny
    dataset: get_fit_info works, but fit raises RuntimeError."""
    trainData = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"]})

    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier(),
    ])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(trainData[["SentimentText"]],
                            trainData["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3

    with self.assertRaises(RuntimeError):
        # Message
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
def test_NGramFeaturizer_glove(self):
    """Grid search over the classifier's iteration count on a pipeline
    combining n-gram features with GloVe word embeddings."""
    # grid search over number_of_trees and then confirm the best
    # number_of_trees by full train
    np.random.seed(0)
    data = pd.DataFrame({
        'review': [
            'I like this movie', 'I don\'t like this', 'It is nice',
            'I like this movie', 'I don\'t like this', 'It is nice',
            'So boring'],
        'sentiment': ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg']})

    pipeline = Pipeline([
        ('ng', NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='review_TransformedText',
            columns='review')),
        WordEmbedding(columns='review_TransformedText',
                      model_kind='GloVe50D'),
        ('lr', FastLinearBinaryClassifier(
            feature=['review', 'review_TransformedText'],
            number_of_threads=1,
            shuffle=False)),
    ])
    param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20])
    grid = GridSearchCV(pipeline, param_grid)
    grid.fit(data['review'], 1 * (data['sentiment'] == 'pos'))
    assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
def test_word_embedding(self):
    """The dot export of an NGramFeaturizer -> WordEmbedding pipeline
    lists both the raw and the tokenized text columns."""
    descriptions = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"]
    likes = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True]
    ds_train = pandas.DataFrame(
        data=dict(description=descriptions, like=likes))

    ng = NGramFeaturizer(columns=['description'], output_tokens=True)
    we = WordEmbedding(columns='description_TransformedText',
                       model_kind='Sswe')
    model = Pipeline([ng, we])
    dot_vis = dot_export_pipeline(model, ds_train)
    assert 'ch1[label="<f0> description|<f1> ' \
           'description_TransformedText"' in dot_vis
def test_ngramfeaturizer(self):
    """The n-gram feature matrix over the first 100 wiki-detox reviews
    has a fixed, known total (regression check on featurization).

    Fix: the original bound the aggregate to a local named ``sum``,
    shadowing the builtin ``sum`` for the rest of the function.
    """
    np.random.seed(0)
    train_file = get_dataset('wiki_detox_train').as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t', encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    texttransform = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = texttransform.fit_transform(X_train[:100])
    # Renamed from `sum` to avoid shadowing the builtin.
    feature_total = X_train.iloc[:].sum().sum()
    assert_equal(feature_total, 30513, "sum of all features is incorrect!")
def test_ngramfeaturizer_syntax_dict(self):
    """NGramFeaturizer with dict output syntax keeps the original
    'review' column, which must be dropped before training."""
    reviews = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"]
    likes = [
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True, False, True, False,
        True, False, True, False, True]
    train_reviews = pandas.DataFrame(data=dict(review=reviews, like=likes))
    test_reviews = pandas.DataFrame(data=dict(review=[
        "This is great", "I hate it", "Love it", "Really like it",
        "I hate it", "I like it a lot", "I love it", "I do like it",
        "I really hate it", "I love it"]))

    y = train_reviews['like']
    X = train_reviews.loc[:, train_reviews.columns != 'like']
    textt = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        'outg': ['review']}
    X = textt.fit_transform(X)
    assert X.shape == (25, 117)
    # columns ordering changed between 0.22 and 0.23
    assert 'review' in (X.columns[0], X.columns[-1])
    X = X.drop('review', axis=1)

    model = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
    X_test = textt.transform(test_reviews).drop('review', axis=1)
    scores = model.predict(X_test)
    # View the scores
    assert scores.shape == (10, )
def test_ngramfeaturizer(self):
    """End-to-end: featurize reviews, train a binary classifier, and
    predict on a held-out set.

    Fix: the original transformed ``test_reviews`` twice (once into
    ``X_test``, then again inside ``predict``); the transformed frame is
    now computed once and reused.
    """
    train_reviews = pandas.DataFrame(data=dict(
        review=[
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it",
            "I hate it", "I love it"],
        like=[
            True, False, True, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True]))
    test_reviews = pandas.DataFrame(data=dict(review=[
        "This is great", "I hate it", "Love it", "Really like it",
        "I hate it", "I like it a lot", "I love it", "I do like it",
        "I really hate it", "I love it"]))

    y = train_reviews['like']
    X = train_reviews.loc[:, train_reviews.columns != 'like']
    textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
    X = textt.fit_transform(X)
    assert X.shape == (25, 116)

    mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
    # Transform the test set once; the original re-featurized it a
    # second time inside predict().
    X_test = textt.transform(test_reviews)
    scores = mymodel.predict(X_test)
    # View the scores
    assert scores.shape == (10, )
    assert X_test.shape[0] == 10
def test_column_list_or_string(self):
    """Bug 142794: a single input column may be given as a plain string,
    a one-element list, the columns= argument, or the << operator —
    the string and << forms must produce identical output."""
    data = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"]})
    data['SentimentText'] = data['SentimentText'].astype(str)

    # Both dict forms must construct without raising.
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        "score": 'SentimentText'}
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        "score": ['SentimentText']}

    featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns=['SentimentText'])
    res1 = featurizer.fit_transform(data)
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram()) << 'SentimentText'
    res2 = featurizer.fit_transform(data)
    assert_frame_equal(res1, res2)
def _test_sklearn_pipeline(self):
    """NGramFeaturizer chains with sklearn's TruncatedSVD and
    LogisticRegression inside a plain sklearn pipeline."""
    train_reviews = pandas.DataFrame(data=dict(
        review=["This is great", "I hate it", "Love it",
                "Do not like it"],
        like=[True, False, True, False]))
    y = train_reviews['like']
    int_y = [int(x) for x in y]
    X = train_reviews.loc[:, train_reviews.columns != 'like']

    pipe1 = ppl([
        ("featurizer", NGramFeaturizer(word_feature_extractor=n_gram())),
        ("svd", TruncatedSVD(random_state=1, n_components=5)),
        ("lr", sklearn.linear_model.LogisticRegression()),
    ])
    pipe1.fit(X, int_y)
    pred = pipe1.predict(X)
    assert pred.shape == (4, )
def test_ngramfeaturizer_multi(self):
    """Two output columns are rejected: either at construction time
    (TypeError) or, failing that, at fit time (RuntimeError)."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)

    try:
        xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                             columns={'features': ['id'],
                                      'features2': ['education']})
    except TypeError as e:
        assert 'Only one output column is allowed' in str(e)
        return

    try:
        # System.InvalidCastException: 'Cannot cast
        # Newtonsoft.Json.Linq.JArray to Newtonsoft.Json.Linq.JToken.
        xf.fit_transform(data)
        assert False
    except RuntimeError:
        pass
def test_automl_usecase(self):
    """Featurize, train, export to ONNX, then verify the ORT backend
    reproduces the standard ML.Net predictions."""
    # train featurization pipeline
    featurization_pipe = Pipeline([
        NGramFeaturizer(keep_diacritics=True,
                        columns={'Features': ['SentimentText']})])
    featurization_pipe.fit(train_set)

    # train learner pipeline on top of the fitted featurization model
    learner_pipe = Pipeline([
        DatasetTransformer(featurization_pipe.model),
        OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                            feature=['Features'],
                            label='Sentiment'),
    ])
    learner_pipe.fit(train_set)

    # Export the learner pipeline to ONNX
    onnx_path = get_tmp_file('.onnx')
    learner_pipe.export_to_onnx(onnx_path, 'com.microsoft.ml',
                                onnx_version='Stable')

    # Perform the transform using the standard ML.Net backend
    start = time.time()
    result_standard = learner_pipe.predict(test_set)
    end = time.time()
    print('%ss done transform using standard backend'
          % round(end - start, 3))

    # Perform the transform using the ORT backend
    df_tool = DFT(onnx_path)
    dataset = test_set.to_df()
    start = time.time()
    result_ort = df_tool.execute(dataset,
                                 ['PredictedLabel.output', 'Score.output'])
    end = time.time()
    print('%ss done transform using ORT backend (excludes df load time)'
          % round(end - start, 3))

    # compare the results; kwargs are loop-invariant, so build them once
    check_kwargs = {
        'check_names': False,
        'check_exact': False,
        'check_dtype': True,
        'check_less_precise': True
    }
    for expected_name, ort_name in (
            ('PredictedLabel', 'PredictedLabel.output'),
            ('Score.0', 'Score.output.0'),
            ('Score.1', 'Score.output.1')):
        pd.testing.assert_series_equal(
            result_standard.loc[:, expected_name],
            result_ort.loc[:, ort_name],
            **check_kwargs)
def test_word_embedding_example2(self):
    """WordEmbedding consumes the tokens column emitted when
    output_tokens=True on the upstream NGramFeaturizer."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 header=+'
    data = FileDataStream(path, schema=file_schema)

    pipeline = Pipeline([
        NGramFeaturizer(word_feature_extractor=Ngram(),
                        output_tokens=True,
                        columns={'features': ['id', 'education']}),
        WordEmbedding(columns='features_TransformedText'),
    ])
    features = pipeline.fit_transform(data)
    assert features.shape == (248, 802)
    assert 'features_TransformedText.94' in list(features.columns)
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None,
                        predict_X_type=None):
    """Fit a featurizer + LightGBM pipeline on sample data and labels
    converted to the requested container types.

    Returns (predictions, scores, metrics) so callers can compare
    behavior across input types.
    """
    sentences = [
        "This is sentence 1",
        "Talk about second",
        "Thrid one",
        "Final example."]
    labels = [1, 0, 1, 1]
    model = Pipeline([
        NGramFeaturizer(),
        LightGbmClassifier(min_data_per_leaf=1, n_thread=1),
    ])
    fit_X = transform_data(sentences, fit_X_type)
    fit_Y = transform_data(labels, fit_Y_type)
    model.fit(fit_X, fit_Y)
    metrics, scores = model.test(fit_X, fit_Y, output_scores=True)
    predict_X = transform_data(sentences, predict_X_type)
    return model.predict(predict_X), scores, metrics
def test_pipeline_name_error(self):
    """Unknown constructor parameters must raise NameError listing the
    rejected names."""
    trainData = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ", "review ", "sentence ", "an apple",
            "sentence 22", "another one one one"]})
    NGramFeaturizer(word_feature_extractor=n_gram()).fit_transform(
        trainData[["SentimentText"]])

    msg = "Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', " \
          "'minsplit'] are not allowed"
    with self.assertRaises(NameError, msg=msg):
        LightGbmClassifier(min_data=1, min_data_in_bin=1,
                           min_data_per_leaf=1, minsplit=1, NumLeaves=2)
def test_word_embedding_example_dict_newname(self):
    """WordEmbedding can rename its output column via a dict mapping."""
    path = get_dataset('infert').as_filepath()
    file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                  'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                  'col=spontaneous:R4:6 quote+ header=+'
    data = FileDataStream(path, schema=file_schema)

    pipeline = Pipeline([
        NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']}),
        # What is features_TransformedText?
        WordEmbedding(
            columns={
                'features_TransformedText2': 'features_TransformedText'}),
    ])
    features = pipeline.fit_transform(data)
    assert features.shape == (248, 409)
def test_LightLda(self):
    """LightLda reduces n-gram features to 3 topics; the topic weights
    across all 8 documents sum to a known total."""
    topics = pandas.DataFrame(data=dict(review=[
        "animals birds cats dogs fish horse",
        "horse birds house fish duck cats",
        "car truck driver bus pickup",
        "car truck driver bus pickup horse ",
        "car truck",
        "bus pickup",
        "space galaxy universe radiation",
        "radiation galaxy universe duck"]))
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'review'
    pipeline = Pipeline([featurizer, LightLda(num_topic=3)])
    y = pipeline.fit_transform(topics)
    assert_almost_equal(y.sum().sum(), 7.000000044, decimal=8,
                        err_msg="Sum should be %s" % 7.000000044)
def test_transform_only_pipeline_transform_method(self):
    """Pipeline.transform works on a transforms-only pipeline and emits
    the expected n-gram column."""
    pipe = Pipeline([
        NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
    pipe.fit(X)
    transformed = pipe.transform(X)
    assert 'SentimentText.==rude==' in transformed.columns
# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage: tokenize the text, then embed the emitted tokens
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens=True,
                    columns={'ngram': ['SentimentText']}),
    WordEmbedding(columns='ngram_TransformedText'),
])

# fit and transform
features = pipeline.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  ngram.douchiest  ngram.award.
# 0          1  ...              0.0           0.0
# 1          1  ...              0.0           0.0
# 2          1  ...              0.0           0.0
# 3          1  ...              0.0           0.0
# 4          1  ...              0.0           0.0
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment SentimentText
# 1   ==RUDE== Dude, you are rude upload that carl picture back,
#     or else.
# 1   == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)

train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)
scores = ppl.predict(X_test)['PredictedLabel']

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"]))

# featurize, then embed the emitted token column
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens_column_name='review_TransformedText'),
    WordEmbedding() << 'review_TransformedText',
])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), 'IidChangePointDetector': IidChangePointDetector(columns=['F0']), 'SsaSpikeDetector': SsaSpikeDetector(columns=['F0'], seasonal_window_size=2), 'SsaChangePointDetector': SsaChangePointDetector(columns=['F0'], seasonal_window_size=2), 'SsaForecaster': SsaForecaster(columns=['F0'], window_size=2,
# Get schema from a fitted pipeline example.
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# data input (as a FileDataStream)
path = get_dataset("wiki_detox_train").as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# fit a single-transform pipeline, then ask it for its output schema
pipe = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    columns={'features': ['SentimentText']})
])
pipe.fit(data)
schema = pipe.get_output_columns()
print(schema[0:5])
# ['Sentiment', 'SentimentText', 'features.Char.<␂>|=|=',
#  'features.Char.=|=|r', 'features.Char.=|r|u']
# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# featurize with a custom stop-word list removing punctuation tokens
xf = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    stop_words_remover=CustomStopWordsRemover(
        ['!', '$', '%', '&', '\'', '\'d']),
    columns={'features': ['SentimentText']})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  features.douchiest  features.award.
# 0          1  ...                 0.0              0.0
# 1          1  ...                 0.0              0.0
# 2          1  ...                 0.0              0.0
# 3          1  ...                 0.0              0.0
'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, LpScaler(columns={'normed_columns': 'concated_columns'}) ]), 'MutualInformationSelector': Pipeline([ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), MutualInformationSelector( columns='Features', label='Label', slots_in_output=2) # only accept one column ]), 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(), char_feature_extractor=Ngram(), keep_diacritics=True, columns={ 'features': ['SentimentText']}), 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']), 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']), 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \ OneVsRestClassifier(AveragedPerceptronBinaryClassifier(), use_probabilities=True, feature=['age', 'education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs'], label='induced'), 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \ OneVsRestClassifier(LinearSvmBinaryClassifier(), use_probabilities=True, feature=['age',