def test_NGramFeaturizer_glove(self):
    """Grid-search the classifier's iteration count on n-gram + GloVe features.

    Builds a three-step pipeline (n-gram featurizer, 'GloVe50D' word
    embedding, fast linear binary classifier), searches
    ``lr__maximum_number_of_iterations`` over ``[1, 100, 20]`` and asserts
    that the best value found is 100.
    """
    np.random.seed(0)

    # Three sentences repeated twice, followed by one extra negative review.
    repeated_reviews = ['I like this movie', "I don't like this", 'It is nice']
    repeated_labels = ['pos', 'neg', 'pos']
    reviews = pd.DataFrame({
        'review': repeated_reviews * 2 + ['So boring'],
        'sentiment': repeated_labels * 2 + ['neg'],
    })

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens_column_name='review_TransformedText',
        columns='review')
    embedding = WordEmbedding(
        columns='review_TransformedText',
        model_kind='GloVe50D')
    classifier = FastLinearBinaryClassifier(
        feature=['review', 'review_TransformedText'],
        number_of_threads=1,
        shuffle=False)

    # Only the first and last steps are named; the 'lr' name is what the
    # grid-search parameter prefix below refers to.
    text_pipeline = Pipeline([
        ('ng', featurizer),
        embedding,
        ('lr', classifier),
    ])

    search_space = {'lr__maximum_number_of_iterations': [1, 100, 20]}
    grid = GridSearchCV(text_pipeline, search_space)

    # Multiplying the boolean mask by 1 yields 0/1 labels.
    is_positive = 1 * (reviews['sentiment'] == 'pos')
    grid.fit(reviews['review'], is_positive)

    assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
def test_word_embedding_example2(self):
    """Check the shape and a column name of embedded 'infert' text columns.

    Streams the 'infert' dataset with an explicit schema, featurizes the
    'id' and 'education' text columns into 'features' (with token output
    enabled) and embeds 'features_TransformedText'. The result must have
    shape (248, 802) and contain the column 'features_TransformedText.94'.
    """
    infert_path = get_dataset('infert').as_filepath()
    file_schema = (
        'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
        'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
        'col=spontaneous:R4:6 header=+'
    )
    stream = FileDataStream(infert_path, schema=file_schema)

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens=True,
        columns={'features': ['id', 'education']})
    embedding = WordEmbedding(columns='features_TransformedText')

    features = Pipeline([featurizer, embedding]).fit_transform(stream)

    assert features.shape == (248, 802)
    assert 'features_TransformedText.94' in list(features.columns)
def test_word_embedding_example_dict_newname(self):
    """Check the output shape when the embedding writes to a renamed column.

    Streams the 'infert' dataset (schema with ``quote+``), featurizes the
    'id' and 'education' text columns into 'features', then embeds the
    token column under the new name 'features_TransformedText2'. The
    result must have shape (248, 409).
    """
    infert_path = get_dataset('infert').as_filepath()
    file_schema = (
        'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
        'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
        'col=spontaneous:R4:6 quote+ header=+'
    )
    stream = FileDataStream(infert_path, schema=file_schema)

    featurizer = NGramFeaturizer(
        word_feature_extractor=Ngram(),
        output_tokens_column_name='features_TransformedText',
        columns={'features': ['id', 'education']})

    # 'features_TransformedText' is the token column name requested from the
    # featurizer above; presumably the dict maps new name -> source column,
    # as with the featurizer's own `columns` argument -- confirm.
    embedding = WordEmbedding(
        columns={'features_TransformedText2': 'features_TransformedText'})

    features = Pipeline([featurizer, embedding]).fit_transform(stream)

    assert features.shape == (248, 409)
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# NOTE(review): `np` and `Pipeline` are used below but are not imported in
# the visible part of this script -- presumably imported earlier; confirm.

# Example: train a naive Bayes classifier on n-gram counts of the
# 'wiki_detox_train' data set and print its accuracy on a held-out split.
# Sample rows:
# Sentiment SentimentText
# 1 ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!

# seed before splitting so the train/test partition is repeatable
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space (no vector normalization)
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

predictions = ppl.predict(X_test)
scores = predictions['PredictedLabel']

# evaluate the model: fraction of predicted labels equal to the true labels
accuracy = np.mean(y_test == list(scores))
print('Accuracy:', accuracy)
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# Example: tokenize a handful of customer reviews and embed the tokens.

# create the data
review_texts = [
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!",
]
customer_reviews = pandas.DataFrame(data={'review': review_texts})

# the featurizer emits its tokens as 'review_TransformedText', which is the
# column the embedding step is then applied to
tokenizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='review_TransformedText')
embedder = WordEmbedding() << 'review_TransformedText'

pipeline = Pipeline([tokenizer, embedder])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings (first 5 rows, last 3 columns)
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, LpScaler(columns={'normed_columns': 'concated_columns'}) ]), 'MutualInformationSelector': Pipeline([ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), MutualInformationSelector( columns='Features', label='Label', slots_in_output=2) # only accept one column ]), 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(), char_feature_extractor=Ngram(), keep_diacritics=True, columns={ 'features': ['SentimentText']}), 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']), 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']), 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \ OneVsRestClassifier(AveragedPerceptronBinaryClassifier(), use_probabilities=True, feature=['age', 'education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs'], label='induced'), 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \ OneVsRestClassifier(LinearSvmBinaryClassifier(),
from nimbusml.feature_extraction.text.extractor import Ngram

# NOTE(review): `get_dataset`, `FileDataStream`, `Pipeline`,
# `NGramFeaturizer` and `WordEmbedding` are used below but not imported in
# the visible part of this script -- presumably imported earlier; confirm.

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool== You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage: featurize 'SentimentText' into 'ngram', emit the tokens
# as 'ngram_TransformedText', then embed that token column
featurizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='ngram_TransformedText',
    columns={'ngram': ['SentimentText']})
embedding = WordEmbedding(columns='ngram_TransformedText')
pipeline = Pipeline([featurizer, embedding])

# fit and transform
features = pipeline.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  ngram.douchiest  ngram.award.
# 0          1  ...              0.0           0.0
# 1          1  ...              0.0           0.0
# 2          1  ...              0.0           0.0
# 3          1  ...              0.0           0.0
data=dict( review=[ "This is great", "I hate it", "Love it", "Really like it", "I hate it", "I like it a lot", "I love it", "I do like it", "I really hate it", "I love it"])) y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review' X = ngram.fit_transform(X) # view the transformed numerical values and column names # print(X.head()) mymodel = LogisticRegressionBinaryClassifier().fit(X, y) X_test = ngram.transform(test_reviews) scores = mymodel.predict(ngram.transform(test_reviews)) # view the scores # print(scores.head())
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# NOTE(review): `np` and `Pipeline` are used below but are not imported in
# the visible part of this script -- presumably imported earlier; confirm.

# Example: train a naive Bayes classifier on n-gram counts of the
# 'wiki_detox_train' data set and print the metrics returned by
# Pipeline.test on a held-out split.
# Sample rows:
# Sentiment SentimentText
# 1 ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!

# seed before splitting so the train/test partition is repeatable
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space (no vector normalization)
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

# evaluate the model; the per-row scores are requested alongside the metrics
evaluation = ppl.test(X_test, y_test, output_scores=True)
metrics, scores = evaluation
print(metrics)
###############################################################################
# WordEmbedding: pre-trained transform to generate word embeddings
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.feature_extraction.text.ngramfeaturizer import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
review_texts = [
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!",
]
customer_reviews = pandas.DataFrame(data={'review': review_texts})

# token output is switched on in the featurizer; the embedding is applied to
# the 'review_TransformedText' column
tokenizer = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens=True)
embedder = WordEmbedding() << 'review_TransformedText'

pipeline = Pipeline([tokenizer, embedder])
y = pipeline.fit_transform(customer_reviews)

# view the review embeddings
print(y)
###############################################################################
# LightLda: cluster topics
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# create the data
topic_texts = [
    "animals birds cats dogs fish horse",
    "horse birds house fish duck cats",
    "car truck driver bus pickup",
    "car truck driver bus pickup horse ",
    "car truck",
    "bus pickup",
    "space galaxy universe radiation",
    "radiation galaxy universe duck",
]
topics = pandas.DataFrame(data={'review': topic_texts})

# there are three main topics in our data. set num_topic=3
# and see if LightLDA vectors for topics look similar
ngram_counts = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None',
) << 'review'
lda = LightLda(num_topic=3)

pipeline = Pipeline([ngram_counts, lda])
y = pipeline.fit_transform(topics)

# view the LDA topic vectors
print(y)
data=dict(review=[ "This is great", "I hate it", "Love it", "Do not like it", "Really like it", "I hate it", "I like it a lot", "I kind of hate it", "I do like it", "I really hate it", "It is very good", "I hate it a bunch", "I love it a bunch", "I hate it", "I like it very much", "I hate it very much.", "I really do love it", "I really do hate it", "Love it!", "Hate it!", "I love it", "I hate it", "I love it", "I hate it", "I love it" ], like=[ True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True ])) X = train_reviews.loc[:, train_reviews.columns != 'like'] y = train_reviews['like'] # pipeline of transforms transform_1 = NGramFeaturizer(word_feature_extractor=Ngram()) transform_2 = MutualInformationSelector(slots_in_output=2) pipeline = Pipeline([transform_1, transform_2]) print(pipeline.fit_transform(X, y)) # Scikit compatibility (Compose transforms inside Scikit Pipeline). # In this scenario, we do not provide {input, output} arguments transform_1 = NGramFeaturizer(word_feature_extractor=Ngram()) transform_2 = MutualInformationSelector(slots_in_output=2) pipe = Pipeline([('text', transform_1), ('featureselect', transform_2)]) print(pipe.fit_transform(X, y))