def test_check_estimator_fromkey(self):
    text_df = pandas.DataFrame(
        data=dict(
            text=[
                "cat", "dog", "fish", "orange",
                "cat orange", "dog", "fish", "spider"],
            num=[1, 2, 3, 4, 5, 6, 7, 8]))
    tokey = ToKey() << ['text']
    data_idv = tokey.fit_transform(text_df)
    assert data_idv is not None
    assert len(data_idv) > 0
    assert str(sorted([str(dt) for dt in data_idv.dtypes])) == \
        "['category', 'int64']"
    fromkey = FromKey() << ['text']
    data = fromkey.fit_transform(data_idv)
    assert str(list(data_idv['text'])) == str(list(data['text']))
    t = numpy.unique(data_idv['text'].cat.codes)
    assert len(t) == 6
    assert list(data_idv['text'].cat.categories) == [
        "cat", "dog", "fish", "orange", "cat orange", "spider"]
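# The round trip exercised above, shown in isolation. A hedged sketch, not
# part of the original suite; it relies only on the module-level imports the
# tests above already assume (pandas, nimbusml's ToKey/FromKey), and the
# column values are illustrative.
def test_fromkey_round_trip_sketch(self):
    df = pandas.DataFrame(data=dict(text=['cat', 'dog', 'cat']))
    # ToKey maps each distinct string to a key; pandas surfaces it as
    # a 'category' column.
    keyed = (ToKey() << ['text']).fit_transform(df)
    assert str(keyed['text'].dtype) == 'category'
    # FromKey inverts the mapping, restoring the original strings.
    restored = (FromKey() << ['text']).fit_transform(keyed)
    assert list(restored['text']) == ['cat', 'dog', 'cat']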
def test_example_key_to_text_typeerror_u4(self):
    text_df = pandas.DataFrame(data=dict(text=[1, 2]), dtype=numpy.uint32)
    fromkey = FromKey() << 'text'
    # System.ArgumentOutOfRangeException: 'Source column 'text' has invalid
    # type ('U8'): Expected Key type of known cardinality.
    assert_raise_message(
        RuntimeError,
        "",
        lambda: fromkey.fit_transform(text_df))
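# The error above occurs because FromKey requires Key-typed input. A hedged
# sketch of the supported path (not from the original suite): key the numeric
# column with ToKey first, which FromKey can then invert.
def test_key_to_text_u4_supported_path_sketch(self):
    text_df = pandas.DataFrame(data=dict(text=[1, 2]), dtype=numpy.uint32)
    # ToKey accepts numeric input and produces a Key-typed column.
    keyed = (ToKey() << 'text').fit_transform(text_df)
    # FromKey now succeeds, mapping the keys back to the original values.
    restored = (FromKey() << 'text').fit_transform(keyed)
    assert len(restored) == 2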
###############################################################################
# FromKey
import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Create the data
categorical_df = pandas.DataFrame(data=dict(
    key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
    text=['b', 'c', 'a', 'b', 'a', 'c']))

fromkey = FromKey(columns='key')
y = fromkey.fit_transform(categorical_df)
print(y)

tokey = ToKey(columns='text')
y = tokey.fit_transform(categorical_df)
y2 = fromkey.clone().fit_transform(y)
print(y2['text'] == categorical_df['text'])
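# For intuition (a hedged aside, not part of the original example): the
# decoding FromKey performs on the 'key' column mirrors reading a pandas
# Categorical by value, so this matches the first print(y) above.
key_codes = Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c'])
print(list(key_codes))  # ['a', 'b', 'c', 'b', 'c', 'a']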
###############################################################################
# FromKey
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey, ToKey

# data input (as a FileDataStream)
path = get_dataset('topics').as_filepath()

# load data
data = FileDataStream.read_csv(path, sep=',')

# transform usage
pipeline = Pipeline([
    ToKey(columns=['review_reverse']),
    FromKey(columns=['review_reverse'])
])

# fit and transform
output = pipeline.fit_transform(data)
print(output.head())
#    label                              review                      review_reverse
# 0      1  animals birds cats dogs fish horse      radiation galaxy universe duck
# 1      0    horse birds house fish duck cats     space galaxy universe radiation
# 2      1         car truck driver bus pickup                          bus pickup
# 3      0   car truck driver bus pickup horse                           car truck
# 4      1                           car truck   car truck driver bus pickup horse
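# Fit and transform can also be split (a hedged sketch, assuming a
# transforms-only Pipeline exposes transform(), as elsewhere in nimbusml):
pipeline.fit(data)
output2 = pipeline.transform(data)
print(output2.head())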
        'Petal_Length', 'Petal_Width', 'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        ColumnConcatenator() << {
            'concated_columns': [
                'Petal_Length', 'Sepal_Width', 'Sepal_Length']},
        GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
    ]),
    'Handler': Handler(
        replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
    'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
    'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
    'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
    'KMeansPlusPlus': KMeansPlusPlus(
        n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
data = FileDataStream(path, schema=file_schema)
print(data.head())
#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool== You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer converts the text into a vector of chars with Key type.
# Use FromKey to map the keys back to values first, then feed the result
# into WordEmbedding.
pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(
        model_kind='GloVe50D',
        columns={'Feature': 'SentimentText_FromKey'}),
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature'])
])
print(pipe.fit_transform(data).head())
#    Sentiment  ...  Feature.149
# 0        1.0  ...      2.67440
# 1        1.0  ...      0.78858
# 2        1.0  ...      2.67440
# 3        1.0  ...      2.67440
# 4        1.0  ...      2.67440
# [5 rows x 152 columns]
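# To inspect what FromKey hands to WordEmbedding, the first two stages can
# be run on their own. A hedged sketch (not part of the original example)
# reusing `data` and the imports from above; it shows the char keys decoded
# back to characters before embedding.
inspect = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'})
])
print(inspect.fit_transform(data).head())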