Example 1
    def test_with_or_without_pipeline(self):
        # Bug 227810
        # data input (as a FileDataStream)
        path = get_dataset('infert').as_filepath()

        file_schema = 'sep=, col=education:TX:1 col=Features:R4:2-4,6-8 ' \
                      'col=case:R4:5 header=+'
        data = FileDataStream(path, schema=file_schema)

        # without pipeline -- fails
        m = LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        m.fit(data)
        scores1 = m.predict(data)

        # with pipeline -- works
        m = Pipeline([
            LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        ])
        m.fit(data)
        scores2 = m.predict(data)
        diff = np.abs(scores1.values.ravel() -
                      scores2[['PredictedLabel']].values.ravel())
        assert diff.sum() <= 2
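As a side note, the explicit schema string above can often be avoided: FileDataStream.read_csv (used in the later examples) infers the schema from the file. A minimal sketch, assuming the same infert CSV:

data = FileDataStream.read_csv(path)  # schema inferred from the header row
print(data.schema)                    # inspect what was inferred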
Example 2
    def test_combine_with_classifier_trained_with_filedatastream(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(data)

        result_1 = result_1.astype(np.int32)
        result_2 = result_2['PredictedLabel'].astype(np.int32)
        self.assertTrue(result_1.equals(result_2))
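The as_binary_data_stream=True flag above keeps the transform output as a binary IDV stream instead of materializing a pandas DataFrame. A minimal sketch of the default behavior, for contrast:

pdf = transform.fit_transform(FileDataStream.read_csv(path))
print(type(pdf))  # a pandas DataFrame when the flag is left unset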
Example 3
    def test_combine_with_classifier_trained_with_y_arg(self):
        """
        Tests a sequence where the initial transform is computed
        using both X and y input args. Note: any steps after the
        initial transform will operate on data where X
        and y have been combined into one dataset.
        """
        np.random.seed(0)

        df = get_dataset("infert").as_df()

        X = df.loc[:, df.columns != 'case']
        y = df['case']

        transform = OneHotVectorizer() << 'education_str'

        # Passing in both X and y
        df = transform.fit_transform(X, y, as_binary_data_stream=True)

        # NOTE: need to specify the label column here because the
        # feature and label data was joined in the last step.
        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=list(X.columns))
        predictor.fit(df)

        df = transform.transform(X, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(X)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
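A small sketch to make the docstring's point visible; this assumes (not shown in the original test) that when y is passed, the transform output carries the label column, named 'case' after the y Series:

joined = transform.fit_transform(X, y)
print('case' in joined.columns)  # the label was folded into the output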
Example 4
    def test_ngramfeaturizer_syntax_dict(self):

        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << {
            'outg': ['review']
        }
        X = textt.fit_transform(X)

        assert X.shape == (25, 117)
        # columns ordering changed between 0.22 and 0.23
        assert 'review' in (X.columns[0], X.columns[-1])
        X = X.drop('review', axis=1)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        X_test = textt.transform(test_reviews)
        X_test = X_test.drop('review', axis=1)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
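The dict syntax {'outg': ['review']} names the output column, and the generated n-gram features are prefixed accordingly. A quick, illustrative way to confirm:

outg_cols = [c for c in X.columns if c.startswith('outg')]
print(outg_cols[:5])  # n-gram feature columns carrying the 'outg' prefix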
Example 5
    def test_ngramfeaturizer(self):

        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
        X = textt.fit_transform(X)

        assert X.shape == (25, 116)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        X_test = textt.transform(test_reviews)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
        assert X_test.shape[0] == 10
Example 6
    def test_combine_with_classifier_trained_with_joined_X_and_y(self):
        np.random.seed(0)

        infert_df = get_dataset("infert").as_df()
        feature_cols = [c for c in infert_df.columns if c != 'case']

        transform = OneHotVectorizer() << 'education_str'
        df = transform.fit_transform(infert_df, as_binary_data_stream=True)

        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=feature_cols)
        predictor.fit(df)

        df = transform.transform(infert_df, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(infert_df)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
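Once combined, the result behaves like any other nimbusml Pipeline and can be persisted for later scoring. A minimal sketch; the file name is illustrative:

combined_pipeline.save_model('combined_model.zip')

loaded = Pipeline()
loaded.load_model('combined_model.zip')
result_3 = loaded.predict(infert_df)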
Example 7
train_reviews = pandas.DataFrame(data=dict(
    review=[
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"
    ],
    like=[
        True, False, True, False, True, False, True, False, True,
        False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True
    ]))

test_reviews = pandas.DataFrame(
    data=dict(
        review=[
            "This is great",
            "I hate it",
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
X = ngram.fit_transform(X)

# view the transformed numerical values and column names
# print(X.head())

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = ngram.transform(test_reviews)

scores = mymodel.predict(X_test)

# view the scores
# print(scores.head())
Example 8
train_reviews = pandas.DataFrame(data=dict(
    review=[
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"
    ],
    like=[
        True, False, True, False, True, False, True, False, True,
        False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True
    ]))

test_reviews = pandas.DataFrame(data=dict(review=[
    "This is great", "I hate it", "Love it", "Really like it", "I hate it",
    "I like it a lot", "I love it", "I do like it", "I really hate it",
    "I love it"
]))

# OneHotHashVectorizer transform: the entire string is treated as a category.
# If the output column name is the same as the input column, the original
# input column values are replaced. number_of_bits=6 will hash into 2^6-1
# dimensions.

y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

cat = OneHotHashVectorizer(number_of_bits=6) << 'review'
X = cat.fit_transform(X)

# view the transformed numerical values and column names
print(X)

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = cat.transform(test_reviews)

scores = mymodel.predict(X_test)

# view the scores
print(scores)
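number_of_bits bounds the width of the hashed output, so collisions increase as it shrinks. A toy sketch (illustrative data, smaller bit count) to observe the effect:

tiny = pandas.DataFrame(data=dict(review=["Love it", "Hate it", "Love it"]))
hashed = OneHotHashVectorizer(number_of_bits=2) << 'review'
print(hashed.fit_transform(tiny).shape)  # width capped by the 2^2 hash space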
Example 9
pipeline = Pipeline([
    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review'])
])
X = pipeline.fit_transform(X)

print(X.head())
#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0

model = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)

print(result)
# 0     True
# 1    False
# 2     True
# 3     True
# 4    False
# 5     True
# 6     True
# 7     True
# 8    False
# 9     True
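With all_lengths=False the extractor emits only length-3 character n-grams; flipping it to True also emits the shorter ones, widening the output. An illustrative contrast (column counts are data-dependent):

wide = Pipeline([
    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=True,
                   columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review'])
])
print(wide.fit_transform(test_reviews).shape)  # wider than the exact-length run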
# LogisticRegressionBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# remove : and ' ' from column names, and encode categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

lr = LogisticRegressionBinaryClassifier().fit(X_train, y_train)
scores = lr.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test.values == scores.values))
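Beyond the single accuracy number, scikit-learn's metrics (sklearn is already imported above for the split) give a fuller picture; a short sketch:

from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test.values, scores.values))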