Example #1
0
 def test_syntax5_failing(self):
     """Check the ``<<`` dict syntax with a failing and a working mapping.

     Fitting with the source column ``'education1'`` is expected to raise
     a ``RuntimeError`` whose text contains the "Returned code is -1"
     message. Fitting with ``'education'`` is expected to produce a
     result of shape (5, 5).
     """
     _, X, _ = self.get_simple_df()

     failing = OneHotVectorizer() << {'edu1': ['education1']}
     try:
         failing.fit_transform(X, verbose=2)
     except RuntimeError as err:
         expected = "Returned code is -1. Check the log for error messages.."
         assert expected in str(err)
     else:
         # Reaching this point means no exception was raised at all.
         assert False

     working = OneHotVectorizer() << {'edu1': ['education']}
     encoded = working.fit_transform(X)
     assert encoded.shape == (5, 5)
 def test_syntax5_failing(self):
     """Check the ``<<`` dict syntax with a failing and a working mapping.

     Fitting with the source column ``'education1'`` is expected to raise
     a ``RuntimeError`` whose text reports that the input column could
     not be found. Fitting with ``'education'`` is expected to produce a
     result of shape (5, 5).
     """
     _, X, _ = self.get_simple_df()

     failing = OneHotVectorizer() << {'edu1': ['education1']}
     try:
         failing.fit_transform(X, verbose=2)
     except RuntimeError as err:
         expected = ("Error: *** System.ArgumentOutOfRangeException: "
                     "'Could not find input column")
         assert expected in str(err)
     else:
         # Reaching this point means no exception was raised at all.
         assert False

     working = OneHotVectorizer() << {'edu1': ['education']}
     encoded = working.fit_transform(X)
     assert encoded.shape == (5, 5)
Example #3
0
    def test_combined_models_support_decision_function(self):
        """Compare ``decision_function`` of separate and combined models.

        A OneHotVectorizer and a LogisticRegressionBinaryClassifier are
        fitted one after the other on the 'infert' file. The values
        obtained by applying them in sequence must equal those returned
        by the pipeline built with ``Pipeline.combine_models``.
        """
        infert_path = get_dataset('infert').as_filepath()

        train_stream = FileDataStream.read_csv(infert_path)
        encoder = OneHotVectorizer(columns={'edu': 'education'})
        encoded_train = encoder.fit_transform(train_stream,
                                              as_binary_data_stream=True)

        classifier = LogisticRegressionBinaryClassifier(
            feature=['parity', 'edu', 'age', 'induced', 'spontaneous',
                     'stratum', 'pooled.stratum'],
            label='case')
        classifier.fit(encoded_train)

        # Reference result: transform and predictor applied one by one.
        eval_stream = FileDataStream.read_csv(infert_path)
        encoded_eval = encoder.transform(eval_stream,
                                         as_binary_data_stream=True)
        separate_scores = classifier.decision_function(encoded_eval)

        # Same computation through the combined pipeline on raw data.
        raw_stream = FileDataStream.read_csv(infert_path)
        combined = Pipeline.combine_models(encoder, classifier)
        combined_scores = combined.decision_function(raw_stream)

        scores_match = np.array_equal(separate_scores, combined_scores)
        self.assertTrue(scores_match)
Example #4
0
    def test_combine_with_classifier_trained_with_filedatastream(self):
        """Compare ``predict`` of separate and combined models.

        The transform and the classifier are fitted on data read through
        ``FileDataStream``. Predictions from applying them in sequence
        must equal the 'PredictedLabel' column returned by the pipeline
        built with ``Pipeline.combine_models`` (both cast to int32).
        """
        infert_path = get_dataset('infert').as_filepath()

        train_stream = FileDataStream.read_csv(infert_path)
        encoder = OneHotVectorizer(columns={'edu': 'education'})
        encoded_train = encoder.fit_transform(train_stream,
                                              as_binary_data_stream=True)

        classifier = LogisticRegressionBinaryClassifier(
            feature=['parity', 'edu', 'age', 'induced', 'spontaneous',
                     'stratum', 'pooled.stratum'],
            label='case')
        classifier.fit(encoded_train)

        # Reference result: transform and predictor applied one by one.
        eval_stream = FileDataStream.read_csv(infert_path)
        encoded_eval = encoder.transform(eval_stream,
                                         as_binary_data_stream=True)
        separate_pred = classifier.predict(encoded_eval)

        # Same computation through the combined pipeline on raw data.
        raw_stream = FileDataStream.read_csv(infert_path)
        combined = Pipeline.combine_models(encoder, classifier)
        combined_pred = combined.predict(raw_stream)

        # Bring both results to the same dtype before comparing them.
        separate_labels = separate_pred.astype(np.int32)
        combined_labels = combined_pred['PredictedLabel'].astype(np.int32)
        self.assertTrue(separate_labels.equals(combined_labels))
Example #5
0
    def test_combine_with_classifier_trained_with_y_arg(self):
        """
        Combine a transform fitted with separate X and y arguments.

        The initial transform receives both X and y, so every later
        step works on a single dataset in which the features and the
        label have been joined.
        """
        np.random.seed(0)

        infert = get_dataset("infert").as_df()

        not_label = infert.columns != 'case'
        X = infert.loc[:, not_label]
        y = infert['case']

        encoder = OneHotVectorizer() << 'education_str'

        # X and y are both handed to the transform here.
        encoded_train = encoder.fit_transform(X, y,
                                              as_binary_data_stream=True)

        # The label column has to be named explicitly because the
        # previous step joined the feature and label data.
        classifier = LogisticRegressionBinaryClassifier(
            label='case',
            feature=list(X.columns))
        classifier.fit(encoded_train)

        encoded_eval = encoder.transform(X, as_binary_data_stream=True)
        separate_pred = classifier.predict(encoded_eval)

        # Predict again through the combined models.
        combined = Pipeline.combine_models(encoder, classifier)
        combined_pred = combined.predict(X)

        combined_labels = combined_pred['PredictedLabel'].astype(np.float64)
        self.assertTrue(separate_pred.equals(combined_labels))
Example #6
0
    def test_combine_transform_and_predictor(self):
        """Compare a regressor applied after a transform with the combined model.

        Uses the ``train_df`` and ``test_df`` frames defined outside this
        method. The first two predictions of the separately applied
        models must equal the first two 'Score' values returned by the
        pipeline built with ``Pipeline.combine_models``.
        """
        encoder = OneHotVectorizer() << 'c0'
        encoded_train = encoder.fit_transform(train_df,
                                              as_binary_data_stream=True)

        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        regressor.fit(encoded_train)

        encoded_test = encoder.transform(test_df, as_binary_data_stream=True)
        separate_pred = regressor.predict(encoded_test)

        combined = Pipeline.combine_models(encoder, regressor)
        combined_pred = combined.predict(test_df)

        for row in (0, 1):
            self.assertEqual(separate_pred[row],
                             combined_pred.loc[row, 'Score'])
Example #7
0
    def test_fit_predictor_with_idv(self):
        """Fit a predictor pipeline on the binary-stream output of a transform.

        A transform pipeline is fitted on a small frame and its output,
        requested with ``as_binary_data_stream=True``, is used to fit a
        separate predictor pipeline. The 'Score' column predicted for the
        test frame must equal the predictions of a transform and a
        predictor fitted directly on dataframes.
        """
        as_float = {'c1': np.float64, 'c2': np.float64}

        train_df = pd.DataFrame({
            'c0': ['a', 'b', 'a', 'b'],
            'c1': [1, 2, 3, 4],
            'c2': [2, 3, 4, 5]
        }).astype(as_float)

        test_df = pd.DataFrame({
            'c0': ['a', 'b', 'b'],
            'c1': [1.5, 2.3, 3.7],
            'c2': [2.2, 4.9, 2.7]
        }).astype(as_float)

        # Transform pipeline, fitted on the training frame.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'])
        transform_pipeline.fit(train_df)
        train_stream = transform_pipeline.transform(
            train_df, as_binary_data_stream=True)

        # Predictor pipeline, fitted on the transformed binary stream.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        predictor_pipeline = Pipeline([regressor])
        predictor_pipeline.fit(train_stream)

        # Predict for the test frame with the two pipelines in sequence.
        test_stream = transform_pipeline.transform(
            test_df, as_binary_data_stream=True)
        actual = predictor_pipeline.predict(test_stream)

        # Expected result: same steps fitted directly on dataframes.
        reference_xf = OneHotVectorizer() << 'c0'
        encoded_train = reference_xf.fit_transform(train_df)
        reference = OnlineGradientDescentRegressor(
            label='c2', feature=['c0.a', 'c0.b', 'c1'])
        reference.fit(encoded_train)
        encoded_test = reference_xf.transform(test_df)
        expected = reference.predict(encoded_test)

        actual_scores = actual.loc[:, 'Score']
        self.assertTrue(actual_scores.equals(expected))
    def test_fit_transform(self):
        """Check that ``fit_transform`` equals ``fit`` followed by ``transform``.

        Both are run on the same ``FileDataStream`` of the 'infert' file
        and the two resulting frames must be equal.
        """
        # Input data, read as a FileDataStream.
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream.read_csv(infert_path)

        # Output column name -> source column name.
        column_map = {
            'edu': 'education',
            'in': 'induced',
            'sp': 'spontaneous',
        }
        encoder = OneHotVectorizer(columns=column_map)

        # One call versus two chained calls.
        one_step = encoder.fit_transform(stream)
        fitted = encoder.fit(stream)
        two_steps = fitted.transform(stream)

        assert_frame_equal(one_step, two_steps)
Example #9
0
    def test_combine_with_classifier_trained_with_joined_X_and_y(self):
        """Combine models trained on one frame holding features and label.

        The whole 'infert' dataframe, including the 'case' label column,
        is passed to the transform. Predictions from applying transform
        and classifier in sequence must equal the 'PredictedLabel' column
        of the pipeline built with ``Pipeline.combine_models``.
        """
        np.random.seed(0)

        infert_df = get_dataset("infert").as_df()
        feature_cols = [name for name in infert_df.columns if name != 'case']

        encoder = OneHotVectorizer() << 'education_str'
        encoded_train = encoder.fit_transform(infert_df,
                                              as_binary_data_stream=True)

        classifier = LogisticRegressionBinaryClassifier(label='case',
                                                        feature=feature_cols)
        classifier.fit(encoded_train)

        encoded_eval = encoder.transform(infert_df,
                                         as_binary_data_stream=True)
        separate_pred = classifier.predict(encoded_eval)

        # Predict again through the combined models.
        combined = Pipeline.combine_models(encoder, classifier)
        combined_pred = combined.predict(infert_df)

        combined_labels = combined_pred['PredictedLabel'].astype(np.float64)
        self.assertTrue(separate_pred.equals(combined_labels))
Example #10
0
 def test_syntax6_passing(self):
     """Check that ``<<`` accepts a list of column names.

     Fits a OneHotVectorizer on ``['education', 'education2']`` and
     expects a result of shape (5, 5).
     """
     _, X, _ = self.get_simple_df()
     selected = ['education', 'education2']
     encoder = OneHotVectorizer() << selected
     encoded = encoder.fit_transform(X)
     assert encoded.shape == (5, 5)
Example #11
0
    def test_syntax_slots_wo_pipeline(self):
        """Exercise feature selection on one-hot "slot" columns.

        Steps, all on the 'infert' dataframe:

        * one-hot encode 'age', 'parity' and 'education_str' and check
          that the slot column ``age.21`` exists;
        * try to fit a classifier with ``feature=['age']`` directly on the
          encoded frame (any exception is printed and swallowed);
        * fit the same transform and learner inside a ``Pipeline`` and
          check the shapes of the predictions, scores and metrics;
        * relabel the encoded columns with a two-level ``MultiIndex``
          (column name, slot) and try to fit on it again, printing any
          exception instead of failing.
        """
        # data
        df = get_dataset("infert").as_df()
        df = df.drop(['row_num', ], axis=1)
        X = df.drop('case', axis=1)
        y = df['case']

        # transform
        xf1 = OneHotVectorizer(columns=['age', 'parity', 'education_str'])
        X_xf1 = xf1.fit_transform(X, verbose=0)
        assert "age.21" in list(X_xf1.columns)

        # learner
        # (1 .a.)
        model = AveragedPerceptronBinaryClassifier()

        # (1. b)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            cont = True
            assert False
        except Exception as e:
            # does not work
            # NOTE(review): ``except Exception`` also catches the
            # AssertionError raised by ``assert False`` above, so ``cont``
            # is always False once this block has run.
            cont = False
            print(e)

        if cont:
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)

        pipeline = Pipeline([
            OneHotVectorizer(columns=['age', 'parity', 'education_str']),
            AveragedPerceptronBinaryClassifier(feature='age')
        ])

        pipeline.fit(X, y, verbose=0)

        y_pred_withpipeline = pipeline.predict(X)
        print(y_pred_withpipeline.head())
        assert y_pred_withpipeline.shape == (248, 3)

        metrics, scores = pipeline.test(X, y, output_scores=True)
        print(metrics)
        assert scores.shape == (248, 3)
        assert metrics.shape == (1, 11)

        # back to X_xf1
        print(list(X_xf1.columns))
        # Second level: '' for columns without a slot, then every distinct
        # last dotted component of the column names.
        slot_names = list(sorted(set(
            col.split('.')[-1] for col in X_xf1.columns)))
        levels = [['age', 'education', 'education_str', 'parity',
                   'pooled', 'spontaneous', 'stratum', 'induced'],
                  [''] + slot_names]
        names = ['columns', 'slots']
        labels = [[], []]
        ages = []
        for col in X_xf1.columns:
            spl = col.split('.')
            col_idx = levels[0].index(spl[0])
            try:
                slot_idx = levels[1].index(spl[1])
            except IndexError:
                # No '.' in the column name: use the empty slot.
                slot_idx = levels[1].index('')
            labels[0].append(col_idx)
            labels[1].append(slot_idx)
            if spl[0] == 'age':
                ages.append(slot_idx)
        # pandas renamed the ``labels`` argument of MultiIndex to ``codes``
        # and later removed ``labels``; try the current name first and fall
        # back to the old one for pandas versions that predate ``codes``.
        try:
            X_xf1.columns = pandas.MultiIndex(
                levels=levels, codes=labels, names=names)
        except TypeError:
            X_xf1.columns = pandas.MultiIndex(
                levels=levels, labels=labels, names=names)
        print(X_xf1.head(n=2).T)

        col_ages = [('age', a) for a in ages]
        print(col_ages)
        try:
            model = AveragedPerceptronBinaryClassifier(feature=col_ages)
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work, probably confusion between list and tuple in nimbusml
            print(e)

        try:
            model = AveragedPerceptronBinaryClassifier(feature=['age'])
            model.fit(X_xf1, y, verbose=0)
            y_pred = model.predict(X_xf1)
            assert y_pred.shape == (248, 3)
        except Exception as e:
            # Does not work.
            print(e)
Example #12
0
# Input data, read as a FileDataStream.
path = get_dataset('infert').as_filepath()
# 'spontaneous' is read as text (original note: error with numeric input
# for the one-hot vectorizer).
csv_dtypes = {'spontaneous': str}
data = FileDataStream.read_csv(path, sep=',', dtype=csv_dtypes)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Transform usage.
# Set output_kind = "Bag" to featurize slots independently for vector type
# columns.
# Output column name -> source column name.
column_map = {
    'edu': 'education',
    'in': 'induced',
    'sp': 'spontaneous',
}
xf = OneHotVectorizer(columns=column_map)

# Fit the transform and apply it in one call.
features = xf.fit_transform(data)
print(features.head())

#    age  case  edu.0-5yrs  edu.12+ yrs  edu.6-11yrs education   in.0  in.1 ...
# 0   26     1         1.0          0.0          0.0    0-5yrs    0.0   1.0 ...
# 1   42     1         1.0          0.0          0.0    0-5yrs    0.0   1.0 ...
# 2   39     1         1.0          0.0          0.0    0-5yrs    0.0   0.0 ...