Example #1
    def test_schema__repr(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path,
                                       sep=',',
                                       numeric_dtype=numpy.float32)
        assert str(
            data.schema) == "col=row_num:R4:0 col=education:TX:1 " \
                            "col=age:R4:2 col=parity:R4:3 " \
                            "col=induced:R4:4 col=case:R4:5 " \
                            "col=spontaneous:R4:6 col=stratum:R4:7 " \
                            "col=pooled.stratum:R4:8 header=+ sep=,"
        assert "DataSchema([DataColumn(name='row_num', type='R4', " \
               "pos=0)" in str(repr(data.schema))

        path = get_dataset('topics').as_filepath()
        data = FileDataStream.read_csv(path,
                                       sep=',',
                                       numeric_dtype=numpy.float32,
                                       collapse=True)
        assert str(
            data.schema) == "col=review:TX:0-1 col=label:R4:2 header=+ " \
                            "sep=,"
        assert "DataSchema([DataColumn(name='review', type='TX', pos=(0," \
               " 1))" in str(repr(data.schema))

        path = get_dataset('topics').as_filepath()
        data = FileDataStream.read_csv(path,
                                       sep=',',
                                       numeric_dtype=numpy.float32,
                                       collapse=False)
        assert str(
            data.schema) == "col=review:TX:0 col=review_reverse:TX:1 " \
                            "col=label:R4:2 header=+ sep=,"
        assert "DataSchema([DataColumn(name='review', type='TX', pos=0)," \
               in str(repr(data.schema))
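
With collapse=True, read_csv merges the adjacent same-typed text columns 0 and
1 into the single vector slot range review:TX:0-1. A minimal sketch, reusing
only the FileDataStream constructor from Examples #10-#12: the collapsed
schema string can also be written by hand and passed in directly.

path = get_dataset('topics').as_filepath()
schema = "col=review:TX:0-1 col=label:R4:2 header=+ sep=,"
data = FileDataStream(path, schema)  # no inference pass over the file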
Example #2
    def test_combine_with_classifier_trained_with_filedatastream(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(data)

        result_1 = result_1.astype(np.int32)
        result_2 = result_2['PredictedLabel'].astype(np.int32)
        self.assertTrue(result_1.equals(result_2))
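
Pipeline.combine_models returns an ordinary Pipeline, so the fused
transform-plus-predictor can be persisted and reloaded. A hedged sketch,
assuming Pipeline's save_model counterpart to the load_model call shown in
Example #7; the 'combined.zip' path is hypothetical.

        combined_pipeline.save_model('combined.zip')  # hypothetical path

        reloaded_pipeline = Pipeline()
        reloaded_pipeline.load_model('combined.zip')
        result_3 = reloaded_pipeline.predict(FileDataStream.read_csv(path))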
Example #3
    def test_combined_models_support_decision_function(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.decision_function(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.decision_function(data)

        self.assertTrue(np.array_equal(result_1, result_2))
Example #4
    def test_schema_tab(self):
        train_file = get_dataset('topics').as_filepath()

        train_file_stream = FileDataStream.read_csv(train_file,
                                                    sep=',',
                                                    names={
                                                        0: 'review',
                                                        1: 'review_reverse',
                                                        2: 'label'
                                                    })
        with open(train_file, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        header = first_line.strip(' \n\r').split(',')

        assert header == ['review', 'review_reverse', 'label']
        print(str(train_file_stream.schema))
        assert str(
            train_file_stream.schema) == 'col=review:TX:0 ' \
                                         'col=review_reverse:TX:1 ' \
                                         'col=label:I8:2 header=+ sep=,'

        train_file_stream = FileDataStream.read_csv(
            train_file,
            sep=',',
            names={
                0: 'review',
                1: 'review_reverse',
                2: 'label'
            },
            dtype={'label': numpy.uint32})
        assert str(
            train_file_stream.schema) == 'col=review:TX:0 ' \
                                         'col=review_reverse:TX:1 ' \
                                         'col=label:U4:2 header=+ sep=,'
Example #5
    @classmethod
    def setUpClass(self):
        adult_path = get_dataset('uciadult_train').as_filepath()
        self.classification_data = FileDataStream.read_csv(adult_path)
        binary_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                               label='label',
                                               number_of_threads=1)
        ])
        self.binary_model = binary_pipeline.fit(self.classification_data)
        self.binary_pfi = self.binary_model.permutation_feature_importance(
            self.classification_data)
        classifier_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(feature=['age', 'education'],
                                 label='label',
                                 number_of_threads=1,
                                 shuffle=False)
        ])
        self.classifier_model = classifier_pipeline.fit(
            self.classification_data)
        self.classifier_pfi = self.classifier_model.permutation_feature_importance(
            self.classification_data)

        infert_path = get_dataset('infert').as_filepath()
        self.regression_data = FileDataStream.read_csv(infert_path)
        regressor_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(feature=['induced', 'education'],
                                label='age',
                                number_of_threads=1,
                                shuffle=False)
        ])
        self.regressor_model = regressor_pipeline.fit(self.regression_data)
        self.regressor_pfi = self.regressor_model.permutation_feature_importance(
            self.regression_data)

        ticket_path = get_dataset('gen_tickettrain').as_filepath()
        self.ranking_data = FileDataStream.read_csv(ticket_path)
        ranker_pipeline = Pipeline([
            ToKey(columns=['group']),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group',
                           random_state=0,
                           number_of_threads=1)
        ])
        self.ranker_model = ranker_pipeline.fit(self.ranking_data)
        self.ranker_pfi = self.ranker_model.permutation_feature_importance(
            self.ranking_data)
Example #6
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #7
    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(data)

        feature_cols = ['education', 'age']
        training_pipeline = Pipeline([
            DatasetTransformer(featurization_pipeline.model),
            OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                feature=feature_cols,
                                label='induced')
        ])
        training_pipeline.fit(data, output_predictor_model=True)

        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concat_pipeline.fit(featurized_data)

        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(training_pipeline.predictor_model)

        concat_and_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)

        result = concat_and_predictor_pipeline.predict_proba(featurized_data)
        self.assertEqual(result.shape[1], 3)
Example #8
    def test_data_stream(self):
        df = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 0.2]))
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            df.to_csv(f, sep=',', index=False)

        fi = FileDataStream.read_csv(f.name, sep=',')
        fi2 = fi.clone()
        assert repr(fi) == repr(fi2)
        os.remove(f.name)
Example #9
 def test_defaults(self):
     schema = DataSchema.read_schema(infert_file, numeric_dtype=np.float32)
     data = FileDataStream.read_csv(infert_file, schema=schema)
     pipeline_steps = [
         OneHotVectorizer(columns={'edu': 'education'}),
         KMeansPlusPlus(
             n_clusters=5,
             feature=['edu', 'age', 'parity', 'spontaneous', 'stratum'])
     ]
     check_cv(pipeline_steps, data)
Example #10
 def test_schema_airquality(self):
     train_file = get_dataset("airquality").as_filepath()
     found = DataSchema.read_schema(train_file)
     schema = "col=Unnamed0:I8:0 col=Ozone:R8:1 col=Solar_R:R8:2 " \
              "col=Wind:R8:3 col=Temp:I8:4 col=Month:I8:5 " \
              "col=Day:I8:6 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file)
     assert str(fds.schema) == schema
Example #11
 def test_schema_infert_R4(self):
     train_file = get_dataset("infert").as_filepath()
     found = DataSchema.read_schema(train_file, numeric_dtype=numpy.float32)
     schema = "col=row_num:R4:0 col=education:TX:1 col=age:R4:2 " \
              "col=parity:R4:3 col=induced:R4:4 " + \
              "col=case:R4:5 col=spontaneous:R4:6 col=stratum:R4:7 " \
              "col=pooled.stratum:R4:8 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file, numeric_dtype=numpy.float32)
     assert str(fds.schema) == schema
Example #12
 def test_schema_infert(self):
     train_file = get_dataset("infert").as_filepath()
     found = DataSchema.read_schema(train_file)
     schema = "col=row_num:I8:0 col=education:TX:1 col=age:I8:2 " \
              "col=parity:I8:3 col=induced:I8:4 " + \
              "col=case:I8:5 col=spontaneous:I8:6 col=stratum:I8:7 " \
              "col=pooled.stratum:I8:8 header=+"
     assert str(found) == schema
     fds = FileDataStream(train_file, schema)
     assert str(fds.schema) == schema
     fds = FileDataStream.read_csv(train_file)
     assert str(fds.schema) == schema
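
The type codes in these schema strings follow ML.NET's conventions: I8 is a
64-bit integer, R4 a 32-bit float, R8 a 64-bit float, U4 a 32-bit unsigned
integer (see Example #4), and TX text; header=+ records that the file carries
a header row. A minimal sketch, reusing the per-column dtype override from
Example #4 to force a single column to R4:

train_file = get_dataset("infert").as_filepath()
data = FileDataStream.read_csv(train_file, dtype={'age': numpy.float32})
assert 'col=age:R4:2' in str(data.schema)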
Example #13
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                IidSpikeDetector(columns=['t2', 't3'], pvalue_history_length=5)
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
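
The same check (here and in the identical guard of Example #18) reads more
idiomatically with unittest's assertRaisesRegex; a minimal sketch of the
equivalent method body:

        with self.assertRaisesRegex(RuntimeError,
                                    'Only one column is allowed'):
            pipeline = Pipeline([
                IidSpikeDetector(columns=['t2', 't3'],
                                 pvalue_history_length=5)
            ])
            pipeline.fit_transform(data)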
Example #14
    def test_different_schema_with_filedatastream_input(self):
        train_filename = "train-data.csv"
        train_df.to_csv(train_filename, index=False, header=True)
        train_data_stream = FileDataStream.read_csv(train_filename, sep=',', header=True)

        test_filename = "test-data.csv"
        test_df.to_csv(test_filename, index=False, header=True)
        test_data_stream = FileDataStream.read_csv(test_filename, sep=',', header=True)

        # Create reference pipeline
        std_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)

        std_pipeline.fit(train_data_stream)
        result_1 = std_pipeline.predict(test_data_stream)

        # Create combined pipeline
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        transform_pipeline.fit(train_data_stream)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_data_stream)

        os.remove(transform_pipeline.model)

        result_2 = combined_pipeline.predict(test_data_stream)

        self.assertTrue(result_1.equals(result_2))

        os.remove(train_filename)
        os.remove(test_filename)
Example #15
    def test_schema_with_vectorized_column(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5
        # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9
        # col=pooled.stratum:I8:10 quote+
        schema = featurized_data.schema

        self.assertEqual(len(schema), 9)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        self.assertEqual(schema['education'].Type, 'R4')
        self.assertEqual(schema['education'].Name, 'education')
        self.assertEqual(len(schema['education'].Pos), 3)
        self.assertEqual(schema['education'].IsVector, True)

        self.assertTrue('education.0-5yrs' not in schema)
        self.assertTrue('education.6-11yrs' not in schema)
        self.assertTrue('education.12+yrs' not in schema)

        # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2
        # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6
        # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10
        # quote+ header=+
        schema = featurized_data.get_dataframe_schema()

        self.assertEqual(len(schema), 11)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        self.assertTrue('education' not in schema)
        self.assertTrue('education.0-5yrs' in schema)
        self.assertTrue('education.6-11yrs' in schema)
        self.assertTrue('education.12+yrs' in schema)

        self.assertEqual(schema['education.0-5yrs'].Type, 'R4')
        self.assertEqual(schema['education.0-5yrs'].Name, 'education.0-5yrs')
        self.assertEqual(schema['education.0-5yrs'].IsVector, False)
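
The two views differ only in presentation: the IDV schema keeps 'education'
as a single three-slot vector column, while the dataframe schema expands each
slot into its own column. A minimal sketch of that relationship, using only
the accessors exercised above:

        idv_schema = featurized_data.schema
        df_schema = featurized_data.get_dataframe_schema()

        # 11 dataframe columns = 9 IDV columns, minus the one vector
        # column, plus its three expanded slots
        assert len(df_schema) == \
            len(idv_schema) - 1 + len(idv_schema['education'].Pos)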
Example #16
    def test_fit_transform(self):
        import azureml.dataprep as dprep

        path = get_dataset('infert').as_filepath()
        dflow = dprep.auto_read_file(path=path)
        dprep_data = DprepDataStream(dflow)
        file_data = FileDataStream.read_csv(path)

        xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})
        pipe = Pipeline([xf])
        transformed_data = pipe.fit_transform(file_data)
        transformed_data1 = pipe.fit_transform(dprep_data)

        assert_array_equal(transformed_data.columns, transformed_data1.columns)
        assert_2d_array_equal(transformed_data.values,
                              transformed_data1.values)
Example #17
    def test_fit_transform(self):
        # data input (as a FileDataStream)
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        # transform usage
        xf = OneHotVectorizer(
            columns={
                'edu': 'education',
                'in': 'induced',
                'sp': 'spontaneous'})

        # fit and transform
        res1 = xf.fit_transform(data)
        res2 = xf.fit(data).transform(data)
        assert_frame_equal(res1, res2)
Example #18
    def test_multiple_user_specified_columns_is_not_allowed(self):
        path = get_dataset('timeseries').as_filepath()
        data = FileDataStream.read_csv(path)

        try:
            pipeline = Pipeline([
                SsaForecaster(series_length=8,
                              train_size=15,
                              window_size=5,
                              horizon=2,
                              columns=['t2', 't3'])
            ])
            pipeline.fit_transform(data)

        except RuntimeError as e:
            self.assertTrue('Only one column is allowed' in str(e))
            return

        self.fail()
Example #19
    def test_numeric_columns(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path, sep=',',
                                       numeric_dtype=np.float32)

        xf = OneHotHashVectorizer(
            columns={
                'edu': 'education',
                'in': 'induced',
                'sp': 'spontaneous'},
            number_of_bits=2)
        xf.fit_transform(data)

        xf = OneHotHashVectorizer(
            columns=[
                'education',
                'induced',
                'spontaneous'],
            number_of_bits=2)
        xf.fit_transform(data)
Example #20
    def test_data_stream_head_file(self):
        df = pandas.DataFrame(dict(a=[0, 1], b=[0.1, 0.2]))
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
            df.to_csv(f, sep=',', index=False)

        df1 = df.head(1)
        df2 = df[1:].reset_index(drop=True)

        fi = FileDataStream.read_csv(f.name, sep=',')
        head = fi.head(1)
        head2 = fi.head(1, 1)
        assert_frame_equal(head, df1)
        assert_frame_equal(head2, df2)
        head3 = fi.head(1, 1, collect=False).transform(fi, verbose=0)
        assert_frame_equal(head3, df2)

        dff = fi.to_df()
        assert_frame_equal(df, dff)

        os.remove(f.name)
Example #21
 def test_schema_sep_default(self):
     data = pandas.DataFrame(
         dict(real=[0.1, 2.2], text=['word', 'class'], y=[1, 3]))
     data.to_csv('data.csv', index=False, header=True)
     ds = FileDataStream.read_csv('data.csv',
                                  collapse=False,
                                  numeric_dtype=numpy.float32)
     assert str(
         ds.schema) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 header=+"
     assert ds.schema.to_string() == "col=real:R4:0 col=text:TX:1 " \
                                     "col=y:R4:2 header=+"
     assert ds.schema.to_string(
         add_sep=True) == "col=real:R4:0 col=text:TX:1 col=y:R4:2 " \
                          "header=+ sep=,"
     exp = Pipeline([
         OneHotVectorizer(columns=['text']),
         LightGbmRegressor(min_data_per_leaf=1)
     ])
     exp.fit(ds, 'y')
     pred = exp.predict(ds)
     assert pred is not None
     assert len(pred) > 0
Example #22
 def test_model_summary_not_supported_specific(self):
     path = get_dataset('infert').as_filepath()
     data = FileDataStream.read_csv(path,
                                    sep=',',
                                    names={
                                        0: 'row_num',
                                        5: 'case'
                                    })
     pipeline = Pipeline([
         OneHotVectorizer(columns={'edu': 'education'}),
         FactorizationMachineBinaryClassifier(
             feature=['induced', 'edu', 'parity'], label='case')
     ])
     pipeline.fit(data)
     try:
         pipeline.summary()
     except TypeError as e:
         self.assertEqual(
             e.args[0],
             "One or more predictors in this pipeline do not support the .summary() function."
         )
     else:
         assert False
Example #23
 def test_metrics_evaluate_binary_from_filedatastream(self):
     path = get_dataset('infert').as_filepath()
     data = FileDataStream.read_csv(path)
     e = Pipeline([
         OneHotVectorizer(columns={'edu': 'education'}),
         LightGbmRegressor(feature=['induced', 'edu'],
                           label='age',
                           number_of_threads=1)
     ])
     e.fit(data, verbose=0)
     metrics, _ = e.test(data)
      # TODO: debug fluctuations and increase decimal precision on checks
     assert_almost_equal(metrics['L1(avg)'][0],
                         4.104164,
                         decimal=4,
                         err_msg="L1 loss should be %s" % 4.104164)
     assert_almost_equal(metrics['L2(avg)'][0],
                         24.15286,
                         decimal=4,
                         err_msg="L2(avg) should be %s" % 24.15286)
     assert_almost_equal(metrics['Loss-fn(avg)'][0],
                         24.15286,
                         decimal=4,
                         err_msg="Loss-fn(avg)loss should be %s" % 24.15286)
Example #24
    def test_fit_transform_with_idv(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        schema = featurized_data.schema
        num_columns = len(schema)
        self.assertTrue('case' in schema)
        self.assertTrue('row_num' in schema)

        pipeline = Pipeline([ColumnDropper() << ['case', 'row_num']])
        pipeline.fit(featurized_data)
        result = pipeline.transform(featurized_data,
                                    as_binary_data_stream=True)

        schema = result.schema
        self.assertEqual(len(schema), num_columns - 2)
        self.assertTrue('case' not in schema)
        self.assertTrue('row_num' not in schema)
Example #25
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.preprocessing import DatasetTransformer
from data_frame_tool import DataFrameTool as DFT


def get_tmp_file(suffix=None):
    fd, file_name = tempfile.mkstemp(suffix=suffix)
    fl = os.fdopen(fd, 'w')
    fl.close()
    return file_name

path = get_dataset("wiki_detox_train").as_filepath()
train_set = FileDataStream.read_csv(path, sep='\t')
path = get_dataset("wiki_detox_test").as_filepath()
test_set = FileDataStream.read_csv(path, sep='\t')

class TestOnnxRuntime(unittest.TestCase):
    """
    Tests automl use case:
        1. Fit featurization pipeline separately.
        2. Fit learner on top of the featurization pipeline.
        3. Export to ONNX the learner pipeline.
        4. Compare results between ML.NET and ORT
    """

    @unittest.skipIf(six.PY2, "Disabled as there is no onnxruntime package for Python 2.7")
    def test_automl_usecase(self):
        # train featurization pipeline
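
The snippet is truncated above, but steps 3 and 4 of the docstring would
roughly take the following shape. This is a hedged sketch: learner_pipeline
is a hypothetical stand-in for the fitted pipeline, and the export_to_onnx
arguments are assumptions based on recent nimbusml releases, not code from
this test.

import onnxruntime  # ONNX Runtime, for step 4

onnx_path = get_tmp_file('.onnx')
# step 3: export the fitted learner pipeline (hypothetical name) to ONNX
learner_pipeline.export_to_onnx(onnx_path, 'com.microsoft.ml')
# step 4: score the exported model with ORT and compare against ML.NET
session = onnxruntime.InferenceSession(onnx_path)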
Example #26
###############################################################################
# GamBinaryClassifier
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    GamBinaryClassifier(feature=['age', 'edu'], label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel     Score
Example #27
###############################################################################
# ToKey
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import ToKey

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
                               names={0: 'id'})
print(data.head())
#    age  case education   id  induced  parity  pooled.stratum  spontaneous ...
# 0  26.0   1.0    0-5yrs  1.0      1.0     6.0             3.0         2.0 ...
# 1  42.0   1.0    0-5yrs  2.0      1.0     1.0             1.0         0.0 ...
# 2  39.0   1.0    0-5yrs  3.0      2.0     6.0             4.0         0.0 ...
# 3  34.0   1.0    0-5yrs  4.0      2.0     4.0             2.0         0.0  ..
# 4  35.0   1.0   6-11yrs  5.0      1.0     3.0            32.0         1.0  ..

# transform usage
xf = ToKey(columns={'id_1': 'id', 'edu_1': 'education'})

# fit and transform
features = xf.fit_transform(data)
print(features.head())
#    age  case    edu_1 education   id  id_1  induced  parity  ...
# 0  26.0   1.0   0-5yrs    0-5yrs  1.0     0      1.0     6.0 ...
# 1  42.0   1.0   0-5yrs    0-5yrs  2.0     1      1.0     1.0 ...
# 2  39.0   1.0   0-5yrs    0-5yrs  3.0     2      2.0     6.0 ...
# 3  34.0   1.0   0-5yrs    0-5yrs  4.0     3      2.0     4.0 ...
Example #28

import unittest

import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream, BinaryDataStream
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import FastLinearRegressor
from nimbusml.preprocessing.normalization import MinMaxScaler
from sklearn.utils.testing import assert_true, assert_array_equal

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(
    path,
    sep=',',
    numeric_dtype=np.float32)  # Error with integer input

def is_nan(x):
    return (x is np.nan or x != x)

def assert_2d_array_equal(actual, desired):
    if len(actual) != len(desired):
        assert_true(False, "arrays are of different lengths.")

    for i in range(len(actual)):
        if len(actual[i]) != len(desired[i]):
            assert_true(False, "arrays are of different lengths.")
        for y in range(len(actual[i])):
            if is_nan(actual[i][y]) and is_nan(desired[i][y]):
                continue
            assert_true(actual[i][y] == desired[i][y],
                        "arrays are different at (%d, %d)." % (i, y))
Example #29
###############################################################################
# RangeFilter
import numpy as np
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.filter import RangeFilter

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, numeric_dtype=np.float32)
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage
xf = RangeFilter(min=20, max=30, columns='age')

# fit and transform, rows with age outside the range will be deleted
features = xf.fit_transform(data)
print(features.head())
#    age  case education  id  induced  parity  pooled.stratum  ...
# 0  26.0     1    0-5yrs   1        1       6               3 ...
# 1  23.0     1   6-11yrs   7        0       1               6 ...
# 2  21.0     1   6-11yrs   9        0       1               5 ...
# 3  28.0     1   6-11yrs  10        0       2              19 ...
# 4  29.0     1   6-11yrs  11        1       2              20 ...
Example #30
###############################################################################
# ColumnConcatenator
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnConcatenator

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32)
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage
xf = ColumnConcatenator(columns={'features': ['age', 'parity', 'induced']})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
# 'features' is a vector-type column. When a dataset with a vector-type
# column is the final output, the vector column is expanded into multiple
# output columns, one per slot.
#    age  case education  features.age  features.induced  features.parity  ...