Example #1
    def test_vector_column_combined_with_single_value_columns(self):
        train_data = {
            'c1': [1, 0, 0, 4],
            'c2': [2, 3, 0, 5],
            'c3': [3, 4, 5, 6]
        }
        train_df = pd.DataFrame(train_data).astype(np.float32)

        xf = ColumnConcatenator(columns={'features': ['c1', 'c2', 'c3']})
        xf.fit(train_df)
        result = xf.transform(train_df, as_csr=True)

        self.assertEqual(result.nnz, 18)
        self.assertTrue(type(result) == csr_matrix)
        result = pd.DataFrame(result.todense())

        train_data = {
            0: [1, 0, 0, 4],
            1: [2, 3, 0, 5],
            2: [3, 4, 5, 6],
            3: [1, 0, 0, 4],
            4: [2, 3, 0, 5],
            5: [3, 4, 5, 6]
        }
        expected_result = pd.DataFrame(train_data).astype(np.float32)
        self.assertTrue(result.equals(expected_result))
Example #2
    def test_default_label(self):
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]

        # 1: Label and Feature roles both specified explicitly
        pipeline = Pipeline([
            ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            },
            FastTreesBinaryClassifier(number_of_trees=2) << {
                Role.Label: 'Label',
                Role.Feature: 'Features'
            }
        ])

        model = pipeline.fit(df, verbose=0)
        probabilities0 = model.predict_proba(df)

        # 2: only the Feature role specified; Label defaults to the
        # 'Label' column
        pipeline = Pipeline([
            ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            },
            FastTreesBinaryClassifier(number_of_trees=2) << {
                Role.Feature: 'Features'
            }
        ])

        model = pipeline.fit(df, verbose=0)
        probabilities = model.predict_proba(df)
        assert_array_almost_equal(probabilities0, probabilities)

        # 3: no roles specified; Label and Feature both default
        pipeline = Pipeline([
            ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            },
            FastTreesBinaryClassifier(number_of_trees=2)
        ])

        model = pipeline.fit(df, verbose=0)
        probabilities = model.predict_proba(df)
        assert_array_almost_equal(probabilities0, probabilities)

        # 4: only the Label role specified; Feature defaults to 'Features'
        pipeline = Pipeline([
            ColumnConcatenator() << {
                'Features': ["Petal_Length", "Sepal_Length"]
            },
            FastTreesBinaryClassifier(number_of_trees=2) << {
                Role.Label: 'Label'
            }
        ])

        model = pipeline.fit(df, verbose=0)
        probabilities = model.predict_proba(df)
        assert_array_almost_equal(probabilities0, probabilities)
Example #3
 def test_columns_concatenator(self):
     path = get_dataset('infert').as_filepath()
     file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                   'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                   'col=spontaneous:R4:6 header=+'
     data = FileDataStream(path, schema=file_schema)
     xf = ColumnConcatenator(
         columns={'features': ['age', 'parity', 'induced']})
     features = xf.fit_transform(data)
     assert features.shape == (248, 10)
     # columns ordering changed between 0.22 and 0.23
     assert set(features.columns) == {
         'age', 'case', 'education', 'features.age', 'features.induced',
         'features.parity', 'id', 'induced', 'parity', 'spontaneous'
     }
Example #4
 def test_PcaTransformer_int(self):
     df_ = get_dataset("infert").as_df()
     res = {}
     dt = {}
     for ty in (int, float):
         df = df_.copy()
         df['age'] = df['age'].astype(ty)
         df['parity'] = df['parity'].astype(ty)
         df['spontaneous'] = df['spontaneous'].astype(ty)
         df['stratum'] = df['stratum'].astype(ty)
         X = ['age', 'parity', 'spontaneous', 'stratum']
         pipe = Pipeline([
             ColumnConcatenator() << {
                 'X': X
             },
             PcaTransformer(rank=3) << 'X'
         ])
         y = pipe.fit_transform(df[X], verbose=0)
         res[ty] = y.sum().sum()
         dt[ty] = list(y.dtypes)
     vals = list(res.values())
     assert_almost_equal(vals[0], vals[1])
     dt = list(dt.values())
     dt[0].sort()
     dt[1].sort()
     assert dt[0] != dt[1]
Example #5
    def test_lpscaler_automatically_converts_to_single(self):
        in_df = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[0, 2.5, 2.6, 2.4],
                      Species=["setosa", "viginica", "setosa", 'versicolor']))

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float64)

        src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length']

        pipeline = Pipeline([
            ColumnConcatenator() << {
                'concat': src_cols
            },
            LpScaler() << {
                'norm': 'concat'
            }
        ])
        out_df = pipeline.fit_transform(in_df)

        cols = ['concat.' + s for s in src_cols]
        cols.extend(['norm.' + s for s in src_cols])
        sum = out_df[cols].sum().sum()
        sum_range = (23.24, 23.25)
        assert_greater(sum, sum_range[0],
                       "sum should be greater than %s" % sum_range[0])
        assert_less(sum, sum_range[1],
                    "sum should be less than %s" % sum_range[1])
Example #6
    def test_globalcontrastrowscaler(self):
        in_df = pd.DataFrame(
            data=dict(Sepal_Length=[2.5, 1, 2.1, 1.0],
                      Sepal_Width=[.75, .9, .8, .76],
                      Petal_Length=[0, 2.5, 2.6, 2.4],
                      Species=["setosa", "viginica", "setosa", 'versicolor']))

        in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32)

        # generate two new Columns - Petal_Normed and Sepal_Normed
        concat = ColumnConcatenator() << {
            'concated_columns':
            ['Petal_Length', 'Sepal_Width', 'Sepal_Length']
        }

        # Performs a global contrast normalization on input values:
        # Y = (s * X - M) / D, where s is a scale, M is the mean and D is
        # either the L2 norm or the standard deviation (see the sketch after
        # this example)
        normed = GlobalContrastRowScaler() << {
            'normed_columns': 'concated_columns'
        }

        pipeline = Pipeline([concat, normed])
        out_df = pipeline.fit_transform(in_df)
        cols = [
            'concated_columns.' + s
            for s in ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
        ]
        cols.extend([
            'normed_columns.' + s
            for s in ['Sepal_Length', 'Sepal_Width', 'Petal_Length']
        ])
        sum = out_df[cols].sum().sum()
        assert_greater(sum, 17.309, "sum should be greater than %s" % 17.309)
        assert_less(sum, 17.3102, "sum should be less than %s" % 17.3102)
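The normalization formula in the comment above can be checked by hand. Below is a minimal NumPy sketch (not part of the original test), assuming a scale of s = 1 and the standard deviation as the divisor D:

import numpy as np

# one input row: Petal_Length, Sepal_Width, Sepal_Length
row = np.array([0.0, 0.75, 2.5], dtype=np.float32)

s = 1.0         # scale (assumed)
M = row.mean()  # per-row mean
D = row.std()   # divisor: standard deviation here; use np.linalg.norm(row)
                # for the L2-norm variant
normed = (s * row - M) / D
print(normed)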
Example #7
 def test_ssweembedding(self):
     wordvectors = pd.DataFrame(data=dict(w1=["like", "hate", "okay"],
                                          w2=["great", "horrible",
                                              "lukewarm"],
                                          w3=["awesome", "worst",
                                              "boring"]))
     mycols = ['w1', 'w2', 'w3']
     concat = ColumnConcatenator() << {'features': mycols}
     sswe = WordEmbedding() << 'features'
     pipeline = Pipeline([concat, sswe])
     y = pipeline.fit_transform(wordvectors)
     y = y[[col for col in y.columns if 'features' in col]]
     assert_almost_equal(y.sum().sum(), -97.6836, decimal=4,
                         err_msg="Sum should be %s" % -97.6836)
Example #8
 def test_PcaTransformer(self):
     df = get_dataset("infert").as_df()
     X = [
         'age', 'parity', 'induced', 'spontaneous', 'stratum',
         'pooled.stratum'
     ]
     pipe = Pipeline([
         ColumnConcatenator() << {'X': X},
         PcaTransformer(rank=3) << 'X'
     ])
     y = pipe.fit_transform(df[X].astype(numpy.float32))
     y = y[['X.0', 'X.1', 'X.2']]
     assert_almost_equal(y.sum().sum(),
                         11.293087,
                         decimal=3,
                         err_msg="Sum should be %s" % 11.293087)
Example #9
    def test_char_tokenizer(self):

        customer_reviews = pd.DataFrame(data=dict(review=[
            "I really did not like the taste of it",
            "It was surprisingly quite good!",
            "I will never ever ever go to that place again!!",
            "The best ever!! It was amazingly good and super fast",
            "I wish I had gone earlier, it was that great",
            "somewhat dissapointing. I'd probably wont try again",
            "Never visit again... rascals!"
        ]))

        tokenize = CharTokenizer(['review'])
        concat = ColumnConcatenator() >> 'features' << [['review']]
        pipeline = nimbusmlPipeline([concat, tokenize])
        y = pipeline.fit_transform(customer_reviews)
        assert y is not None
Example #10
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #11
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
     # REVIEW: Replace back ToKey() with OneHotHashVectorizer()  and reinstate metrics checks
     # once issue https://github.com/dotnet/machinelearning/issues/1939 is resolved.
     params.pop('expected_metrics', None)
     steps = [
         ToKey() << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #12
    def test_fromkey_multiple_columns(self):
        df = pandas.DataFrame(data=dict(
            num1=[0, 1, 2, 3, 4, 5, 6],
            cat1=Categorical.from_codes([0, 2, 3, 1, 2, -1, 1],
                                        categories=["a", "b", "c", "d"]),
            cat2=Categorical.from_codes([2, 0, 1, 2, 0, 1, 1],
                                        categories=["e", "f", "g"]),
            num=[0, 1, 2, 3, 4, 5, 6],
            text1=["i", "j", "i", "j", "i", "j", "i"],
            text2=["k", "l", "l", "k", "k", "l", "k"]))

        concat = ColumnConcatenator() << {'textvec': ['text1', 'text2']}
        tokey = ToKey() << ['textvec']
        pipeline = Pipeline([concat, tokey])
        data_idv = pipeline.fit_transform(df)
        assert sorted(list(data_idv.columns)) == [
            'cat1', 'cat2', 'num', 'num1', 'text1', 'text2',
            'textvec.text1', 'textvec.text2']
        assert list(data_idv['cat1'].cat.categories) == ['a', 'b', 'c', 'd']
        assert list(data_idv['cat1'].cat.codes) == [0, 2, 3, 1, 2, -1, 1]
        assert list(data_idv['cat2'].cat.categories) == ['e', 'f', 'g']
        assert list(data_idv['cat2'].cat.codes) == [2, 0, 1, 2, 0, 1, 1]
        assert list(data_idv['textvec.text1'].cat.categories) == [
            'i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text1'].cat.codes) == [
            0, 2, 0, 2, 0, 2, 0]
        assert list(data_idv['textvec.text2'].cat.categories) == [
            'i', 'k', 'j', 'l']
        assert list(data_idv['textvec.text2'].cat.codes) == [
            1, 3, 3, 1, 1, 3, 1]
Example #13
    def test_vectorized_with_concat_output_predictor_model(self):
        """
        This test shows how to prepend ColumnConcatenator transform
        to outputted predictor model from combined (with featurizers) pipeline
        so it successfully runs on featurized data with vectors.
        """
        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create, fit and score with combined model.
        # Output predictor model separately.
        combined_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2')
        ], random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_1 = combined_pipeline.predict(train_df)

        # train ColumnConcatenator on featurized data
        concat_pipeline = Pipeline(
            [ColumnConcatenator(columns={'c0': ['c0.a', 'c0.b']})])
        concat_pipeline.fit(df)

        # Load predictor pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        # combine concat and predictor models and score
        combined_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)
        result_2 = combined_predictor_pipeline.predict(df)

        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
Example #14
    def test_get_fit_info_ranker(self):
        file_path = get_dataset("gen_tickettrain").as_filepath()
        file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                      'col=Features_3:R4:3-5'
        train_stream = FileDataStream(file_path, schema=file_schema)
        pipeline = Pipeline([
            ToKey() << {
                'GroupId_2': 'GroupId_2'
            },
            ColumnConcatenator() << {
                'Features': ['Features_3']
            },
            LightGbmRanker() << {
                Role.Feature: 'Features',
                Role.Label: 'Label_1',
                Role.GroupId: 'GroupId_2'
            }
        ])

        info = pipeline.get_fit_info(train_stream)
        last = info[0][-1]
        inp = last['inputs']
        assert 'GroupId:GroupId_2' in inp
Example #15
import pandas as pd
from nimbusml import Pipeline
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.preprocessing.schema import ColumnConcatenator

data = pd.DataFrame({'month': ['Jan', 'Feb'], 'year': ['1988', '1978']})

# not concatenated
xf = OneHotVectorizer()
features = xf.fit_transform(data)
print(features.head())
#
#   month.Feb  month.Jan  year.1978  year.1988
# 0        0.0        1.0        0.0        1.0
# 1        1.0        0.0        1.0        0.0

# input columns concatenated into vector type
pipe = Pipeline([
    ColumnConcatenator(columns={'f': ['month', 'year']}),
    OneHotVectorizer(columns=['f']),
])
features2 = pipe.fit_transform(data)
print(features2.head())
#   f.month.1978  f.month.1988  f.month.Feb  f.month.Jan  f.year.1978  \
# 0           0.0           0.0          0.0          1.0          0.0
# 1           0.0           0.0          1.0          0.0          1.0
#
#   f.year.1988  f.year.Feb  f.year.Jan month  year
# 0          1.0         0.0         0.0   Jan  1988
# 1          0.0         0.0         0.0   Feb  1978

# input columns concatenated, output_kind="Bag" pools the one-hot slots of
# the vector into a single bag-of-words style count vector
pipe = Pipeline([
    ColumnConcatenator(columns={'f': ['month', 'year']}),
    OneHotVectorizer(columns=['f'], output_kind='Bag'),
])
print(pipe.fit_transform(data).head())
Example #16
import numpy as np
from sklearn.model_selection import train_test_split
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import (ColumnConcatenator, ColumnDropper,
                                           ColumnSelector)

np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# drop 'Species' column using ColumnDropper Transform
# select mycols for training using ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# select mycols using the ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# Verify that we get identical results in both Experiments
print(scores1.head())
print(scores2.head())
Example #17
# data input (as a FileDataStream)
import numpy
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_selection import MutualInformationSelector
from nimbusml.preprocessing.schema import ColumnConcatenator

path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32)

# TODO: bug 244702
# As discussed, this transform can select features from only one column, so
# we first concatenate the target columns into a single vector-typed
# 'Features' column. The selected features appear in the output as
# Features.*, and two slots are selected here. Multiple input columns, with
# automatic concatenation, may be supported in the future. (See the sketch
# after this example for pulling out the selected slots.)
pip = Pipeline([
    ColumnConcatenator(
        columns={
            'Features': [
                'row_num',
                'spontaneous',
                'age',
                'parity',
                'induced']}),
    MutualInformationSelector(
        columns='Features',
        label='case',
        slots_in_output=2)  # the selector accepts a single vector column
])
pip.fit_transform(data).head()
#    Features.row_num  Features.spontaneous   age  case education  induced  ...
# 0               1.0                   2.0  26.0   1.0    0-5yrs      1.0  ...
# 1               2.0                   0.0  42.0   1.0    0-5yrs      1.0  ...
# 2               3.0                   0.0  39.0   1.0    0-5yrs      2.0  ...
# 3               4.0                   0.0  34.0   1.0    0-5yrs      2.0  ...
# 4               5.0                   1.0  35.0   1.0   6-11yrs      1.0  ...
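As a follow-up to the workaround above, the selected slots can be pulled out of the transformed frame by their 'Features.' prefix. A minimal sketch (plain pandas, not part of the original example; column names taken from the printed output above):

selected = pip.fit_transform(data)
slot_cols = [c for c in selected.columns if c.startswith('Features.')]
print(slot_cols)
# ['Features.row_num', 'Features.spontaneous']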
Example #18
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnConcatenator

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32)
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage
xf = ColumnConcatenator(columns={'features': ['age', 'parity', 'induced']})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
# 'features' is a vector-type column; when a dataset with a vector-type
# column is the final output, the vector column is expanded into one output
# column per slot (see the sketch after this example).
#    age  case education  features.age  features.induced  features.parity  ...
# 0  26.0   1.0    0-5yrs          26.0               1.0              6.0  ...
# 1  42.0   1.0    0-5yrs          42.0               1.0              1.0  ...
# 2  39.0   1.0    0-5yrs          39.0               2.0              6.0  ...
# 3  34.0   1.0    0-5yrs          34.0               2.0              4.0  ...
# 4  35.0   1.0   6-11yrs          35.0               1.0              3.0  ...
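Because each slot becomes its own output column named features.&lt;source&gt;, the concatenated vector can be recovered by filtering on that prefix. A minimal sketch (plain pandas, not part of the original example):

slot_cols = [c for c in features.columns if c.startswith('features.')]
print(features[slot_cols].head())
#    features.age  features.induced  features.parity
# 0          26.0               1.0              6.0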
Example #19
###############################################################################
# CharTokenizer: transform to split free-form text into a vector of its
# individual characters.
import pandas
from nimbusml import Pipeline, Role
from nimbusml.preprocessing.schema import ColumnConcatenator
from nimbusml.preprocessing.text import CharTokenizer

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"]))

tokenize = CharTokenizer() << ['review']
concat = ColumnConcatenator() << {Role.Feature: [['review']]}

pipeline = Pipeline([concat, tokenize])

# REVIEW: System.NullReferenceException
tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

# view the tokenized output
print(y)
Example #20
    'MutualInformationSelector',
    'NaiveBayesClassifier',
    'CountSelector',
    'KMeansPlusPlus',
    'ToKey',
    'ColumnSelector'
}

INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
Example #21
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.decomposition import PcaTransformer
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.preprocessing.schema import ColumnConcatenator

train_file = get_dataset("infert").as_filepath()
schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \
         "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \
         "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \
         "sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# target and features columns
y = 'case'
X = ['age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum']

# observe gradual impact of dimensionality reduction on AUC
# reducing dimensions should degrade signal gradually, while
# maintaining the traits in original dataset as much as possible.
for rank in range(len(X), 2, -1):
    print('Number of dimensions=', rank)
    pipe = Pipeline([
        ColumnConcatenator() << {
            'X': X
        },  # X is VectorDataViewType column
        PcaTransformer(rank=rank) << 'X',  # find principal components of X
        LightGbmBinaryClassifier()
    ])
    metrics, scores = pipe.fit(fds, y).test(fds, y)
    print('AUC=', metrics['AUC'].values)
Example #22
    def test_pipeline_exports_complex(self):

        name = "test_pipeline_exports_complex.csv"
        with open(name, "w") as f:
            f.write(_sentiments)

        transform_1 = NGramFeaturizer() << {'transformed1': 'SentimentText'}
        transform_2 = OneHotVectorizer() << 'SentimentSource'
        transform_3 = ColumnConcatenator() << {
            'finalfeatures': ['transformed1', 'SentimentSource']
        }
        algo = FastTreesBinaryClassifier() << {
            Role.Feature: 'finalfeatures',
            Role.Label: "Positive"
        }

        exp = Pipeline([transform_1, transform_2, transform_3, algo])

        stream = FileDataStream.read_csv(name, sep="\t")
        res = dot_export_pipeline(exp, stream).strip("\n\r ")
        exp = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> ItemID|<f1> Sentiment|<f2> \
SentimentSource|<f3> SentimentText|<f4> RowNum|<f5> \
Positive|<f6> Train|<f7> Small",shape=record,fontsize=8];

                  node1[label="NGramFeaturizer",shape=box,style="filled,\
rounded",color=cyan,fontsize=12];
                  sch0:f3 -> node1;
                  sch1[label="<f0> transformed1|<f1> \
transformed1_TransformedText",shape=record,fontsize=8];
                  node1 -> sch1:f0;
                  node1 -> sch1:f1;

                  node2[label="OneHotVectorizer",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node2;
                  sch2[label="<f0> SentimentSource",shape=record,\
fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="ColumnConcatenator",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node3;
                  sch2:f0 -> node3;
                  sch3[label="<f0> finalfeatures",shape=record,fontsize=8];
                  node3 -> sch3:f0;

                  node4[label="FastTreesBinaryClassifier",shape=box,\
style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f0 -> node4 [label="Feature",fontsize=8];
                  sch0:f5 -> node4 [label="Label",fontsize=8];
                  sch4[label="<f0> PredictedLabel|<f1> \
PredictedProba|<f2> Score",shape=record,fontsize=8];
                  node4 -> sch4:f0;
                  node4 -> sch4:f1;
                  node4 -> sch4:f2;
                }
                """.replace("                ", "").strip("\n\r ")
        assert res == exp