def test_check_estimator_ColumnDuplicator_pairs(self):
    """Duplicate one source column into two new columns at once.

    Feeds ``ColumnDuplicator`` a two-target mapping ({'tokens3': 'tokens1',
    'tokens4': 'tokens1'}) and verifies the transformed frame's contents via
    a character-code checksum over the string form of every cell.
    """
    df = pandas.DataFrame(data=dict(
        tokens1=['one_' + str(i) for i in range(8)],
        tokens2=['two_' + str(i) for i in range(8)]))
    # Both 'tokens3' and 'tokens4' are copies of 'tokens1'.
    cd = ColumnDuplicator() << {'tokens3': 'tokens1', 'tokens4': 'tokens1'}
    y = cd.fit_transform(df)
    # NOTE: the original accumulated into a local named `sum`, shadowing the
    # builtin; renamed and replaced with the builtin sum() over a generator.
    char_sum = sum(ord(c) for v in y.values for c in str(v))
    assert_equal(char_sum, 19920, "sum of chars should be %s" % 19920)
def test_averagedperceptron_unsupported_losses_syntax(self):
    """Label-column collision handling in Pipeline.fit.

    First confirms that fitting with the label column present in both X and y
    raises a RuntimeError, then fits again after dropping the label from X and
    checks the prediction output schema and shape.
    """
    frame = get_dataset("infert").as_df().drop('row_num', axis=1)
    features = frame
    labels = frame['case']
    pipeline = Pipeline([
        OneHotVectorizer(columns={
            'age1': 'age',
            'parity1': 'parity',
            'sp1': 'spontaneous'
        }),
        OneHotVectorizer(columns={'education_str': 'education_str'}),
        ColumnDuplicator(columns={'case2': 'case'}),
        AveragedPerceptronBinaryClassifier(
            feature=['age1', 'education_str'], label='case')
    ])
    # The label 'case' appears in both X and y, so fit must fail; reaching
    # the raise below means the expected error never occurred.
    try:
        pipeline.fit(features, labels, verbose=0)
        raise AssertionError("same column name in X and y")
    except RuntimeError as err:
        assert "If any step in the pipeline has defined Label" in str(err)

    # Retry with the label removed from the feature frame.
    features = features.drop('case', axis=1)
    pipeline = Pipeline([
        OneHotVectorizer(columns={
            'age1': 'age',
            'parity1': 'parity',
            'sp1': 'spontaneous'
        }),
        OneHotVectorizer(columns={'education_str': 'education_str'}),
        # ColumnDuplicator(columns={'case2': 'case'}),  # does not work
        AveragedPerceptronBinaryClassifier(
            feature=['age1', 'education_str'], label='case')
    ])
    fit_info = pipeline.get_fit_info(frame)[0]
    assert fit_info[-1]['inputs'] != ['Feature:Features', 'Label:case']

    fitted = pipeline.fit(frame)
    predictions = fitted.predict(features)
    assert set(predictions.columns) == {
        'PredictedLabel', 'Probability', 'Score'
    }
    assert predictions.shape == (248, 3)
'ColumnSelector' } INSTANCES = { 'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier( feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']), 'Binner': Binner(num_bins=3), 'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [ 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa']}), 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']), 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}), 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']), 'DateTimeSplitter': DateTimeSplitter(prefix='dt'), 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'), 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']), 'FromKey': Pipeline([ ToKey(columns=['Sepal_Length']), FromKey(columns=['Sepal_Length']) ]), # GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << {
###############################################################################
# ColumnDuplicator
import pandas

from nimbusml.preprocessing.schema import ColumnDuplicator

# Two parallel columns of eight string tokens each.
frame = pandas.DataFrame(data=dict(
    tokens1=['one_' + str(i) for i in range(8)],
    tokens2=['two_' + str(i) for i in range(8)]))

# Copy 'tokens1' into a new column called 'tokens3'.
duplicator = ColumnDuplicator() << {'tokens3': 'tokens1'}
result = duplicator.fit_transform(frame)

# Display the two originals alongside the duplicate.
print(result)
###############################################################################
# ColumnDuplicator
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnDuplicator

# Data input as a FileDataStream built from the bundled 'infert' CSV.
path = get_dataset('infert').as_filepath()
stream = FileDataStream.read_csv(path, sep=',')

# Transform usage: copy two columns under new names in a single pass.
duplicator = ColumnDuplicator(
    columns={
        'education_copy': 'education',
        'age_copy': 'age'})

# Fit and transform the stream.
duplicated = duplicator.fit_transform(stream)

# Show the leading rows of the transformed data.
print(duplicated.head())
#   age  age_copy  case education education_copy  induced  parity ...
# 0  26        26     1    0-5yrs         0-5yrs        1       6 ...
# 1  42        42     1    0-5yrs         0-5yrs        1       1 ...
# 2  39        39     1    0-5yrs         0-5yrs        2       6 ...
# 3  34        34     1    0-5yrs         0-5yrs        2       4 ...
# 4  35        35     1   6-11yrs        6-11yrs        1       3 ...