def test_syntax5_failing(self):
    """Check the ``<<`` dict syntax with a bad and a good source column.

    Mapping output ``edu1`` to the source column ``education1`` must make
    ``fit_transform`` raise a ``RuntimeError`` carrying the generic
    "Returned code is -1" message. Mapping ``edu1`` to ``education`` must
    succeed and produce a 5 x 5 result.
    """
    _, features, _ = self.get_simple_df()

    # presumably 'education1' is absent from the frame -- verify against
    # get_simple_df(); the test only relies on the call raising.
    failing_vec = OneHotVectorizer() << {'edu1': ['education1']}
    try:
        failing_vec.fit_transform(features, verbose=2)
    except RuntimeError as err:
        expected = "Returned code is -1. Check the log for error messages.."
        assert expected in str(err)
    else:
        # Reaching this point means no error was raised at all.
        assert False

    working_vec = OneHotVectorizer() << {'edu1': ['education']}
    transformed = working_vec.fit_transform(features)
    assert transformed.shape == (5, 5)
def test_syntax5_failing(self):
    """Check the ``<<`` dict syntax with a bad and a good source column.

    Requesting the source column ``education1`` must make
    ``fit_transform`` raise a ``RuntimeError`` whose text reports that the
    input column could not be found. Requesting ``education`` must succeed
    and yield a 5 x 5 result.
    """
    _, features, _ = self.get_simple_df()

    expected_fragment = (
        "Error: *** System.ArgumentOutOfRangeException: "
        "'Could not find input column"
    )
    bad_mapping = OneHotVectorizer() << {'edu1': ['education1']}
    try:
        bad_mapping.fit_transform(features, verbose=2)
    except RuntimeError as err:
        assert expected_fragment in str(err)
    else:
        # No exception at all is a failure of this test.
        assert False

    good_mapping = OneHotVectorizer() << {'edu1': ['education']}
    encoded = good_mapping.fit_transform(features)
    assert encoded.shape == (5, 5)
def test_combined_models_support_decision_function(self):
    """Compare ``decision_function`` of separate and combined models.

    A one-hot transform and a logistic regression classifier are fitted
    separately on the 'infert' file. Their chained ``decision_function``
    output must equal that of the pipeline built by
    ``Pipeline.combine_models``.
    """
    csv_path = get_dataset('infert').as_filepath()

    stream = FileDataStream.read_csv(csv_path)
    one_hot = OneHotVectorizer(columns={'edu': 'education'})
    encoded = one_hot.fit_transform(stream, as_binary_data_stream=True)

    classifier = LogisticRegressionBinaryClassifier(
        feature=[
            'parity',
            'edu',
            'age',
            'induced',
            'spontaneous',
            'stratum',
            'pooled.stratum',
        ],
        label='case',
    )
    classifier.fit(encoded)

    # Separate path: transform first, then score with the classifier.
    stream = FileDataStream.read_csv(csv_path)
    encoded = one_hot.transform(stream, as_binary_data_stream=True)
    separate_scores = classifier.decision_function(encoded)

    # Combined path: a single pipeline fed the raw file stream.
    stream = FileDataStream.read_csv(csv_path)
    combined = Pipeline.combine_models(one_hot, classifier)
    combined_scores = combined.decision_function(stream)

    self.assertTrue(np.array_equal(separate_scores, combined_scores))
def test_combine_with_classifier_trained_with_filedatastream(self):
    """Compare predictions of separate and combined models on a file stream.

    The transform and the classifier are both trained from
    ``FileDataStream`` input. The labels predicted by chaining them by
    hand must equal the ``PredictedLabel`` column of the combined
    pipeline, after both are cast to int32.
    """
    csv_path = get_dataset('infert').as_filepath()

    stream = FileDataStream.read_csv(csv_path)
    one_hot = OneHotVectorizer(columns={'edu': 'education'})
    encoded = one_hot.fit_transform(stream, as_binary_data_stream=True)

    classifier = LogisticRegressionBinaryClassifier(
        feature=[
            'parity',
            'edu',
            'age',
            'induced',
            'spontaneous',
            'stratum',
            'pooled.stratum',
        ],
        label='case',
    )
    classifier.fit(encoded)

    # Separate path: explicit transform followed by predict.
    stream = FileDataStream.read_csv(csv_path)
    encoded = one_hot.transform(stream, as_binary_data_stream=True)
    separate_pred = classifier.predict(encoded)

    # Combined path: one pipeline applied to the raw file stream.
    stream = FileDataStream.read_csv(csv_path)
    combined = Pipeline.combine_models(one_hot, classifier)
    combined_pred = combined.predict(stream)

    # Cast both sides to the same dtype before comparing.
    separate_labels = separate_pred.astype(np.int32)
    combined_labels = combined_pred['PredictedLabel'].astype(np.int32)
    self.assertTrue(separate_labels.equals(combined_labels))
def test_combine_with_classifier_trained_with_y_arg(self):
    """Combine models when the first transform was fitted with X and y.

    The initial transform receives both the features and the label.
    Every later step therefore works on data where X and y have already
    been merged into one dataset.
    """
    np.random.seed(0)

    infert = get_dataset("infert").as_df()
    features = infert.loc[:, infert.columns != 'case']
    target = infert['case']

    one_hot = OneHotVectorizer() << 'education_str'

    # Both X and y are handed to the transform here.
    joined = one_hot.fit_transform(features, target,
                                   as_binary_data_stream=True)

    # The label column has to be named explicitly: features and label
    # were joined into a single dataset by the previous step.
    classifier = LogisticRegressionBinaryClassifier(
        label='case', feature=list(features.columns))
    classifier.fit(joined)

    encoded = one_hot.transform(features, as_binary_data_stream=True)
    separate_pred = classifier.predict(encoded)

    # Build the combined pipeline and predict from the raw features.
    combined = Pipeline.combine_models(one_hot, classifier)
    combined_pred = combined.predict(features)
    combined_labels = combined_pred['PredictedLabel'].astype(np.float64)

    self.assertTrue(separate_pred.equals(combined_labels))
def test_combine_transform_and_predictor(self):
    """Compare a transform + regressor chain with their combined pipeline.

    Uses ``train_df`` and ``test_df`` from the enclosing scope. The first
    two scores of the manual chain must equal the first two ``Score``
    values returned by the combined pipeline.
    """
    one_hot = OneHotVectorizer() << 'c0'
    encoded_train = one_hot.fit_transform(train_df,
                                          as_binary_data_stream=True)

    regressor = OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
    regressor.fit(encoded_train)

    encoded_test = one_hot.transform(test_df, as_binary_data_stream=True)
    separate_scores = regressor.predict(encoded_test)

    combined = Pipeline.combine_models(one_hot, regressor)
    combined_scores = combined.predict(test_df)

    # Only the first two rows are compared.
    for row in (0, 1):
        self.assertEqual(separate_scores[row],
                         combined_scores.loc[row, 'Score'])
def test_fit_predictor_with_idv(self):
    """Fit a predictor pipeline on a transformed binary data stream.

    A transform pipeline and a predictor pipeline are fitted one after
    the other, the second on the binary-stream output of the first. The
    resulting ``Score`` column must equal the prediction of a regressor
    trained on the plain (DataFrame) output of the same transform.
    """
    as_float = {'c1': np.float64, 'c2': np.float64}
    train_df = pd.DataFrame({
        'c0': ['a', 'b', 'a', 'b'],
        'c1': [1, 2, 3, 4],
        'c2': [2, 3, 4, 5],
    }).astype(as_float)
    test_df = pd.DataFrame({
        'c0': ['a', 'b', 'b'],
        'c1': [1.5, 2.3, 3.7],
        'c2': [2.2, 4.9, 2.7],
    }).astype(as_float)

    # Transform pipeline, fitted on the training frame.
    encoder_pipeline = Pipeline([OneHotVectorizer() << 'c0'])
    encoder_pipeline.fit(train_df)
    train_stream = encoder_pipeline.transform(
        train_df, as_binary_data_stream=True)

    # Predictor pipeline, fitted on the transformed BinaryDataStream.
    regressor_pipeline = Pipeline([
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ])
    regressor_pipeline.fit(train_stream)

    # Predict on the test data by chaining the two fitted pipelines.
    test_stream = encoder_pipeline.transform(
        test_df, as_binary_data_stream=True)
    actual = regressor_pipeline.predict(test_stream)

    # Reference result: same steps without the binary data stream, so the
    # expanded one-hot column names are listed explicitly.
    reference_xf = OneHotVectorizer() << 'c0'
    reference_train = reference_xf.fit_transform(train_df)
    reference_model = OnlineGradientDescentRegressor(
        label='c2', feature=['c0.a', 'c0.b', 'c1'])
    reference_model.fit(reference_train)
    reference_test = reference_xf.transform(test_df)
    expected_result = reference_model.predict(reference_test)

    self.assertTrue(actual.loc[:, 'Score'].equals(expected_result))
def test_fit_transform(self):
    """``fit_transform`` must match ``fit`` followed by ``transform``."""
    # Input is the 'infert' dataset read as a FileDataStream.
    infert_path = get_dataset('infert').as_filepath()
    stream = FileDataStream.read_csv(infert_path)

    # Three source columns, each mapped to a new output name.
    column_map = {
        'edu': 'education',
        'in': 'induced',
        'sp': 'spontaneous',
    }
    vectorizer = OneHotVectorizer(columns=column_map)

    one_step = vectorizer.fit_transform(stream)
    fitted = vectorizer.fit(stream)
    two_step = fitted.transform(stream)

    assert_frame_equal(one_step, two_step)
def test_combine_with_classifier_trained_with_joined_X_and_y(self):
    """Combine models trained on one frame that holds features and label.

    The whole 'infert' frame, including the ``case`` label, goes through
    the transform. The classifier is then told which column is the label
    and which columns are features. Manual chaining and the combined
    pipeline must predict the same labels.
    """
    np.random.seed(0)

    infert_df = get_dataset("infert").as_df()
    feature_cols = [name for name in infert_df.columns if name != 'case']

    one_hot = OneHotVectorizer() << 'education_str'
    encoded = one_hot.fit_transform(infert_df, as_binary_data_stream=True)

    classifier = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                    label='case')
    classifier.fit(encoded)

    encoded = one_hot.transform(infert_df, as_binary_data_stream=True)
    separate_pred = classifier.predict(encoded)

    # Build the combined pipeline and predict from the raw frame.
    combined = Pipeline.combine_models(one_hot, classifier)
    combined_pred = combined.predict(infert_df)
    combined_labels = combined_pred['PredictedLabel'].astype(np.float64)

    self.assertTrue(separate_pred.equals(combined_labels))
def test_syntax6_passing(self):
    """The ``<<`` operator accepts a list of columns; the result is 5 x 5."""
    _, features, _ = self.get_simple_df()

    selected = ['education', 'education2']
    vectorizer = OneHotVectorizer() << selected

    encoded = vectorizer.fit_transform(features)
    assert encoded.shape == (5, 5)
def test_syntax_slots_wo_pipeline(self):
    """Exercise slot (expanded column) selection with and without a Pipeline.

    Steps:
      1. One-hot encode ``age``, ``parity`` and ``education_str`` of the
         'infert' dataset outside of any pipeline.
      2. Try to train a learner directly on the encoded frame using the
         prefix ``'age'`` as feature; failures are printed, not raised.
      3. Train the same transform + learner inside a ``Pipeline`` and
         check the shapes of predictions, scores and metrics.
      4. Replace the encoded frame's flat column names by a
         ``(column, slot)`` MultiIndex and retry training the learner,
         again only printing failures.
    """
    # data
    df = get_dataset("infert").as_df()
    df = df.drop(['row_num', ], axis=1)
    X = df.drop('case', axis=1)
    y = df['case']

    # transform
    xf1 = OneHotVectorizer(columns=['age', 'parity', 'education_str'])
    X_xf1 = xf1.fit_transform(X, verbose=0)
    assert "age.21" in list(X_xf1.columns)

    # learner
    # (1 .a.)
    model = AveragedPerceptronBinaryClassifier()

    # (1. b)
    # NOTE(review): the ``assert False`` below raises AssertionError, which
    # is itself caught by ``except Exception``. ``cont`` therefore always
    # ends up False and the ``if cont:`` branch never runs. Kept as a
    # best-effort probe; confirm whether a hard failure is wanted.
    try:
        model = AveragedPerceptronBinaryClassifier(feature=['age'])
        model.fit(X_xf1, y, verbose=0)
        cont = True
        assert False
    except Exception as e:
        # does not work
        cont = False
        print(e)

    if cont:
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)

    pipeline = Pipeline([
        OneHotVectorizer(columns=['age', 'parity', 'education_str']),
        AveragedPerceptronBinaryClassifier(feature='age')
    ])

    pipeline.fit(X, y, verbose=0)

    y_pred_withpipeline = pipeline.predict(X)
    print(y_pred_withpipeline.head())
    assert y_pred_withpipeline.shape == (248, 3)

    metrics, scores = pipeline.test(X, y, output_scores=True)
    print(metrics)
    assert scores.shape == (248, 3)
    assert metrics.shape == (1, 11)

    # back to X_xf1
    print(list(X_xf1.columns))

    # Second level: '' (for columns without a slot) followed by every
    # distinct last dot-separated component of the column names.
    slot_names = sorted({col.split('.')[-1] for col in X_xf1.columns})
    levels = [['age', 'education', 'education_str', 'parity',
               'pooled', 'spontaneous', 'stratum', 'induced'],
              [''] + slot_names]
    names = ['columns', 'slots']
    codes = [[], []]
    ages = []
    for col in X_xf1.columns:
        parts = col.split('.')
        column_code = levels[0].index(parts[0])
        try:
            slot_code = levels[1].index(parts[1])
        except IndexError:
            # No '.' in the name: the column has no slot.
            slot_code = levels[1].index('')
        codes[0].append(column_code)
        codes[1].append(slot_code)
        if parts[0] == 'age':
            ages.append(slot_code)

    # ``labels`` was renamed to ``codes`` in pandas 0.24 and removed in
    # 1.0. Try the current keyword first and fall back for old versions.
    try:
        X_xf1.columns = pandas.MultiIndex(
            levels=levels, codes=codes, names=names)
    except TypeError:
        X_xf1.columns = pandas.MultiIndex(
            levels=levels, labels=codes, names=names)
    print(X_xf1.head(n=2).T)

    # NOTE(review): ``ages`` holds integer positions in levels[1], not the
    # slot strings themselves -- confirm this is what ``feature`` expects.
    col_ages = [('age', a) for a in ages]
    print(col_ages)

    try:
        model = AveragedPerceptronBinaryClassifier(feature=col_ages)
        model.fit(X_xf1, y, verbose=0)
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)
    except Exception as e:
        # Does not work, probably confusion between list and tuple in nimbusml
        print(e)

    try:
        model = AveragedPerceptronBinaryClassifier(feature=['age'])
        model.fit(X_xf1, y, verbose=0)
        y_pred = model.predict(X_xf1)
        assert y_pred.shape == (248, 3)
    except Exception as e:
        # Does not work.
        print(e)
# data input (as a FileDataStream) path = get_dataset('infert').as_filepath() data = FileDataStream.read_csv(path, sep=',', dtype={'spontaneous': str }) # Error with numeric input for ohhv print(data.head()) # age case education induced parity ... row_num spontaneous ... # 0 26 1 0-5yrs 1 6 ... 1 2 ... # 1 42 1 0-5yrs 1 1 ... 2 0 ... # 2 39 1 0-5yrs 2 6 ... 3 0 ... # 3 34 1 0-5yrs 2 4 ... 4 0 ... # 4 35 1 6-11yrs 1 3 ... 5 1 ... # transform usage # set output_kind = "Bag" to featurize slots independently for vector type # columns xf = OneHotVectorizer(columns={ 'edu': 'education', 'in': 'induced', 'sp': 'spontaneous' }) # fit and transform features = xf.fit_transform(data) print(features.head()) # age case edu.0-5yrs edu.12+ yrs edu.6-11yrs education in.0 in.1 ... # 0 26 1 1.0 0.0 0.0 0-5yrs 0.0 1.0 ... # 1 42 1 1.0 0.0 0.0 0-5yrs 0.0 1.0 ... # 2 39 1 1.0 0.0 0.0 0-5yrs 0.0 0.0 ...