def test_pipeline_info(self):
    """get_fit_info must report the full transform chain, including the
    implicit TypeConverter inserted before the MeanVarianceScaler."""
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    pipe = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'},
    ])
    actual = pipe.get_fit_info(df)[0]
    # The 'operator' entry holds live objects; strip it before comparing.
    for step in actual:
        step.pop('operator', None)
    expected = [
        {'name': None,
         'schema_after': ['education', 'workclass', 'yy'],
         'type': 'start',
         'outputs': ['education', 'workclass', 'yy']},
        {'name': 'TypeConverter',
         'inputs': ['yy'],
         'outputs': ['new_y'],
         'schema_after': ['education', 'workclass', 'yy', 'new_y'],
         'type': 'transform'},
        {'name': 'MeanVarianceScaler',
         'inputs': ['new_y'],
         'type': 'transform',
         'outputs': ['new_y'],
         'schema_after': ['education', 'workclass', 'yy', 'new_y']},
        {'name': 'OneHotVectorizer',
         'inputs': ['workclass', 'education'],
         'type': 'transform',
         'outputs': ['workclass', 'education'],
         'schema_after': ['education', 'workclass', 'yy', 'new_y']},
        {'name': 'ColumnDropper',
         'type': 'transform',
         'schema_after': ['education', 'workclass', 'new_y'],
         'inputs': ['education', 'workclass', 'yy', 'new_y'],
         'outputs': ['education', 'workclass', 'new_y']},
        {'name': 'FastLinearRegressor',
         'inputs': ['Feature:education,workclass', 'Label:new_y'],
         'type': 'regressor',
         'outputs': ['Score'],
         'schema_after': ['Score']},
    ]
    if actual != expected:
        raise Exception(actual)
def test_syntax8_label(self):
    """Declaring Role.Label through << must propagate to the learner's
    label_column_, while features keep the default 'Features' name."""
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = df.drop('yy', axis=1)
    pipe = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'},
    ])
    pipe.fit(df, verbose=0)
    last = pipe.nodes[-1]
    assert last.feature_column_ == 'Features'
    assert last.label_column_ == 'new_y'
    # The pipeline requires it now as it is transformed all along.
    X['yy'] = 0.0
    scores = pipe.predict(X, verbose=0)
    assert isinstance(scores, pandas.DataFrame)
    assert list(scores.columns) == ['Score']
    assert scores.shape == (5, 1)
    if scores['Score'].min() < 0.4:
        raise Exception(scores)
    if scores['Score'].max() > 2.00:
        raise Exception(scores)
def test_syntax7_rename(self):
    """Convert the string label to R4 under a new name and train on it.

    Error messages are usually not informative enough.
    Missing column --> no indication of other columns.
    Error is (one transform should handle it)
    'The label column 'y' of the training data has a data type
    not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
    Type must be Bool, R4, R8 or Key with two classes.
    """
    df = pandas.DataFrame(
        dict(
            education=['A', 'B', 'A', 'B', 'A'],
            workclass=['X', 'X', 'Y', 'Y', 'Y'],
            y=['red', 'white', 'red', 'white', 'white']))
    X = df.drop('y', axis=1)
    y = df['y']
    exp = Pipeline([
        OneHotVectorizer() << 'y',
        OneHotVectorizer() << ['workclass', 'education'],
        TypeConverter(result_type='R4') << {'yi': 'y'},
        Drop() << 'y',
        FastLinearBinaryClassifier(max_iterations=1) << 'yi'
    ])
    exp.fit(X, y, verbose=0)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
    # BUG FIX: DataFrame.min()/max() return a Series, and truth-testing
    # a Series in an assert is ambiguous (raises ValueError in modern
    # pandas). Compare the scalar min/max of the 'Score' column instead;
    # the frame has a single column, so the tested values are unchanged.
    assert prediction['Score'].min() > 0.01
    assert prediction['Score'].max() < 0.05
def test_syntax6_regular_expression(self):
    """Drop() << '~Features' keeps only the concatenated Features column
    (regular-expression column selection) before training."""
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    features = df.drop('y', axis=1)
    label = df['y']
    pipe = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f1', 'f2', 'f3']},
        Drop() << '~Features',
        FastLinearBinaryClassifier(max_iterations=1)
    ])
    pipe.fit(features, label)
    pred = pipe.predict(features)
    assert isinstance(pred, pandas.DataFrame)
    assert sorted(pred.columns) == ['PredictedLabel', 'Probability', 'Score']
    assert pred.shape == (5, 3)
def test_syntax6_change_role(self):
    """Explicitly reassign the feature role after dropping all other
    columns.

    REVIEW: the pipeline drops all columns but one --> nimbusml still
    thinks the Features are education, workclass and does not
    automatically detect that the only remaining columns should play
    that role (maybe because the label column is here too even though
    the only remaining column without a role is Features).
    """
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    features = df.drop('y', axis=1)
    label = df['y']
    pipe = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f1', 'f2', 'f3']},
        Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
        FastLinearBinaryClassifier(
            maximum_number_of_iterations=1) << ['Features']
    ])
    pipe.fit(features, label, verbose=0)
    pred = pipe.predict(features)
    assert isinstance(pred, pandas.DataFrame)
    assert sorted(pred.columns) == ['PredictedLabel', 'Probability', 'Score']
    assert pred.shape == (5, 3)
def test_pipeline_exports(self):
    """Render the pipeline graph with graphviz and check a file appears.

    Emits a warning instead of failing when the graphviz binaries are
    not installed on the machine running the tests.
    """
    import graphviz.backend
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    exp = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'}
    ])
    gr = img_export_pipeline(exp, df)
    # BUG FIX: the original generated the output name with the private
    # tempfile._get_candidate_names() API and rendered into the current
    # working directory. Use a real temporary directory instead: it is
    # public API and guarantees cleanup of every file render() creates.
    with tempfile.TemporaryDirectory() as temp_dir:
        name = os.path.join(temp_dir, 'pipeline_graph')
        try:
            gr.render(name)
            assert os.path.exists(name)
        except graphviz.backend.ExecutableNotFound:
            warnings.warn('Graphviz is not installed.')
def test_pipeline_info_strategy_previous_drop(self):
    """After Drop removes 'education', get_fit_info must report only
    'workclass' as a feature of the final regressor."""
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = df.drop('yy', axis=1)
    y = df['yy']
    pipe = Pipeline([
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << ['education'],
        FastLinearRegressor()
    ])
    actual = pipe.get_fit_info(X, y)[0]
    # The 'operator' entry holds live objects; strip it before comparing.
    for step in actual:
        step.pop('operator', None)
    expected = [
        {'name': None,
         'schema_after': ['education', 'workclass', 'yy'],
         'type': 'start',
         'outputs': ['education', 'workclass', 'yy']},
        {'name': 'OneHotVectorizer',
         'inputs': ['workclass', 'education'],
         'type': 'transform',
         'outputs': ['workclass', 'education'],
         'schema_after': ['education', 'workclass', 'yy']},
        {'name': 'ColumnDropper',
         'type': 'transform',
         'schema_after': ['workclass', 'yy'],
         'inputs': ['education', 'workclass', 'yy'],
         'outputs': ['workclass', 'yy']},
        {'name': 'FastLinearRegressor',
         'inputs': ['Feature:workclass', 'Label:yy'],
         'type': 'regressor',
         'outputs': ['Score'],
         'schema_after': ['Score']},
    ]
    assert actual == expected
def test_pipeline_exports_dot(self):
    """Check the DOT export of the pipeline graph against a reference.

    BUG FIX: this method was also named ``test_pipeline_exports``,
    silently shadowing the earlier graphviz-rendering test of the same
    name (only the last definition in a class body exists, so the other
    test never ran). Renamed to give both tests distinct names.
    """
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    exp = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'}
    ])
    # Every node that carries a label must have picked up the renamed
    # label column.
    for node in exp.nodes:
        if hasattr(node, 'label_column'):
            assert node.label_column == 'new_y'
    assert exp.nodes[-1].label_column == 'new_y'
    res = dot_export_pipeline(exp, df).strip("\n\r ")
    exp = """
            digraph{
              orientation=portrait;
              sch0[label="<f0> education|<f1> workclass|<f2> yy",
              shape=record,fontsize=8];

              node1[label="TypeConverter",shape=box,style="filled,
              rounded",color=cyan,fontsize=12];
              sch0:f2 -> node1;
              sch1[label="<f0> new_y",shape=record,fontsize=8];
              node1 -> sch1:f0;

              node2[label="MeanVarianceScaler",shape=box,
              style="filled,rounded",color=cyan,fontsize=12];
              sch1:f0 -> node2;
              sch2[label="<f0> new_y",shape=record,fontsize=8];
              node2 -> sch2:f0;

              node3[label="OneHotVectorizer",shape=box,
              style="filled,rounded",color=cyan,fontsize=12];
              sch0:f1 -> node3;
              sch0:f0 -> node3;
              sch3[label="<f0> workclass|<f1> education",
              shape=record,fontsize=8];
              node3 -> sch3:f0;
              node3 -> sch3:f1;

              node5[label="FastLinearRegressor",shape=box,
              style="filled,rounded",color=yellow,fontsize=12];
              sch3:f1 -> node5 [label="Feature",fontsize=8];
              sch3:f0 -> node5 [label="Feature",fontsize=8];
              sch2:f0 -> node5 [label="Label",fontsize=8];
              sch5[label="<f0> Score",shape=record,fontsize=8];
              node5 -> sch5:f0;
            }
            """.replace("            ", "").strip("\n\r ")
    # Whitespace-insensitive comparison: only the DOT tokens matter.
    if res.replace("\n", "").replace(" ", "") != exp.replace(
            "\n", "").replace(" ", ""):
        raise Exception(res)