def test_automl_immediate_quit(self):
    """Make sure the AutoMLSearch quits when error_callback is defined and does no further work."""
    self._caplog.clear()
    X, y = self.X_y_binary
    pipelines = [TestPipelineFast({}), TestPipelineWithFitError({}), TestPipelineSlow({})]
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                          engine=self.parallel_engine, max_iterations=4,
                          allowed_pipelines=pipelines,
                          error_callback=raise_error_callback,
                          optimize_thresholds=False)

    # Ensure the broken pipeline raises the error
    with pytest.raises(Exception, match="Yikes"):
        automl.search()

    # Make sure the automl algorithm stopped after the broken pipeline raised
    assert len(automl.full_rankings) < len(pipelines)
    assert TestPipelineFast.custom_name in set(automl.full_rankings["pipeline_name"])
    assert TestPipelineSlow.custom_name not in set(automl.full_rankings["pipeline_name"])
    assert TestPipelineWithFitError.custom_name not in set(automl.full_rankings["pipeline_name"])
def test_automl(self):
    """Comparing the results of parallel and sequential AutoML to each other."""
    X, y = self.X_y_binary

    par_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                              engine=self.parallel_engine)
    par_automl.search()
    parallel_rankings = par_automl.full_rankings

    seq_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                              engine=self.sequential_engine)
    seq_automl.search()
    sequential_rankings = seq_automl.full_rankings

    par_results = parallel_rankings.drop(columns=["id"])
    seq_results = sequential_rankings.drop(columns=["id"])

    assert all(seq_results["pipeline_name"] == par_results["pipeline_name"])
    assert np.allclose(np.array(seq_results["mean_cv_score"]),
                       np.array(par_results["mean_cv_score"]))
    assert np.allclose(np.array(seq_results["validation_score"]),
                       np.array(par_results["validation_score"]))
    assert np.allclose(np.array(seq_results["percent_better_than_baseline"]),
                       np.array(par_results["percent_better_than_baseline"]))
def test_automl_max_iterations(self):
    """Making sure that the max_iterations parameter limits the number of pipelines run."""
    X, y = self.X_y_binary
    max_iterations = 4

    par_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                              engine=self.parallel_engine, max_iterations=max_iterations)
    par_automl.search()
    parallel_rankings = par_automl.full_rankings

    seq_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary",
                              engine=self.sequential_engine, max_iterations=max_iterations)
    seq_automl.search()
    sequential_rankings = seq_automl.full_rankings

    assert len(sequential_rankings) == len(parallel_rankings) == max_iterations
def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers, caplog):

    exceptions_to_check = []
    expected_scores = {}
    for i, e in enumerate(pipeline_score_side_effect):
        # The ensemble pipeline has a different name
        pipeline_name = f"Pipeline {i}" if i < len(pipeline_score_side_effect) - 1 else "Templated Pipeline"
        scores = no_exception_scores
        if isinstance(e, PipelineScoreError):
            scores = {"F1": np.nan, "AUC": np.nan, "Log Loss Binary": np.nan}
            scores.update(e.scored_successfully)
            exceptions_to_check.append(f"Score error for {pipeline_name}")
        expected_scores[pipeline_name] = scores

    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_iterations=1,
                          allowed_pipelines=[dummy_binary_pipeline_class])
    engine = SequentialEngine(X_train=automl.X_train, y_train=automl.y_train, automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"
        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [make_pipeline_name(i) for i in range(len(pipeline_score_side_effect) - 1)]
    ensemble_input_pipelines = [make_pipeline_from_components([classifier], problem_type="binary")
                                for classifier in stackable_classifiers[:2]]
    ensemble = make_pipeline_from_components([StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
                                             problem_type="binary")
    pipelines.append(ensemble)

    def score_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.score') as mock_score:
            mock_score.side_effect = pipeline_score_side_effect

            scores = engine.score_batch(pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"])
            assert scores == expected_scores

            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test scoring before search
    score_batch_and_check()

    automl.search()

    # Test scoring after search
    score_batch_and_check()
def test_train_batch_works(mock_score, pipeline_fit_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers, caplog):

    exceptions_to_check = [str(e) for e in pipeline_fit_side_effect if isinstance(e, Exception)]

    X, y = X_y_binary
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_time=1, max_iterations=2,
                          train_best_pipeline=False, n_jobs=1)
    engine = SequentialEngine(X_train=automl.X_train, y_train=automl.y_train, automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"
        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [make_pipeline_name(i) for i in range(len(pipeline_fit_side_effect) - 1)]
    ensemble_input_pipelines = [make_pipeline_from_components([classifier], problem_type="binary")
                                for classifier in stackable_classifiers[:2]]
    ensemble = make_pipeline_from_components([StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
                                             problem_type="binary")
    pipelines.append(ensemble)

    def train_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.fit') as mock_fit:
            mock_fit.side_effect = pipeline_fit_side_effect

            trained_pipelines = engine.train_batch(pipelines)

            assert len(trained_pipelines) == len(pipeline_fit_side_effect) - len(exceptions_to_check)
            assert mock_fit.call_count == len(pipeline_fit_side_effect)

            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test training before search is run
    train_batch_and_check()

    # Test training after search.
    automl.search()

    train_batch_and_check()
def url_data():
    about()
    st.info("This feature has limited functionality")
    url = st.text_input("Webpage URL", help="Enter a URL where your data is placed")
    if url == "":
        st.info("Please enter a valid input to get started")
        st.stop()

    # Getting the data column names as user input
    column_name = st.text_input("Enter candidate column names (comma-separated)", key="value")
    value_list = column_name.split(",")

    # Getting example data values for reference
    candidate = st.text_input("Candidate example value", key="candidates",
                              help="Use ';' as a separator to enter another value")
    items_list = candidate.split(";")

    # Create the scraper object and feed it the example values for scraping
    scraper = AutoScraper()
    final_result = scraper.build(url, items_list)

    # Collect and display the scraped result
    results = scraper.get_result_similar(url, grouped=True, keep_order=True)
    result = {}
    for key, value in results.items():
        if value not in result.values():
            result[key] = value

    orient_df = pd.DataFrame.from_dict(result, orient="index")
    df = orient_df.transpose()
    df.columns = value_list
    df.fillna(value=np.nan, inplace=True)
    st.write(df)

    cols = df.columns.tolist()
    col1, col2 = st.beta_columns(2)
    target = col1.selectbox("Select Target", cols, key="target")
    typelist = ['binary', 'multiclass', 'regression', 'time series regression',
                'time series multiclass', 'time series binary']
    p_type = col2.selectbox("Select problem type", typelist, key="p_type")

    x = df.drop(columns=target)
    y = df[target]
    x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)

    # Run the initial AutoML search
    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()
    rank = automl.rankings

    # Checking the best pipeline
    best_pipeline = automl.best_pipeline
    description = automl.describe_pipeline(automl.rankings.iloc[0]["id"])

    # TODO: optimize this code
    # Evaluate on holdout data
    problem_list = ['binary', 'time series binary']
    problem_list2 = ['multiclass', 'time series multiclass']

    cola, col_b, colc = st.beta_columns(3)

    # For binary type problems
    if p_type in problem_list:
        objective = col_b.selectbox("Select objective", objectives().binary_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['f1', 'precision'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                           return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict_proba(x_test).to_dataframe()

    # For multiclass type problems
    elif p_type in problem_list2:
        objective = col_b.selectbox("Select objective", objectives().multiclass_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                           return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        pred = tuned_pipeline.predict(x_test).to_series()

    # For regression type problems
    else:
        objective = col_b.selectbox("Select objective", objectives().regression_obj, key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
        automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                    objective=objective,
                                    additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                    max_batches=1, optimize_thresholds=True)
        automl_tuned.search()
        tuned_rankings = automl_tuned.rankings
        tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                           return_dict=True)
        tuned_pipeline = automl_tuned.best_pipeline
        tuned_pipeline.score(x_test, y_test, objectives=[objective])
        tuned_pipeline.fit(x_train, y_train)
        pred = tuned_pipeline.predict(x_test).to_series()

    # Persist the pipeline description so it can be downloaded later
    with open("model_details.txt", "w") as file:
        file.write(repr(tuned_description))

    def get_binary_file_downloader_html(bin_file, file_label='File'):
        with open(bin_file, 'rb') as f:
            data = f.read()
        bin_str = base64.b64encode(data).decode()
        href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
        return href

    col1, col2, col3 = st.beta_columns([1, 1, 1])
    if col2.button("Predict Results", key="output", help="Shows results"):
        with st.spinner(text='In progress'):
            st.info("Wait while we select the best algorithm for your problem. Hold your breath.")
            time.sleep(20)
        st.info("Done. Here you go.")
        st.write(pred)
        col11, col12 = st.beta_columns([3, 1])
        with col11:
            with st.beta_expander("Compare Models"):
                st.write(tuned_rankings)
        with col12:
            with st.beta_expander("Best Pipeline"):
                st.success(tuned_pipeline)
        st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'),
                    unsafe_allow_html=True)
def file_data():
    uploaded_file = st.file_uploader("Upload Files", type=['csv', 'xls', 'xlsx'])

    # Getting the file type so the file can be read into a dataframe
    if uploaded_file is None:
        st.info("Upload a dataset first to start making predictions")
    else:
        filename = str(uploaded_file.name)
        file_type = filename.split('.', 2)
        if file_type[1] == 'csv':
            df = pd.read_csv(uploaded_file)
        elif file_type[1] in ('xls', 'xlsx'):
            df = pd.read_excel(uploaded_file)
        st.write(df.head())
        # JSON support planned for a future update

        cols = df.columns.tolist()
        col1, col2 = st.beta_columns(2)
        target = col1.selectbox("Select Target", cols, key="target")
        x = df.drop(columns=target)
        y = df[target]
        typelist = ['binary', 'multiclass', 'regression', 'time series regression',
                    'time series multiclass', 'time series binary']
        p_type = col2.selectbox("Select problem type", typelist, key="p_type")
        x_train, x_test, y_train, y_test = evalml.preprocessing.split_data(x, y, problem_type=p_type)

        # Time to train
        automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
        automl.search()
        rank = automl.rankings

        # Checking the best pipeline
        best_pipeline = automl.best_pipeline
        description = automl.describe_pipeline(automl.rankings.iloc[0]["id"], return_dict=True)

        # Evaluate on holdout data
        problem_list = ['binary', 'time series binary']
        problem_list2 = ['multiclass', 'time series multiclass']

        binary_obj = []
        for objective in get_core_objectives(ProblemTypes.BINARY):
            binary_obj.append(objective.name)
        multiclass_obj = []
        for objective in get_core_objectives(ProblemTypes.MULTICLASS):
            multiclass_obj.append(objective.name)
        regression_obj = []
        for objective in get_core_objectives(ProblemTypes.REGRESSION):
            regression_obj.append(objective.name)

        cola, col_b, colc = st.beta_columns([1, 3, 1])

        # For binary type problems
        if p_type in problem_list:
            objective = col_b.selectbox("Select objective", binary_obj, key="objective selector")
            best_pipeline.score(x_test, y_test, objectives=["auc", "f1", "Precision", "Recall"])
            automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                        objective=objective,
                                        additional_objectives=['f1', 'precision'],
                                        max_batches=1, optimize_thresholds=True)
            automl_tuned.search()
            tuned_rankings = automl_tuned.rankings
            tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                               return_dict=True)
            tuned_pipeline = automl_tuned.best_pipeline
            tuned_pipeline.score(x_test, y_test, objectives=[objective])
            pred = tuned_pipeline.predict_proba(x_test).to_dataframe()

        # For multiclass type problems
        elif p_type in problem_list2:
            objective = col_b.selectbox("Select objective", multiclass_obj, key="objective selector")
            best_pipeline.score(x_test, y_test, objectives=["log loss multiclass", "MCC multiclass", "accuracy multiclass"])
            automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                        objective=objective,
                                        additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                        max_batches=1, optimize_thresholds=True)
            automl_tuned.search()
            tuned_rankings = automl_tuned.rankings
            tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                               return_dict=True)
            tuned_pipeline = automl_tuned.best_pipeline
            tuned_pipeline.score(x_test, y_test, objectives=[objective])
            pred = tuned_pipeline.predict(x_test).to_series()

        # For regression type problems
        else:
            objective = col_b.selectbox("Select objective", regression_obj, key="objective selector")
            best_pipeline.score(x_test, y_test, objectives=["r2", "MSE", "MAE", "Root Mean Squared Error"])
            automl_tuned = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type,
                                        objective=objective,
                                        additional_objectives=['Root Mean Squared Error', 'MSE', 'MAE'],
                                        max_batches=1, optimize_thresholds=True)
            automl_tuned.search()
            tuned_rankings = automl_tuned.rankings
            tuned_description = automl_tuned.describe_pipeline(automl_tuned.rankings.iloc[0]["id"],
                                                               return_dict=True)
            tuned_pipeline = automl_tuned.best_pipeline
            tuned_pipeline.score(x_test, y_test, objectives=[objective])
            tuned_pipeline.fit(x_train, y_train)
            pred = tuned_pipeline.predict(x_test).to_series()

        col1, col2, col3 = st.beta_columns([1, 3, 1])

        # Persist the pipeline description so it can be downloaded later
        with open("model_details.txt", "w") as file:
            file.write(repr(tuned_description))

        def get_binary_file_downloader_html(bin_file, file_label='File'):
            with open(bin_file, 'rb') as f:
                data = f.read()
            bin_str = base64.b64encode(data).decode()
            href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
            return href

        if col2.button("Predict Results", key="output", help="Shows results"):
            with st.spinner(text='In progress'):
                st.info("Wait while we select the best algorithm for your problem. Hold your breath.")
                time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)
            col11, col12 = st.beta_columns([3, 1])
            with col11:
                with st.beta_expander("Compare Models"):
                    st.write(tuned_rankings)
            with col12:
                with st.beta_expander("Pipeline Details"):
                    st.success(tuned_pipeline)
            st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'),
                        unsafe_allow_html=True)
"fit_intercept": [True, False], "max_iter": Integer(50, 100), "l1_ratio": Real(0.1, 1.0) }, 'Final RF Estimator': { "n_estimators": Integer(10, 1000), "max_depth": Integer(1, 100), "n_bins": Integer(1, 16), }, } iris = load_breast_cancer() X_train, X_test, y_train, y_test = train_test_split( iris.data.astype(np.float64), iris.target.astype(np.float64), train_size=0.75, test_size=0.25, random_state=42) automl = AutoMLSearch(X_train=X_train, y_train=y_train, problem_type='binary', objective='auc', allowed_pipelines=[WidsExamplePipeline], max_iterations=3, max_batches=3) automl.search() print(automl.best_pipeline.parameters)
class ModelSelect():
    '''Automatic model selection via evalml.'''

    def __init__(self, problem_type: str, self_pipelines=None, objective=None, **kwds):
        '''
        Parameters
        --------
        problem_type: binary, multiclass, or regression
        self_pipelines: user-defined pipelines; please use define_pipline to generate them
        objective: a dict builds an evalml.objectives.FraudCost objective; None falls back to 'auto'.
            If you want to override it, see https://evalml.alteryx.com/en/stable/user_guide/objectives.html
        '''
        self.problem_type = problem_type
        if isinstance(objective, dict):
            objective = FraudCost(
                retry_percentage=objective.get('retry_percentage', 0),
                interchange_fee=objective.get('interchange_fee', 0.04),
                fraud_payout_percentage=objective.get('loss_percentage', 0.9),
                amount_col=objective['amount_col'])
        elif objective is None:
            objective = 'auto'
        self.auto_ml = AutoMLSearch(
            problem_type=problem_type,
            allowed_pipelines=self_pipelines,
            objective=objective,
            additional_objectives=['auc', 'f1', 'precision'],
            **kwds)

    def search(self, X: pd.DataFrame, y: pd.Series):
        '''
        Parameters
        --------
        X: training data
        y: label data
        '''
        self.auto_ml.search(X, y, data_checks=None, show_iteration_plot=False)
        return self.auto_ml.rankings

    @staticmethod
    def feature_importance(pipline, X, y, objective="F1", **kwds):
        '''
        Once you have found a pipeline, compute its permutation feature importance and use it
        for feature selection; to drop features, use AutoCreate.remove_features.

        Parameters
        --------
        pipline: a pipeline taken from the search results, e.g. self.auto_ml.get_pipeline(id)
        X: training data
        y: label data
        objective: cost function
        '''
        pipline = pipline.fit(X, y)
        fm_df = calculate_permutation_importance(pipline, X, y, objective, **kwds)
        feature_importances = format_importance(fm_df.feature, fm_df.importance)
        return feature_importances

    @staticmethod
    def define_pipline(problem_type, estimators: list, hyperparameters: dict,
                       preprocessing_components: list = None):
        '''
        Define your own pipelines.

        Parameters
        --------
        problem_type: binary, multiclass, or regression
        estimators: a list of estimators from evalml or generated by SModelTrans
        hyperparameters: the estimators' hyperparameters
        preprocessing_components: a list of components for preprocessing the data; if None,
            a default list is used. From evalml or generated by SModelTrans.
        '''
        pipelines = []
        pipeline_dict = {'binary': BP, 'multiclass': MP, 'regression': RP}
        pipeline_type = pipeline_dict[problem_type]
        if preprocessing_components is None:
            preprocessing_components = [DropNullColumns, Imputer, DateTimeFeaturizer,
                                        OneHotEncoder, StandardScaler]
        for estimator in estimators:
            class CustomPipeline(pipeline_type, estimator):
                custom_name = f"{estimator.name} w/ {' + '.join([component.name for component in preprocessing_components])}"
                component_graph = preprocessing_components + [estimator]
                custom_hyperparameters = hyperparameters
            pipelines.append(CustomPipeline)
        return pipelines
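# A minimal, hypothetical usage sketch for ModelSelect (not from the original source).
# It assumes evalml's fraud demo dataset (which includes an 'amount' column) and that
# RandomForestClassifier is importable from evalml.pipelines.components in the evalml
# version this module targets.
import evalml
from evalml.pipelines.components import RandomForestClassifier

X, y = evalml.demos.load_fraud(n_rows=1000)

pipelines = ModelSelect.define_pipline(
    problem_type='binary',
    estimators=[RandomForestClassifier],
    hyperparameters={'Random Forest Classifier': {'n_estimators': [100, 200]}})

selector = ModelSelect(
    problem_type='binary',
    self_pipelines=pipelines,
    objective={'amount_col': 'amount'})  # builds the FraudCost objective in __init__ above

rankings = selector.search(X, y)
print(rankings.head())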