Example 1
    def test_automl_immediate_quit(self):
        """ Make sure the AutoMLSearch quits when error_callback is defined and does no further work. """
        self._caplog.clear()
        X, y = self.X_y_binary
        pipelines = [
            TestPipelineFast({}),
            TestPipelineWithFitError({}),
            TestPipelineSlow({})
        ]
        automl = AutoMLSearch(X_train=X,
                              y_train=y,
                              problem_type="binary",
                              engine=self.parallel_engine,
                              max_iterations=4,
                              allowed_pipelines=pipelines,
                              error_callback=raise_error_callback,
                              optimize_thresholds=False)

        # Ensure the broken pipeline raises the error
        with pytest.raises(Exception, match="Yikes"):
            automl.search()

        # Make sure the automl algorithm stopped after the broken pipeline raised
        assert len(automl.full_rankings) < len(pipelines)
        assert TestPipelineFast.custom_name in set(
            automl.full_rankings["pipeline_name"])
        assert TestPipelineSlow.custom_name not in set(
            automl.full_rankings["pipeline_name"])
        assert TestPipelineWithFitError.custom_name not in set(
            automl.full_rankings["pipeline_name"])
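For context, a minimal error callback along these lines makes AutoMLSearch re-raise instead of logging and moving on (evalml ships an equivalent in evalml.automl.callbacks; the exact signature shown here is an assumption):

def raise_error_callback(exception, traceback, automl, **kwargs):
    # Re-raise so the search stops at the first failing pipeline.
    raise exception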
Example 2
    def test_automl(self):
        """ Comparing the results of parallel and sequential AutoML to each other."""
        X, y = self.X_y_binary
        par_automl = AutoMLSearch(X_train=X,
                                  y_train=y,
                                  problem_type="binary",
                                  engine=self.parallel_engine)
        par_automl.search()
        parallel_rankings = par_automl.full_rankings

        seq_automl = AutoMLSearch(X_train=X,
                                  y_train=y,
                                  problem_type="binary",
                                  engine=self.sequential_engine)
        seq_automl.search()
        sequential_rankings = seq_automl.full_rankings

        par_results = parallel_rankings.drop(columns=["id"])
        seq_results = sequential_rankings.drop(columns=["id"])

        assert all(
            seq_results["pipeline_name"] == par_results["pipeline_name"])
        assert np.allclose(np.array(seq_results["mean_cv_score"]),
                           np.array(par_results["mean_cv_score"]))
        assert np.allclose(np.array(seq_results["validation_score"]),
                           np.array(par_results["validation_score"]))
        assert np.allclose(
            np.array(seq_results["percent_better_than_baseline"]),
            np.array(par_results["percent_better_than_baseline"]))
Example 3
    def test_automl_max_iterations(self):
        """ Making sure that the max_iterations parameter limits the number of pipelines run. """
        X, y = self.X_y_binary
        max_iterations = 4
        par_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.parallel_engine,
                                  max_iterations=max_iterations)
        par_automl.search()
        parallel_rankings = par_automl.full_rankings

        seq_automl = AutoMLSearch(X_train=X, y_train=y, problem_type="binary", engine=self.sequential_engine,
                                  max_iterations=max_iterations)
        seq_automl.search()
        sequential_rankings = seq_automl.full_rankings

        assert len(sequential_rankings) == len(parallel_rankings) == max_iterations
Example 4
def test_score_batch_works(mock_score, pipeline_score_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):

    exceptions_to_check = []
    expected_scores = {}
    for i, e in enumerate(pipeline_score_side_effect):
        # Ensemble pipeline has different name
        pipeline_name = f"Pipeline {i}" if i < len(
            pipeline_score_side_effect) - 1 else "Templated Pipeline"
        scores = no_exception_scores
        if isinstance(e, PipelineScoreError):
            scores = {"F1": np.nan, "AUC": np.nan, "Log Loss Binary": np.nan}
            scores.update(e.scored_successfully)
            exceptions_to_check.append(f"Score error for {pipeline_name}")

        expected_scores[pipeline_name] = scores

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_iterations=1,
                          allowed_pipelines=[dummy_binary_pipeline_class])

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [
        make_pipeline_name(i)
        for i in range(len(pipeline_score_side_effect) - 1)
    ]
    ensemble_input_pipelines = [
        make_pipeline_from_components([classifier], problem_type="binary")
        for classifier in stackable_classifiers[:2]
    ]
    ensemble = make_pipeline_from_components(
        [StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
        problem_type="binary")
    pipelines.append(ensemble)

    def score_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.score'
                   ) as mock_score:
            mock_score.side_effect = pipeline_score_side_effect

            scores = engine.score_batch(
                pipelines, X, y, objectives=["Log Loss Binary", "F1", "AUC"])
            assert scores == expected_scores
            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test scoring before search
    score_batch_and_check()

    automl.search()

    # Test scoring after search
    score_batch_and_check()
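One plausible shape for the pipeline_score_side_effect fixture used above is a mix of plain score dicts and a PipelineScoreError for the pipeline that partially fails; the PipelineScoreError constructor arguments shown are an assumption:

from evalml.exceptions import PipelineScoreError

no_exception_scores = {"F1": 0.9, "AUC": 0.8, "Log Loss Binary": 0.2}
pipeline_score_side_effect = [
    no_exception_scores,
    PipelineScoreError(
        exceptions={"AUC": (Exception("AUC failed"), None)},  # assumed shape
        scored_successfully={"F1": 0.75, "Log Loss Binary": 0.3},
    ),
    no_exception_scores,
]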
Example 5
def test_train_batch_works(mock_score, pipeline_fit_side_effect, X_y_binary,
                           dummy_binary_pipeline_class, stackable_classifiers,
                           caplog):

    exceptions_to_check = [
        str(e) for e in pipeline_fit_side_effect if isinstance(e, Exception)
    ]

    X, y = X_y_binary

    automl = AutoMLSearch(X_train=X,
                          y_train=y,
                          problem_type='binary',
                          max_time=1,
                          max_iterations=2,
                          train_best_pipeline=False,
                          n_jobs=1)

    engine = SequentialEngine(X_train=automl.X_train,
                              y_train=automl.y_train,
                              automl=automl)

    def make_pipeline_name(index):
        class DummyPipeline(dummy_binary_pipeline_class):
            custom_name = f"Pipeline {index}"

        return DummyPipeline({'Mock Classifier': {'a': index}})

    pipelines = [
        make_pipeline_name(i)
        for i in range(len(pipeline_fit_side_effect) - 1)
    ]
    ensemble_input_pipelines = [
        make_pipeline_from_components([classifier], problem_type="binary")
        for classifier in stackable_classifiers[:2]
    ]
    ensemble = make_pipeline_from_components(
        [StackedEnsembleClassifier(ensemble_input_pipelines, n_jobs=1)],
        problem_type="binary")
    pipelines.append(ensemble)

    def train_batch_and_check():
        caplog.clear()
        with patch('evalml.pipelines.BinaryClassificationPipeline.fit'
                   ) as mock_fit:
            mock_fit.side_effect = pipeline_fit_side_effect

            trained_pipelines = engine.train_batch(pipelines)

            assert len(trained_pipelines) == len(
                pipeline_fit_side_effect) - len(exceptions_to_check)
            assert mock_fit.call_count == len(pipeline_fit_side_effect)
            for exception in exceptions_to_check:
                assert exception in caplog.text

    # Test training before search is run
    train_batch_and_check()

    # Test training after search.
    automl.search()

    train_batch_and_check()
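A minimal sketch of the matching pipeline_fit_side_effect fixture: None entries let the mocked fit succeed, Exception entries make it fail, so train_batch must skip the failing pipeline and log its message:

pipeline_fit_side_effect = [None, Exception("fit failed"), None]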
Example 6
def url_data():
    about()
    st.info("This feature has limited functionality")
    url=st.text_input("Webpage URL",help="Enter the URL where your data is located")
    if url=="":
        st.info("Please enter a valid URL to get started")
        st.stop()
    
    # getting column names as user input
    column_name=st.text_input("Enter candidate column name",key="value")
    value_list=column_name.split(",")

    # getting example values for reference
    candidate=st.text_input("Candidate example value",key="candidates",help="use ; as separator to enter another value")
    items_list=candidate.split(";")
    
    # create the scraper object
    scraper = AutoScraper()
    # feed it the URL and the example values so it can learn scraping rules
    final_result = scraper.build(url,items_list)
    # collect similar results, grouped and in order
    results=scraper.get_result_similar(url,grouped=True,keep_order=True)
    result={}
    for key,value in results.items():
        if value not in result.values():
            result[key]=value
            
    orient_df=pd.DataFrame.from_dict(result,orient="index")
    df=orient_df.transpose()
    
    df.columns=value_list
    df.fillna(value=np.nan,inplace=True)  # pd.np was removed from pandas; assumes numpy imported as np
    st.write(df)
    
    cols=df.columns.tolist()
    col1,col2=st.beta_columns(2)
 
    target=col1.selectbox("Select Target", cols,key="target")


    
    typelist=['binary','multiclass','regression','time series regression','time series multiclass','time series binary']
    p_type=col2.selectbox("Select problem type",typelist,key="p_type")
    x=df.drop(columns=target)
    y=df[target]
    x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

    automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
    automl.search()


    rank=automl.rankings

# checking the best pipeline

    best_pipeline=automl.best_pipeline
    description=automl.describe_pipeline(automl.rankings.iloc[0]["id"])

### Optimize for the selected objective

### Evaluate on hold-out data
    problem_list=['binary','time series binary']
    problem_list2=['multiclass','time series multiclass']

    cola,col_b,colc=st.beta_columns(3)
    
    if p_type in problem_list:
        objective=col_b.selectbox("select objective",objectives().binary_obj,key="objective selector")  
        best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


# for multiclass type problem
    elif p_type in problem_list2:
        objective=col_b.selectbox("select objective",objectives().multiclass_obj,key="objective selector") 
        best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline= automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test,  objectives=[objective])

        pred=tunned_pipeline.predict(x_test).to_series()

    
# for regression type problems
    else:
        objective=col_b.selectbox("select objective",objectives().regression_obj,key="objective selector")
        best_pipeline.score(x_test, y_test, objectives=["r2","MSE","MAE","Root Mean Squared Error"])
        automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                     problem_type=p_type,
                                     objective=objective,
                                     additional_objectives=['Root Mean Squared Error', 'MSE','MAE'],
                                     max_batches=1,
                                     optimize_thresholds=True)

        automl_tunned.search()

        tunned_rankings=automl_tunned.rankings

        tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

        tunned_pipeline=automl_tunned.best_pipeline

        tunned_pipeline.score(x_test, y_test, objectives=[objective])

        tunned_pipeline.fit(x_train,y_train)

        pred=tunned_pipeline.predict(x_test).to_series()

    file=open("model_details.txt","w")
    str_dict=repr(tunned_description)
    file.write(str_dict)
    file.close()
    def get_binary_file_downloader_html(bin_file, file_label='File'):
            with open(bin_file, 'rb') as f:
                data = f.read()
                bin_str = base64.b64encode(data).decode()
                href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
                return href                
    col1,col2,col3=st.beta_columns([1,1,1])        
    if col2.button("Predict Results",key="output",help="shows results"):
            st.spinner()
            with st.spinner(text='In progress'):
                 st.info("Wait while we are selecting a best algoritham for your problem..Hold your breath.")
                 time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

    col11,col12=st.beta_columns([3,1])
    with col11:
        with st.beta_expander("Compare Models"):
                st.write(tunned_rankings)
        
    with col12:
        with st.beta_expander("Best Pipeline"):
                st.success(tunned_pipeline)
                st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
Example 7
def file_data():

   
    uploaded_file = st.file_uploader("Upload Files",type=['csv','xls','xlsx'])

    # getting the file type to read the file as a dataframe

    if uploaded_file is None:
        st.info("Upload a dataset first to start making predictions")
    else:
        
        filename=str(uploaded_file.name)
        file_type=filename.rsplit('.',1)  # rsplit keeps the extension even if the name contains dots

        if file_type[1]=='csv':
            df=pd.read_csv(uploaded_file)
        elif file_type[1] in ('xls','xlsx'):  # the original `or 'xlsx'` was always truthy
            df=pd.read_excel(uploaded_file)
        st.write(df.head())
        # JSON support is planned for a future update
        cols=df.columns.tolist()
        col1,col2=st.beta_columns(2)
        target=col1.selectbox("Select Target", cols,key="target")
        x=df.drop(columns=target)
        y=df[target]

    
        typelist=['binary','multiclass','regression','time series regression','time series multiclass','time series binary']
        p_type=col2.selectbox("Select problem type",typelist,key="p_type")
        

        x_train,x_test,y_train,y_test=evalml.preprocessing.split_data(x,y,problem_type=p_type)

        # it's training time
        automl = AutoMLSearch(X_train=x_train, y_train=y_train, problem_type=p_type)
        automl.search()


        rank=automl.rankings

# checking the best pipeline

        best_pipeline=automl.best_pipeline
        description=automl.describe_pipeline(automl.rankings.iloc[0]["id"],return_dict=True)

### Evaluate on hold-out data
        problem_list=['binary','time series binary']
        problem_list2=['multiclass','time series multiclass']
        
        binary_obj=[objective.name for objective in get_core_objectives(ProblemTypes.BINARY)]
        multiclass_obj=[objective.name for objective in get_core_objectives(ProblemTypes.MULTICLASS)]
        regression_obj=[objective.name for objective in get_core_objectives(ProblemTypes.REGRESSION)]
        cola,col_b,colc=st.beta_columns([1,3,1])
        
        if p_type in problem_list:
            objective=col_b.selectbox("select objective",binary_obj,key="objective selector") 
            best_pipeline.score(x_test, y_test, objectives=["auc","f1","Precision","Recall"])

            automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['f1', 'precision'],
                                         max_batches=1,
                                         optimize_thresholds=True)

            automl_tunned.search()

            tunned_rankings=automl_tunned.rankings

            tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

            tunned_pipeline= automl_tunned.best_pipeline

            tunned_pipeline.score(x_test, y_test,  objectives=[objective])

            pred=tunned_pipeline.predict_proba(x_test).to_dataframe()


# for multiclass type problem
        elif p_type in problem_list2:
            objective=col_b.selectbox("select objective",multiclass_obj,key="objective selector")
            best_pipeline.score(x_test, y_test, objectives=["log loss multiclass","MCC multiclass","accuracy multiclass"])

            automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['MCC multiclass', 'accuracy multiclass'],
                                         max_batches=1,
                                         optimize_thresholds=True)

            automl_tunned.search()

            tunned_rankings=automl_tunned.rankings

            tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

            tunned_pipeline=automl_tunned.best_pipeline

            tunned_pipeline.score(x_test, y_test, objectives=[objective])

            pred=tunned_pipeline.predict(x_test).to_series()

    
# for regression type problems
        else:
            objective=col_b.selectbox("select objective",regression_obj,key="objective selector")
            best_pipeline.score(x_test, y_test, objectives=["r2","MSE","MAE","Root Mean Squared Error"])
            automl_tunned = AutoMLSearch(X_train=x_train, y_train=y_train,
                                         problem_type=p_type,
                                         objective=objective,
                                         additional_objectives=['Root Mean Squared Error', 'MSE','MAE'],
                                         max_batches=1,
                                         optimize_thresholds=True)

            automl_tunned.search()

            tunned_rankings=automl_tunned.rankings

            tunned_description=automl_tunned.describe_pipeline(automl_tunned.rankings.iloc[0]["id"],return_dict=True)

            tunned_pipeline=automl_tunned.best_pipeline

            tunned_pipeline.score(x_test, y_test, objectives=[objective])

            tunned_pipeline.fit(x_train,y_train)

            pred=tunned_pipeline.predict(x_test).to_series()

        col1,col2,col3=st.beta_columns([1,3,1])

        with open("model_details.txt","w") as file:
            file.write(repr(tunned_description))
        def get_binary_file_downloader_html(bin_file, file_label='File'):
            with open(bin_file, 'rb') as f:
                data = f.read()
                bin_str = base64.b64encode(data).decode()
                href = f'<a href="data:application/octet-stream;base64,{bin_str}" download="{os.path.basename(bin_file)}">Click Here To Download {file_label}</a>'
                return href
        
        if col2.button("Predict Results",key="output",help="shows results"):
            st.spinner()
            with st.spinner(text='In progress'):
                 st.info("Wait while we are selecting a best algoritham for your problem..Hold your breath.")
                 time.sleep(20)
            st.info("Done. Here you go.")
            st.write(pred)

            col11,col12=st.beta_columns([3,1])
            with col11:
                with st.beta_expander("Compare Models"):
                    st.write(tunned_rankings)
            with col12:
                with st.beta_expander("Pipeline Details"):
                    st.success(tunned_pipeline)
                    st.markdown(get_binary_file_downloader_html('model_details.txt', 'Pipeline Details'), unsafe_allow_html=True)
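The two page functions above could be wired together behind a simple sidebar switch; a minimal sketch (the menu labels are placeholders):

import streamlit as st

# Hypothetical entry point dispatching to file_data() or url_data().
choice = st.sidebar.radio("Data source", ("Upload a file", "Scrape a URL"))
if choice == "Upload a file":
    file_data()
else:
    url_data()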
Example 8
            "fit_intercept": [True, False],
            "max_iter": Integer(50, 100),
            "l1_ratio": Real(0.1, 1.0)
        },
        'Final RF Estimator': {
            "n_estimators": Integer(10, 1000),
            "max_depth": Integer(1, 100),
            "n_bins": Integer(1, 16),
        },
    }


# Imports assumed by this snippet (Integer/Real come from skopt, which evalml's tuners use).
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from skopt.space import Integer, Real
from evalml.automl import AutoMLSearch

data = load_breast_cancer()  # breast cancer data, so avoid the misleading "iris" name
X_train, X_test, y_train, y_test = train_test_split(
    data.data.astype(np.float64),
    data.target.astype(np.float64),
    train_size=0.75,
    test_size=0.25,
    random_state=42)

automl = AutoMLSearch(X_train=X_train,
                      y_train=y_train,
                      problem_type='binary',
                      objective='auc',
                      allowed_pipelines=[WidsExamplePipeline],
                      max_iterations=3,
                      max_batches=3)

automl.search()
print(automl.best_pipeline.parameters)
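The split above leaves X_test and y_test unused; a short hold-out check along these lines would close the loop (a sketch, not part of the original example):

# Score the winning pipeline on the held-out 25% split.
best = automl.best_pipeline
print(best.score(X_test, y_test, objectives=["auc"]))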
Example 9
class ModelSelect:
    '''
    Automatically select a model with evalml.
    '''
    def __init__(self,
                 problem_type: str,
                 self_pipelines=None,
                 objective=None,
                 **kwds):
        '''
        Parameters
        --------
        problem_type: binary, multiclass, or regression
        self_pipelines: your own pipelines; build them with define_pipline
        objective: a dict of evalml.objectives.FraudCost parameters, or None for 'auto'; to customize further, see
        https://evalml.alteryx.com/en/stable/user_guide/objectives.html
        '''
        self.problem_type = problem_type
        if isinstance(objective, dict):
            objective = FraudCost(
                retry_percentage=objective.get('retry_percentage', 0),
                interchange_fee=objective.get('interchange_fee', 0.04),
                fraud_payout_percentage=objective.get('loss_percentage', 0.9),
                amount_col=objective['amount_col'])
        elif objective is None:
            objective = 'auto'
        self.auto_ml = AutoMLSearch(
            problem_type=problem_type,
            allowed_pipelines=self_pipelines,
            objective=objective,
            additional_objectives=['auc', 'f1', 'precision'],
            **kwds)

    def search(self, X: pd.DataFrame, y: pd.Series):
        '''
        Parameters
        --------
        X: training data
        y: label data
        '''
        self.auto_ml.search(X, y, data_checks=None, show_iteration_plot=False)
        return self.auto_ml.rankings

    @staticmethod
    def feature_importance(pipline, X, y, objective="F1", **kwds):
        '''
        Once you have found a pipeline, compute its permutation feature importance
        and use it for feature selection; to drop features, see AutoCreate.remove_features.
        Parameters
        --------
        pipline: a pipeline taken from the search results, e.g. self.auto_ml.get_pipeline(id)
        X: training data
        y: label data
        objective: cost function used for the permutation scores
        '''
        pipline = pipline.fit(X, y)
        fm_df = calculate_permutation_importance(pipline, X, y, objective,
                                                 **kwds)
        feature_importances = format_importance(fm_df.feature,
                                                fm_df.importance)
        return feature_importances

    @staticmethod
    def define_pipline(problem_type,
                       estimators: list,
                       hyperparameters: dict,
                       preprocessing_components: list = None):
        '''
        Define your own pipelines.
        Parameters
        --------
        problem_type: binary, multiclass, or regression
        estimators: a list of estimators from evalml or generated by SModelTrans
        hyperparameters: hyperparameter ranges for the estimators
        preprocessing_components: a list of components for preprocessing data; if None, a default list is used (from evalml or SModelTrans)
        '''
        piplines = []
        pipline_dict = {'binary': BP, 'multiclass': MP, 'regression': RP}
        pipline_type = pipline_dict[problem_type]
        if preprocessing_components is None:
            preprocessing_components = [
                DropNullColumns, Imputer, DateTimeFeaturizer, OneHotEncoder,
                StandardScaler
            ]
        for estimator in estimators:

            class CustomPipeline(pipline_type):  # subclass only the pipeline base; the estimator belongs in component_graph
                custom_name = f"{estimator.name} w/ {' + '.join([component.name for component in preprocessing_components])}"
                component_graph = preprocessing_components + [estimator]
                custom_hyperparameters = hyperparameters

            piplines.append(CustomPipeline)
        return piplines
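A hedged usage sketch for ModelSelect; the CSV path and column names are placeholders:

import pandas as pd

df = pd.read_csv("transactions.csv")  # hypothetical fraud dataset
X, y = df.drop(columns="is_fraud"), df["is_fraud"]

# Passing a dict builds a FraudCost objective around the amount column.
selector = ModelSelect(problem_type="binary",
                       objective={"amount_col": "amount"})
rankings = selector.search(X, y)
print(rankings.head())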