def build_automl_config(is_local_training, user_automl_settings, training_dataset, compute_target):
    featurization_config = FeaturizationConfig()
    featurization_config.add_column_purpose('passengerCount', 'Numeric')
    fixed_automl_settings = {
        'task': 'regression',
        'label_column_name': 'duration',
        'verbosity': logging.INFO,
        'preprocess': False,
        'model_explainability': True,
        'featurization': featurization_config
    }
    if is_local_training:
        automl_config = AutoMLConfig(training_data=training_dataset,
                                     **fixed_automl_settings,
                                     **user_automl_settings)
    else:
        automl_config = AutoMLConfig(path=CODE_PATH,
                                     training_data=training_dataset,
                                     compute_target=compute_target,
                                     **fixed_automl_settings,
                                     **user_automl_settings)
    return automl_config
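# Hedged usage sketch for build_automl_config (not from the original source):
# the workspace setup, dataset name, experiment name, and override values
# below are illustrative assumptions.
from azureml.core import Workspace, Experiment, Dataset

ws = Workspace.from_config()
training_dataset = Dataset.get_by_name(ws, name='taxi-train')      # assumed dataset name
user_overrides = {'iterations': 10, 'primary_metric': 'r2_score'}  # assumed overrides

config = build_automl_config(is_local_training=True,
                             user_automl_settings=user_overrides,
                             training_dataset=training_dataset,
                             compute_target=None)
run = Experiment(ws, 'taxi-duration-automl').submit(config, show_output=True)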
def RunAutoML():
    automl_settings = {
        "name": "AutoML_Demo_Experiment",
        "iteration_timeout_minutes": 15,
        "iterations": 3,
        "n_cross_validations": 5,
        "primary_metric": 'r2_score',  # note: r2_score is a regression metric; the task below is classification
        "max_concurrent_iterations": 8,
        "verbosity": logging.INFO
    }
    subscription_id = request.json['subscription_id']
    print(userData)
    print(userData[subscription_id])
    #return "ok"
    try:
        # preprocess is passed explicitly here; a duplicate entry in
        # automl_settings would raise a TypeError
        automl_config = AutoMLConfig(task="classification",
                                     X=userData[subscription_id][1],
                                     y=userData[subscription_id][2],
                                     debug_log='automl_errors.log',
                                     preprocess=True,
                                     **automl_settings)
        experiment = Experiment(userData[subscription_id][0], 'automl_remote')
        run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = run.get_output()
        return 'ok'
    except Exception:
        return 'error'
def train_model(train_data, cpu_cluster, exp):
    target_column_name = 'Gross_Sales'
    time_column_name = 'Date'
    time_series_id_column_names = 'Location'
    max_horizon = 1
    forecasting_parameters = ForecastingParameters(
        time_column_name=time_column_name,
        forecast_horizon=max_horizon,
        time_series_id_column_names=time_series_id_column_names
    )
    automl_config = AutoMLConfig(
        task='forecasting',
        debug_log='automl_daily_gross_errors.log',
        primary_metric='normalized_root_mean_squared_error',
        experiment_timeout_hours=1,
        training_data=train_data,
        label_column_name=target_column_name,
        compute_target=cpu_cluster,
        enable_early_stopping=True,
        n_cross_validations=3,
        verbosity=logging.INFO,
        max_cores_per_iteration=-1,
        forecasting_parameters=forecasting_parameters
    )
    remote_run = exp.submit(automl_config, show_output=False)
    best_run, fitted_model = remote_run.get_output()
    return fitted_model
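# Hedged usage sketch (not from the original): the workspace, compute name,
# dataset name, and the future frame below are illustrative assumptions.
import pandas as pd
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import ComputeTarget

ws = Workspace.from_config()
exp = Experiment(ws, 'daily-gross-forecast')                   # assumed experiment name
cpu_cluster = ComputeTarget(workspace=ws, name='cpu-cluster')  # assumed cluster name
train_data = Dataset.get_by_name(ws, 'gross-sales-train')      # assumed dataset name

fitted_model = train_model(train_data, cpu_cluster, exp)

# Forecast one step ahead for a single series ID.
future = pd.DataFrame({
    'Date': pd.to_datetime(['2021-01-01']),  # assumed next date in the series
    'Location': ['Store_A'],                 # assumed series ID value
})
y_pred, X_trans = fitted_model.forecast(future)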
def auto_train_model(ws, experiment_name, model_name, full_X, full_Y, training_set_percentage, training_target_accuracy):
    # start a training run by defining an experiment
    experiment = Experiment(ws, experiment_name)
    train_X, test_X, train_Y, test_Y = train_test_split(full_X, full_Y,
                                                        train_size=training_set_percentage,
                                                        random_state=42)
    train_Y_array = train_Y.values.flatten()

    # Configure the automated ML job
    # The model training is configured to run on the local machine
    # The values for all settings are documented at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train
    # Notice we no longer have to scale the input values, as Auto ML will try various data scaling approaches automatically
    Automl_config = AutoMLConfig(task='classification',
                                 primary_metric='accuracy',
                                 max_time_sec=12000,
                                 iterations=20,
                                 n_cross_validations=3,
                                 exit_score=training_target_accuracy,
                                 blacklist_algos=['kNN', 'LinearSVM'],
                                 X=train_X,
                                 y=train_Y_array,
                                 path='./04-automl/outputs')

    # Execute the job
    run = experiment.submit(Automl_config, show_output=True)

    # Get the run with the highest accuracy value.
    best_run, best_model = run.get_output()

    return (best_model, run, best_run)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)

    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_config = AutoMLConfig(
            task=tasks,
            X=x_df,
            y=y_df,
            iterations=iterations,
            iteration_timeout_minutes=iteration_timeout_minutes,
            primary_metric=primary_metric,
            #n_cross_validations=n_cross_validations,
            preprocess=True,
        )
        experiment = Experiment(ws, ExperimentName)
        run = experiment.submit(config=automl_config, show_output=True)
        best_run, fitted_model = run.get_output()
        return 'ok'
    except Exception:
        return 'error'
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    time_series_settings = {
        "time_column_name": dt,
        "max_horizon": n_pred,
        "target_lags": "auto",
        "target_rolling_window_size": "auto"
    }
    automl_config = AutoMLConfig(task="forecasting",
                                 training_data=df_train,
                                 label_column_name=target,
                                 n_cross_validations=5,
                                 max_cores_per_iteration=-1,
                                 path=os.environ["SCRATCH"],
                                 experiment_timeout_minutes=time_limit_min,
                                 ensemble_download_models_timeout_sec=3600,
                                 **time_series_settings)

    ws = Workspace.from_config()
    experiment = Experiment(ws, "experiment")
    best_run, fitted_model = experiment.submit(automl_config, show_output=True).get_output()

    print("Best pipeline:")
    try:
        # Ensemble pipelines wrap their member estimators; unwrap if present.
        ensemble = vars(fitted_model.steps[1][1])["_wrappedEnsemble"]
        print(ensemble.__class__)
        steps = ensemble.estimators_
    except Exception:
        steps = fitted_model.steps
    best_pipeline = ""
    for i, step in enumerate(steps):
        best_pipeline += f"{i}. {str(step)}\n"
    print(best_pipeline)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    print(fitted_model.named_steps["timeseriestransformer"].get_engineered_feature_names())
    featurization_summary = fitted_model.named_steps["timeseriestransformer"].get_featurization_summary()
    print(pd.DataFrame.from_records(featurization_summary))

    x_pred = pd.date_range(df_train[dt].iloc[-1],
                           periods=n_pred + 1,
                           freq=pd.infer_freq(df_train[dt]))[1:]
    y_pred = fitted_model.forecast(forecast_destination=x_pred[-1])[0]
    # y_pred = fitted_model.forecast(pd.DataFrame({dt: x_pred}))[0]
    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
def train_model(data_file, random_seed):
    """Train the automl model."""
    target = "utilization"
    df = pd.read_parquet(data_file)
    x = df.loc[:, [c for c in df if c != target]].values
    y = df[target].values
    project_folder = "./automl"
    automl_config = AutoMLConfig(
        task="regression",
        iteration_timeout_minutes=5,
        iterations=10,
        primary_metric="spearman_correlation",
        n_cross_validations=5,
        debug_log="automl.log",
        verbosity=logging.INFO,
        X=x,
        y=y,
        path=project_folder,
    )
    load_dotenv(find_dotenv())
    ws = Workspace(
        workspace_name=getenv("AML_WORKSPACE_NAME"),
        subscription_id=getenv("AML_SUBSCRIPTION_ID"),
        resource_group=getenv("AML_RESOURCE_GROUP"),
    )
    experiment = Experiment(ws, getenv("AML_EXPERIMENT_NAME"))
    local_run = experiment.submit(automl_config, show_output=True)

    # Pick the best non-ensemble child run by its primary-metric score.
    sub_runs = list(local_run.get_children())
    best_run = None
    best_score = 0
    for sub_run in sub_runs:
        props = sub_run.get_properties()
        if props["run_algorithm"] != "Ensemble":
            if float(props["score"]) > best_score:
                best_score = float(props["score"])
                best_run = sub_run
    model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
    best_run.register_model(model_name=model_name, model_path="outputs/model.pkl")
    # best_run, fitted_model = local_run.get_output()
    # local_run.register_model(
    #     description="automl meetup best model"
    # )
    print("Model name is {}".format(model_name))
def submit(self, dispatcher: CollectingDispatcher, tracker: Tracker,
           domain: Dict[Text, Any]) -> List[Dict]:
    """Define what the form has to do after all required slots are filled"""
    task = tracker.get_slot('task')
    data = tracker.get_slot('data')
    column_name = tracker.get_slot('column_name')
    dispatcher.utter_message(template="utter_doing_task", task=task,
                             data=data, column_name=column_name)

    # Load the workspace from the saved config file
    ws = Workspace.from_config()
    print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

    df = pd.read_csv(data)
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
    label = column_name

    automl_config = AutoMLConfig(name='Automated ML Experiment',
                                 task=task,
                                 compute_target='local',
                                 training_data=train_data,
                                 validation_data=test_data,
                                 label_column_name=label,
                                 experiment_timeout_minutes=30,
                                 iterations=6,
                                 primary_metric='AUC_weighted',
                                 featurization='auto')
    automl_experiment = Experiment(ws, 'mslearn-diabetes-automl')
    automl_run = automl_experiment.submit(automl_config)
    best_run, fitted_model = automl_run.get_output()
    best_run_metrics = best_run.get_metrics()
    metric_list = [(name, value) for name, value in best_run_metrics.items()]

    # Report the results to the user before returning.
    print("The best model pipeline for the data is")
    dispatcher.utter_message(text="The best model pipeline for the data is")
    print(fitted_model)
    dispatcher.utter_message(text=str(fitted_model))
    print("The different metrics are")
    dispatcher.utter_message(text="The different metrics are")
    print(metric_list)
    dispatcher.utter_message(text=str(metric_list))

    # A Rasa form action returns a list of events.
    return []
def autoMLRegression(self, x_df, y_df):
    run = Run.get_context()
    experiment = run.experiment

    train_data = pd.concat([x_df, y_df], axis=1)
    column_name = list(y_df)[0]

    automl_config = AutoMLConfig(
        task='regression',
        primary_metric='normalized_root_mean_squared_error',
        experiment_timeout_minutes=15,
        training_data=train_data,
        label_column_name=column_name,
        n_cross_validations=self.k_fold,
        enable_onnx_compatible_models=True,
        model_explainability=True)

    run = experiment.submit(automl_config, show_output=True)
    best_run, fitted_model = run.get_output()
    return best_run, fitted_model
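# Hedged sketch (not from the original): because enable_onnx_compatible_models
# is set above, the best model can also be retrieved in ONNX format from the
# AutoMLRun (`run` inside autoMLRegression); the output path is an assumption.
from azureml.automl.runtime.onnx_convert import OnnxConverter

best_run, onnx_model = run.get_output(return_onnx_model=True)
OnnxConverter.save_onnx_model(onnx_model, './best_model.onnx')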
def train_model(file_path, data, logger):
    file_name = file_path.split('/')[-1][:-4]
    print(file_name)
    logger.info("in train_model")
    print('data')
    print(data.head(5))

    automl_config = AutoMLConfig(training_data=data, **automl_settings)

    logger.info("submit_child")
    local_run = current_step_run.submit_child(automl_config, show_output=False)
    logger.info(local_run)
    print(local_run)
    local_run.wait_for_completion(show_output=True)

    # get_output() returns (best_run, fitted_model); keep only the model here.
    best_run, fitted_model = local_run.get_output()

    return fitted_model, local_run
def train(self):
    automl_settings = {
        "iteration_timeout_minutes": self.iteration_timeout_minutes,
        "iterations": self.max_n_trials,
        "primary_metric": self.metric,
        "verbosity": logging.DEBUG,
        "n_cross_validations": self.cross_validation_folds,
        "enable_stack_ensemble": self.use_ensemble
    }
    self.data_script = "get_data.py"
    self.generate_data_script()
    self.automl_config = AutoMLConfig(task='regression',
                                      debug_log='automl_errors.log',
                                      compute_target=self.compute_cluster,
                                      data_script=self.data_script,
                                      **automl_settings)
    experiment = Experiment(self.ws, 'automl_remote')
    print("Submitting training run: {}:".format(self.ws))
    remote_run = experiment.submit(self.automl_config, show_output=True)
    print("Results of training run: {}:".format(remote_run))
def train_model(file_path, data, automl_settings, current_step_run):
    file_name = file_path.split('/')[-1][:-4]
    print(file_name)
    print("in train_model")
    print('data')
    print(data.head(5))
    print(automl_settings)

    automl_config = AutoMLConfig(training_data=data, **automl_settings)

    print("submit_child")
    local_run = current_step_run.submit_child(automl_config, show_output=True)
    local_run.add_properties({
        k: str(many_model_run_properties[k])
        for k in many_model_run_properties
    })
    print(local_run)

    best_child_run, fitted_model = local_run.get_output()
    return fitted_model, local_run, best_child_run
def setup_training_step(self):
    prepped_data = self.prepped_data_path.parse_parquet_files(file_extension=None)

    project_folder = './automl'
    automl_config = AutoMLConfig(compute_target=self.aml_compute,
                                 task="classification",
                                 training_data=prepped_data,
                                 label_column_name="test_result",
                                 path=project_folder,
                                 enable_early_stopping=True,
                                 featurization='auto',
                                 debug_log="automl_errors.log",
                                 n_cross_validations=10,
                                 **automl_settings)

    ds = self.ws.get_default_datastore()
    metrics_output_name = 'metrics_output'
    best_model_output_name = 'model_output'

    metrics_data = PipelineData(name='metrics_data',
                                datastore=ds,
                                pipeline_output_name=metrics_output_name,
                                training_output=TrainingOutput(type='Metrics'))
    model_data = PipelineData(name='best_model_data',
                              datastore=ds,
                              pipeline_output_name=best_model_output_name,
                              training_output=TrainingOutput(type='Model'))
    self.model_data = model_data

    automl_step = AutoMLStep(name='automl_module',
                             automl_config=automl_config,
                             passthru_automl_config=False,
                             outputs=[metrics_data, model_data],
                             allow_reuse=True)
    return automl_step
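# Hedged sketch (not from the original): wiring the returned AutoMLStep into a
# pipeline run. `obj` stands for an instance of the class defining
# setup_training_step above; the experiment name is an assumption.
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

automl_step = obj.setup_training_step()
pipeline = Pipeline(workspace=obj.ws, steps=[automl_step])
pipeline_run = Experiment(obj.ws, 'automl-pipeline').submit(pipeline)
pipeline_run.wait_for_completion()

# The best model artifact is exposed through the 'model_output' pipeline output.
model_output = pipeline_run.get_pipeline_output('model_output')
model_output.download('.')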
def main():
    print(azureml.core.VERSION)

    dataset_name = getRuntimeArgs()
    run = Run.get_context()
    ws = run.experiment.workspace
    ds = Dataset.get_by_name(workspace=ws, name=dataset_name)

    automl_settings = {
        "task": 'classification',
        "verbosity": logging.INFO,
        "primary_metric": 'accuracy',
        "experiment_timeout_hours": 0.05,
        "n_cross_validations": 3,
        "enable_stack_ensemble": False,
        "enable_voting_ensemble": False,
        "model_explainability": True,
        "preprocess": True,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": 4,
        "training_data": ds,
        "drop_column_names": ['Sno'],
        "label_column_name": 'Risk'
    }
    automl_config = AutoMLConfig(**automl_settings)

    run = run.submit_child(automl_config, show_output=True)
    best_run, fitted_model = run.get_output()

    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    shutil.copy2('automl.log', output_dir)
    # Run objects are not JSON-serializable; persist their details instead.
    with open(output_dir + 'best_run.json', 'w') as f:
        json.dump(best_run.get_details(), f, default=str)
x, y = clean_data(ds1)

# In[5]:

# Split data into train and test sets
from sklearn.model_selection import train_test_split
import pandas as pd

x_train, x_test, y_train, y_test = train_test_split(x, y)
df_train = pd.concat([x_train, y_train], axis=1)

from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=df_train,
    label_column_name="y",
    n_cross_validations=5)

# In[7]:

# Submit your automl run
### YOUR CODE HERE ###
automl_run = exp.submit(automl_config, show_output=True)

# In[8]:
forecasting_parameters = ForecastingParameters(
    time_column_name=time_column_name,
    time_series_id_column_names=time_series_id_column_names,
    target_lags=target_lag,
    feature_lags=feature_lag,
    target_rolling_window_size=window_size,
    seasonality=seasonality,
)

automl_config = AutoMLConfig(
    # featurization_config,
    task="forecasting",
    debug_log="rev_region_forecast_errors.log",
    primary_metric="normalized_root_mean_squared_error",
    experiment_timeout_hours=4,
    training_data=train_dataset,
    label_column_name=target_column_name,
    enable_early_stopping=False,
    #spark_context=sc,  # enable this for databricks cluster
    compute_target=compute_target,  # enable this for ml cluster
    enable_dnn=True,  # enable this for ml cluster
    featurization="auto",
    n_cross_validations=5,
    verbosity=logging.INFO,
    max_concurrent_iterations=9,
    max_cores_per_iteration=-1,
    forecasting_parameters=forecasting_parameters,
)

# COMMAND ----------

# DBTITLE 1,Train
# submit a new training run
from azureml.train.automl.run import AutoMLRun
from azureml.train.automl import AutoMLConfig
from azureml.core.runconfig import RunConfiguration

auto_run_config = RunConfiguration(framework='python')

automl_config = AutoMLConfig(name="Automated ML Experiment",
                             task='classification',
                             primary_metric='AUC_weighted',
                             compute_target=aml_compute,
                             training_data=train_dataset,
                             validation_data=test_dataset,
                             label_column_name='Label',
                             featurization='auto',
                             iterations=12,
                             max_concurrent_iterations=4)
X_train = df_all.loc[df_all['datetime'] < '2015-10-01', :].drop(X_drop, axis=1)
y_train = df_all.loc[df_all['datetime'] < '2015-10-01', Y_keep]
X_test = df_all.loc[df_all['datetime'] > '2015-10-15', :].drop(X_drop, axis=1)
y_test = df_all.loc[df_all['datetime'] > '2015-10-15', Y_keep]

primary_metric = 'AUC_weighted'

automl_config = AutoMLConfig(
    task='classification',
    preprocess=False,
    name=experiment_name,
    debug_log='automl_errors.log',
    primary_metric=primary_metric,
    max_time_sec=1200,
    iterations=2,
    n_cross_validations=2,
    verbosity=logging.INFO,
    X=X_train.values,        # we convert from pandas to numpy arrays using .values
    y=y_train.values[:, 0],  # we convert from pandas to numpy arrays using .values
    path=project_folder,
)

local_run = experiment.submit(automl_config, show_output=True)

# Wait until the run finishes.
local_run.wait_for_completion(show_output=True)

# create new AutoMLRun object to ensure everything is in order
ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)
def main(
    workspace=None,
    dataset_trainandvalidate_name=config.get_default_dataset_name("trainandvalidate"),
):
    """
    Return AutoMLConfig
    """
    if not workspace:
        workspace = package_utils.get_workspace()

    args = aml_compute.parse_args()
    cluster_max_nodes = 5
    args.cluster_max_nodes = cluster_max_nodes
    args.cluster_sku = "Standard_D12_v2"
    compute_target = aml_compute.main(args)
    logger.info(msg="main", extra={"compute_target": compute_target.serialize()})

    trainandvalidate = Dataset.get_by_name(
        workspace=workspace,
        name=dataset_trainandvalidate_name,
    )

    model_settings = {
        "task": "classification",
        "primary_metric": "norm_macro_recall",
    }

    ensemble_settings = {
        "iterations": 15,
        "allowed_models": ["LightGBM", "LogisticRegression", "SGD", "XGBoostClassifier"],
        "enable_voting_ensemble": True,
        "enable_stack_ensemble": False,
    }

    dataset_settings = {
        "validation_size": 0.3,
        "featurization": "auto",
        "training_data": trainandvalidate,
        "label_column_name": "Label",
    }

    compute_settings = {
        "compute_target": compute_target,
        "max_cores_per_iteration": -1,
        "max_concurrent_iterations": cluster_max_nodes,
        "experiment_timeout_hours": 1.5,
    }

    automl_config = AutoMLConfig(
        **model_settings,
        **ensemble_settings,
        **dataset_settings,
        **compute_settings,
    )

    return automl_config
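# Hedged usage sketch (not from the original): submit the returned config;
# the experiment name is an assumption.
from azureml.core import Experiment

automl_config = main()
experiment = Experiment(package_utils.get_workspace(), 'automl-classification')
remote_run = experiment.submit(automl_config, show_output=False)
remote_run.wait_for_completion()
best_run, fitted_model = remote_run.get_output()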
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)

    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
    print(y_df)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }
        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=os.getcwd(),
            #compute_target='Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )
        experiment = Experiment(ws, 'automl_local_v2')
        remote_run = experiment.submit(automl_config, show_output=True)

        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata_toJson = rundata.to_json(orient='columns')
        return rundata_toJson
    except Exception:
        return 'error'
ws = run.experiment.workspace
def_data_store = ws.get_default_datastore()

# Choose a name for the experiment and specify the project folder.
experiment_name = 'automl-local-classification'
project_folder = './sample_projects/automl-local-classification'

experiment = Experiment(ws, experiment_name)

primary_metric = 'accuracy'

automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             primary_metric=primary_metric,
                             iteration_timeout_minutes=60,
                             iterations=2,
                             n_cross_validations=3,
                             verbosity=logging.INFO,
                             X=X_train,
                             y=y_train,
                             path=project_folder)

local_run = experiment.submit(automl_config, show_output=True)

# Wait until the run finishes.
local_run.wait_for_completion(show_output=True)

# create new AutoMLRun object to ensure everything is in order
ml_run = AutoMLRun(experiment=experiment, run_id=local_run.id)

# aux function for comparing performance of runs (quick workaround for automl's _get_max_min_comparator)
workspace = Workspace.from_config(auth=servicePrincipalAuth)

compute_manager = BatchAIManager(workspace)
compute_target = compute_manager.get_or_create(compute_target_name)

print('Prepare environment and code')
script_folder = './training'
shutil.copy('get_data.py', script_folder)

automl_settings = {
    "max_time_sec": 120,
    "iterations": 20,
    "n_cross_validations": 5,
    "primary_metric": 'AUC_weighted',
    "blacklist_algos": ['kNN', 'LinearSVM'],
    "preprocess": False,
    "concurrent_iterations": 5,
    "verbosity": logging.INFO
}

automl_config = AutoMLConfig(task='classification',
                             debug_log='automl_errors.log',
                             path='.',
                             compute_target=compute_target,
                             data_script=script_folder + '/get_data.py',
                             **automl_settings)

experiment = Experiment(workspace=workspace, name='fashionMNIST_autoML')
remote_run = experiment.submit(automl_config, show_output=False)
get_ipython().system('pip3 install --upgrade azureml-sdk azureml-contrib-run')

# In[1]:

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
import logging

# time_series_settings, X_train and y_train are assumed to be defined in earlier cells
automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             iterations=10,
                             X=X_train,
                             y=y_train,
                             n_cross_validations=5,
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             **time_series_settings)

ws = Workspace.from_config()
experiment = Experiment(ws, "forecasting_example")
local_run = experiment.submit(automl_config, show_output=True)
best_run, fitted_model = local_run.get_output()

# In[ ]:
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
pd.set_option('display.max_colwidth', None)
pd.DataFrame(data=output, index=['']).T

# get_data script does this now
csv_file = "../data/" + experiment_name + ".csv"

automl_settings = {
    "iteration_timeout_minutes": 10,
    "iterations": 30,
    "primary_metric": 'spearman_correlation',
    "preprocess": True,
    "verbosity": logging.DEBUG,
    "n_cross_validations": 5
}

dflow = dprep.read_csv(path=csv_file)

automl_config = AutoMLConfig(task='regression',
                             debug_log='automl_errors.log',
                             path=project_folder,
                             compute_target=compute_target,
                             data_script="get_data.py",
                             **automl_settings)

experiment = Experiment(ws, 'automl_remote')
remote_run = experiment.submit(automl_config, show_output=True)
cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                              conda_packages=['numpy'])
conda_run_config.environment.python.conda_dependencies = cd

automl_settings = {
    "iteration_timeout_minutes": 60,
    "iterations": 100,
    "n_cross_validations": 5,
    "primary_metric": 'AUC_weighted',
    "preprocess": True,
    "max_cores_per_iteration": 2
}

automl_config = AutoMLConfig(task='classification',
                             path=project_folder,
                             run_configuration=conda_run_config,
                             data_script=project_folder + "/get_data.py",
                             **automl_settings)

remote_run = experiment.submit(automl_config)

# Canceling runs
#
# You can cancel ongoing remote runs using the *cancel()* and *cancel_iteration()* functions

print(remote_run.id)
time.sleep(180)

# Cancel the ongoing experiment and stop scheduling new iterations
remote_run.cancel()
automl_settings = {
    "name": "AutoML_Demo_Experiment_{0}".format(time.time()),
    "iteration_timeout_minutes": 10,
    "iterations": 20,
    "n_cross_validations": 5,
    "primary_metric": 'AUC_weighted',
    "preprocess": False,
    "max_concurrent_iterations": 10,
    "verbosity": logging.INFO
}

automl_config = AutoMLConfig(
    task='classification',
    debug_log='automl_errors.log',
    path=project_folder,
    compute_target=compute_target,
    run_configuration=run_config,
    X=X,  # use the remote uploaded data
    y=y,
    **automl_settings,
)

# ## Train
#
# Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.
#
# |Property|Description|
# |-|-|
# |**task**|classification or regression|
# |**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|
# |**X**|(sparse) array-like, shape = [n_samples, n_features]|
# |**y**|(sparse) array-like, shape = [n_samples, ], Multi-class targets.|
p = ESMLProject.get_project_from_env_command_line()  # self-aware about its config sources
p.describe()

cli_auth = AzureCliAuthentication()
ws = p.get_workspace_from_config(cli_auth)  # Reads the current environment (dev,test,prod) config.json | Use CLI auth if MLOps
p.init(ws)  # Automapping from datalake to Azure ML datasets, prints status

# TRAIN MODEL
automl_performance_config = p.get_automl_performance_config()  # 1) Get config
aml_compute = p.get_training_aml_compute(ws)  # 2) Get compute, for active environment

label = "Y"
train_6, validate_set_2, test_set_2 = p.split_gold_3(0.6, label)  # 3) Auto register in Azure (M03_GOLD_TRAIN | M03_GOLD_VALIDATE | M03_GOLD_TEST)

automl_config = AutoMLConfig(task='regression',  # 4)
                             primary_metric=azure_metric_regression.MAE,  # Note: !MAPE
                             experiment_exit_score=0.208,  # DEMO purpose; a float, not a string
                             compute_target=aml_compute,
                             training_data=p.GoldTrain,
                             label_column_name=label,
                             **automl_performance_config)

train_as_pipeline = False
best_run, fitted_model, experiment = None, None, None  # Consistent return values from both AutoML ALTERNATIVES

if train_as_pipeline:
    print("train_as_pipeline")
    best_run, fitted_model, experiment = AutoMLFactory(p).train_pipeline(automl_config)  # 5) Train model
else:
    print("train_as_run")
    best_run, fitted_model, experiment = AutoMLFactory(p).train_as_run(automl_config)
test = dataset.time_between(datetime(2017, 8, 8, 6), datetime(2017, 8, 10, 5))
test.to_pandas_dataframe().reset_index(drop=True).head(5)

max_horizon = 12

automl_settings = {
    'time_column_name': time_column_name,
    'max_horizon': max_horizon,
}

automl_config = AutoMLConfig(
    task='forecasting',
    primary_metric='normalized_root_mean_squared_error',
    blacklist_models=['ExtremeRandomTrees', 'AutoArima', 'Prophet'],
    experiment_timeout_hours=0.3,
    training_data=train,
    label_column_name=target_column_name,
    compute_target=compute_target,
    enable_early_stopping=True,
    n_cross_validations=3,
    verbosity=logging.INFO,
    **automl_settings)

remote_run = experiment.submit(automl_config, show_output=False)
remote_run
remote_run.wait_for_completion()

best_run, fitted_model = remote_run.get_output()
fitted_model.steps
fitted_model.named_steps['timeseriestransformer'].get_engineered_feature_names()
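# Hedged sketch (not from the original): forecast over the held-out `test`
# range with the fitted pipeline; the column handling is an assumption.
X_test = test.to_pandas_dataframe().reset_index(drop=True)
y_actual = X_test.pop(target_column_name).values  # keep actuals aside for comparison

y_predictions, X_trans = fitted_model.forecast(X_test)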
x = dprep.read_parquet_file(ds.path('model_data_x.parquet'))
y = dprep.read_parquet_file(ds.path('model_data_y.parquet')).to_long(
    dprep.ColumnSelector(term='.*', use_regex=True))

project_folder = './automl'

automl_config = AutoMLConfig(
    task="regression",
    iteration_timeout_minutes=10,
    iterations=10,
    primary_metric="r2_score",
    n_cross_validations=5,
    debug_log="automl.log",
    verbosity=logging.INFO,
    spark_context=sc,  # noqa
    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
        "ExtremeRandomTrees",
        "LightGBM",
    ],
    blacklist_models=["ensemble"],
    X=x,
    y=y,
    path=project_folder,
)

experiment = Experiment(ws, "host-ml-nt-ai-meetup")
db_run = experiment.submit(automl_config, show_output=True)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)

    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y_df)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']
    best_model = request.json['best_model']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }
        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path='D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log',
            #compute_target='Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )
        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = remote_run.get_output()
        #print(best_run)
        print(best_run.get_file_names())

        # Register the model
        from datetime import date
        model = best_run.register_model(model_name=best_model + str(date.today()),
                                        model_path='outputs/model.pkl')
        print(model.name, model.id, model.version, sep='\t')

        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        }, inplace=True)
        rundata_toJson = rundata.to_json(orient='columns')
        print(rundata_toJson)
        return rundata_toJson
    except Exception:
        return 'error'