def auto_train_model(ws, experiment_name, model_name, full_X, full_Y,
                     training_set_percentage, training_target_accuracy):
    # Start a training run by defining an experiment
    experiment = Experiment(ws, experiment_name)
    train_X, test_X, train_Y, test_Y = train_test_split(
        full_X, full_Y, train_size=training_set_percentage, random_state=42)
    train_Y_array = train_Y.values.flatten()

    # Configure the automated ML job.
    # The model training is configured to run on the local machine.
    # The values for all settings are documented at
    # https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train
    # Notice we no longer have to scale the input values: automated ML tries
    # various data-scaling approaches automatically.
    automl_config = AutoMLConfig(task='classification',
                                 primary_metric='accuracy',
                                 max_time_sec=12000,
                                 iterations=20,
                                 n_cross_validations=3,
                                 exit_score=training_target_accuracy,
                                 blacklist_algos=['kNN', 'LinearSVM'],
                                 X=train_X,
                                 y=train_Y_array,
                                 path='./04-automl/outputs')

    # Execute the job
    run = experiment.submit(automl_config, show_output=True)

    # Get the run with the highest accuracy value
    best_run, best_model = run.get_output()

    return (best_model, run, best_run)
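# Usage sketch (not from the original source): assumes an existing Workspace
# `ws` and pandas feature/label frames `full_X`/`full_Y`; the experiment and
# model names below are hypothetical.
best_model, run, best_run = auto_train_model(
    ws,
    experiment_name='usedcars-automl',    # hypothetical name
    model_name='usedcars-classifier',     # hypothetical name
    full_X=full_X,
    full_Y=full_Y,
    training_set_percentage=0.75,
    training_target_accuracy=0.90)
print(best_run.id)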
def RunAutoML():
    automl_settings = {
        "name": "AutoML_Demo_Experiment",
        "iteration_timeout_minutes": 15,
        "iterations": 3,
        "n_cross_validations": 5,
        # Note: r2_score is a regression metric; for a classification task a
        # metric such as 'AUC_weighted' would normally be used here.
        "primary_metric": 'r2_score',
        "max_concurrent_iterations": 8,
        "verbosity": logging.INFO
    }
    subscription_id = request.json['subscription_id']
    print(userData)
    print(userData[subscription_id])
    try:
        # "preprocess" must appear only once: passing it both in automl_settings
        # and as an explicit keyword raises a duplicate-argument TypeError, so
        # it was removed from the settings dict above.
        automl_config = AutoMLConfig(task="classification",
                                     X=userData[subscription_id][1],
                                     y=userData[subscription_id][2],
                                     debug_log='automl_errors.log',
                                     preprocess=True,
                                     **automl_settings)
        experiment = Experiment(userData[subscription_id][0], 'automl_remote')
        run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = run.get_output()
        return 'ok'
    except Exception:
        return 'error'
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_config = AutoMLConfig(
            task=tasks,
            X=x_df,
            y=y_df,
            iterations=iterations,
            iteration_timeout_minutes=iteration_timeout_minutes,
            primary_metric=primary_metric,
            #n_cross_validations=n_cross_validations,
            preprocess=True,
        )
        experiment = Experiment(ws, ExperimentName)
        run = experiment.submit(config=automl_config, show_output=True)
        best_run, fitted_model = run.get_output()
        return 'ok'
    except Exception:
        return 'error'
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    time_series_settings = {
        "time_column_name": dt,
        "max_horizon": n_pred,
        "target_lags": "auto",
        "target_rolling_window_size": "auto"
    }
    automl_config = AutoMLConfig(task="forecasting",
                                 training_data=df_train,
                                 label_column_name=target,
                                 n_cross_validations=5,
                                 max_cores_per_iteration=-1,
                                 path=os.environ["SCRATCH"],
                                 experiment_timeout_minutes=time_limit_min,
                                 ensemble_download_models_timeout_sec=3600,
                                 **time_series_settings)

    ws = Workspace.from_config()
    experiment = Experiment(ws, "experiment")
    best_run, fitted_model = experiment.submit(automl_config,
                                               show_output=True).get_output()

    print("Best pipeline:")
    try:
        # Ensemble models wrap their constituent estimators
        ensemble = vars(fitted_model.steps[1][1])["_wrappedEnsemble"]
        print(ensemble.__class__)
        steps = ensemble.estimators_
    except Exception:
        steps = fitted_model.steps
    best_pipeline = ""
    for i, step in enumerate(steps):
        best_pipeline += f"{i}. {str(step)}\n"
    print(best_pipeline)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)  # -1 is deprecated; None means unlimited
    print(fitted_model.named_steps["timeseriestransformer"]
          .get_engineered_feature_names())
    featurization_summary = fitted_model.named_steps[
        "timeseriestransformer"].get_featurization_summary()
    print(pd.DataFrame.from_records(featurization_summary))

    x_pred = pd.date_range(df_train[dt].iloc[-1],
                           periods=n_pred + 1,
                           freq=pd.infer_freq(df_train[dt]))[1:]
    y_pred = fitted_model.forecast(forecast_destination=x_pred[-1])[0]
    # y_pred = fitted_model.forecast(pd.DataFrame({dt: x_pred}))[0]
    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
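# Hypothetical invocation sketch (not from the original script): the file and
# column names below are illustrative placeholders.
if __name__ == "__main__":
    main(train_path="train.csv",        # hypothetical input CSV
         pred_path="predictions.csv",   # hypothetical output CSV
         n_pred=24,                     # forecast 24 periods ahead
         dt="timestamp",                # hypothetical datetime column name
         target="demand",               # hypothetical target column name
         time_limit_min=60)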
def train_model(data_file, random_seed):
    """Train the automl model."""
    target = "utilization"
    df = pd.read_parquet(data_file)
    x = df.loc[:, [c for c in df if c != target]].values
    y = df[target].values

    project_folder = "./automl"
    automl_config = AutoMLConfig(
        task="regression",
        iteration_timeout_minutes=5,
        iterations=10,
        primary_metric="spearman_correlation",
        n_cross_validations=5,
        debug_log="automl.log",
        verbosity=logging.INFO,
        X=x,
        y=y,
        path=project_folder,
    )

    load_dotenv(find_dotenv())
    ws = Workspace(
        workspace_name=getenv("AML_WORKSPACE_NAME"),
        subscription_id=getenv("AML_SUBSCRIPTION_ID"),
        resource_group=getenv("AML_RESOURCE_GROUP"),
    )
    experiment = Experiment(ws, getenv("AML_EXPERIMENT_NAME"))
    local_run = experiment.submit(automl_config, show_output=True)

    # Pick the best non-ensemble child run by score
    sub_runs = list(local_run.get_children())
    best_run = None
    best_score = 0
    for sub_run in sub_runs:
        props = sub_run.get_properties()
        if props["run_algorithm"] != "Ensemble":
            if float(props["score"]) > best_score:
                best_run = sub_run
                best_score = float(props["score"])  # track the best score seen so far

    model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
    best_run.register_model(model_name=model_name, model_path="outputs/model.pkl")
    # best_run, fitted_model = local_run.get_output()
    # local_run.register_model(
    #     description="automl meetup best model"
    # )
    print("Model name is {}".format(model_name))
def submit(self, dispatcher: CollectingDispatcher, tracker: Tracker,
           domain: Dict[Text, Any]) -> List[Dict]:
    """Define what the form has to do after all required slots are filled."""
    task = tracker.get_slot('task')
    data = tracker.get_slot('data')
    column_name = tracker.get_slot('column_name')
    dispatcher.utter_message(template="utter_doing_task",
                             task=task, data=data, column_name=column_name)

    # Load the workspace from the saved config file
    ws = Workspace.from_config()
    print('Ready to use Azure ML {} to work with {}'.format(
        azureml.core.VERSION, ws.name))

    df = pd.read_csv(data)
    train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
    label = column_name

    automl_config = AutoMLConfig(name='Automated ML Experiment',
                                 task=task,
                                 compute_target='local',
                                 training_data=train_data,
                                 validation_data=test_data,
                                 label_column_name=label,
                                 experiment_timeout_minutes=30,
                                 iterations=6,
                                 primary_metric='AUC_weighted',
                                 featurization='auto')
    automl_experiment = Experiment(ws, 'mslearn-diabetes-automl')
    automl_run = automl_experiment.submit(automl_config)
    best_run, fitted_model = automl_run.get_output()

    best_run_metrics = best_run.get_metrics()
    metric_list = []
    for metric_name in best_run_metrics:
        metric = best_run_metrics[metric_name]
        metric_list.append((metric_name, metric))

    # Report the results before returning: the original returned first, which
    # left the messages below unreachable and referenced undefined names.
    print("The best model pipeline for the data is")
    dispatcher.utter_message(text="The best model pipeline for the data is")
    print(fitted_model)
    dispatcher.utter_message(text=str(fitted_model))
    print("The different metrics are")
    dispatcher.utter_message(text="The different metrics are")
    print(metric_list)
    dispatcher.utter_message(text=str(metric_list))

    return []
def main(force_model_register: bool, skip_model_register: bool,
         submit_pipeline: bool, publish_pipeline: bool, experiment_name: str,
         debug_run: bool, dbx_cluster_name: str, aml_compute_name: str,
         input_dataset_name: str, validation_dataset_name: str):
    pipeline: Pipeline = create_pipeline(
        debug_run=debug_run,
        dbx_compute=dbx_cluster_name,
        aml_compute=aml_compute_name,
        input_dataset=input_dataset_name,
        validation_dataset=validation_dataset_name)
    pipeline.validate()

    if submit_pipeline and not publish_pipeline:
        exp = Experiment(WS, experiment_name)
        exp.submit(pipeline, pipeline_parameters={
            "force_registration": str(force_model_register),
            "skip_registration": str(skip_model_register)
        })

    if publish_pipeline:
        published_pipeline: PublishedPipeline = pipeline.publish(
            name="Driver Safety Pipeline",
            description="Training Pipeline for new driver safety model")
        if submit_pipeline:
            published_pipeline.submit(workspace=WS,
                                      experiment_name=experiment_name,
                                      pipeline_parameters={
                                          "force_registration": str(force_model_register),
                                          "skip_registration": str(skip_model_register)
                                      })
        sys.stdout.write(published_pipeline.id)
def run_pipeline(self, params):
    """
    run_pipeline - Submit a pipeline job.

    Uses self.ws (AML Workspace), self.pipeline (AML Pipeline) and
    self.pipeline_name set on the instance.

    :param dict params: Pipeline parameters passed to the run.
    :returns: The submitted pipeline run.
    :rtype: Run
    """
    # Submit the pipeline to be run
    exp = Experiment(self.ws, self.pipeline_name)
    pipeline_run = exp.submit(self.pipeline, pipeline_parameters=params)
    return pipeline_run
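# Usage sketch (an assumption, not from the original source): `PipelineRunner`
# is a hypothetical name for the enclosing class, constructed with a Workspace,
# a built Pipeline, and a pipeline_name.
runner = PipelineRunner(ws=ws, pipeline=pipeline, pipeline_name="train-pipeline")
pipeline_run = runner.run_pipeline({"force_registration": "False"})
pipeline_run.wait_for_completion(show_output=True)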
class _InnerAutomatedMLModel():
    # Inner single-output model that the wrapper passes into MultiOutputRegressor
    def __init__(self, automl_config, workspace,
                 experiment_name_prefix="aml_experiment"):
        self._show_output = automl_config._show_output
        self._workspace = workspace
        self._automl_config = automl_config
        self._experiment_name_prefix = experiment_name_prefix

    def get_params(self, deep=True):
        # Must be implemented for MultiOutputRegressor to view
        # _InnerAutomatedMLModel as an sklearn estimator
        return {
            'workspace': self._workspace,
            'automl_config': self._automl_config,
            'experiment_name_prefix': self._experiment_name_prefix
        }

    def fit(self, X, y, sample_weight=None):
        # fit implementation for a single-output model:
        # create an experiment in the specified workspace
        automl_config = copy.deepcopy(self._automl_config)
        current_time = time.localtime()
        current_time_string = time.strftime('%y_%m_%d-%H_%M_%S', current_time)
        experiment_name = self._experiment_name_prefix + "_" + current_time_string
        self._experiment = Experiment(self._workspace, experiment_name)
        # Configure automl_config with training set information
        automl_config.user_settings['X'] = X
        automl_config.user_settings['y'] = y
        automl_config.user_settings['sample_weight'] = sample_weight
        # Wait for the run to complete, then set the model
        print("Experiment " + experiment_name + " has started.")
        local_run = self._experiment.submit(automl_config,
                                            show_output=self._show_output)
        print("Experiment " + experiment_name + " completed.")
        _, self._model = local_run.get_output()

    def predict(self, X):
        return self._model.predict(X)

    def predict_proba(self, X):
        return self._model.predict_proba(X)
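# A minimal sketch (assumes `automl_config`, `ws`, and the train/test arrays
# already exist): sklearn's MultiOutputRegressor clones the inner estimator via
# get_params() and fits one AutoML run per output column of Y_train.
from sklearn.multioutput import MultiOutputRegressor

inner = _InnerAutomatedMLModel(automl_config, ws)
multi_model = MultiOutputRegressor(inner)
multi_model.fit(X_train, Y_train)   # Y_train: one column per target
Y_pred = multi_model.predict(X_test)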
"preprocess": True, "verbosity": logging.INFO, "n_cross_validations": 10 } # AutoML object for running experiment automated_ml_config = aml.AutoMLConfig(task='regression', debug_log='automated_ml_errors.log', path='./automated-ml-regression', X=trainingx_df.values, y=trainingy_df.values.flatten(), model_explainability=True, **automl_settings) # Submit experiment to get AutoMLRun object local_run = experiment.submit(automated_ml_config, show_output=True) # Best pipeline, Model from the best pipeline, in the bunch of runs(experiment) best_run, fitted_model = local_run.get_output() # this tells which algorithm was used in the model from best pipeline print(best_run.get_details()) # Predicting vlaues in a list for test data y_predict = fitted_model.predict(testx_df.values) # Printing the predictions to csv f = open('predict2014.csv', 'w') # Mean Absolute Error mae = mean_absolute_error(testy_df, y_predict) # Range of y in training data
with open("config/aml_config.json") as f: config = json.load(f) workspace_name = config["workspace_name"] resource_group = config["resource_group"] subscription_id = config["subscription_id"] workspace_region = config["location"] #Interactive Authentication ws = Workspace(workspace_name=workspace_name, subscription_id=subscription_id, resource_group=resource_group, auth=cli_auth) local_run = RunConfiguration() local_run.environment.python.user_managed_dependencies = True ############# Experiement local-gbr-turbofan ###################### experiement_name = 'gbr-turbofan' exp = Experiment(workspace=ws, name=experiement_name) src = ScriptRunConfig(source_directory='compute/', script='01-train.py', run_config=local_run) run = exp.submit(src, tags={"build number": sys.argv[1]}) run.wait_for_completion(show_output=True)
from azureml.pipeline.core import PublishedPipeline
from azureml.core.experiment import Experiment
from azureml.core import Workspace

workspace = Workspace.from_config()

published_pipeline_id = ""
is_debug = True
debug_relay_connection_name = "test"

if published_pipeline_id is None or published_pipeline_id == "":
    raise ValueError("Initialize published_pipeline_id")

pipeline_parameters = {"is_debug": is_debug}
if is_debug:
    if debug_relay_connection_name == "":
        raise ValueError("Hybrid connection name cannot be empty!")
    pipeline_parameters.update(
        {"debug_relay_connection_name": debug_relay_connection_name})

experiment = Experiment(workspace, "Pipeline_debug_experiment")
published_pipeline = PublishedPipeline.get(workspace=workspace,
                                           id=published_pipeline_id)
experiment.submit(published_pipeline, pipeline_parameters=pipeline_parameters)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y_df)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']
    best_model = request.json['best_model']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }
        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path='D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log',
            #compute_target='Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings)
        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = remote_run.get_output()
        #print(best_run)
        print(best_run.get_file_names())

        # Register the model
        from datetime import date
        model = best_run.register_model(model_name=best_model + str(date.today()),
                                        model_path='outputs/model.pkl')
        print(model.name, model.id, model.version, sep='\t')

        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {k: v for k, v in run.get_metrics().items()
                       if isinstance(v, float)}
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        # rename() takes "columns", not "column"; the label for 7 was a typo
        rundata.rename(columns={0: "one", 1: "two", 2: "three", 3: "four",
                                4: "five", 5: "six", 6: "seven", 7: "eight",
                                8: "nine", 9: "ten"},
                       inplace=True)
        rundata_toJson = rundata.to_json(orient='columns')
        print(rundata_toJson)
        return rundata_toJson
    except Exception:
        return 'error'
print("Training the model...") # configure Auto ML automl_config = AutoMLConfig(task='classification', debug_log='automl_errors.log', primary_metric='AUC_weighted', iteration_timeout_minutes=2, iterations=20, n_cross_validations=5, preprocess=False, max_concurrent_iterations=5, verbosity=logging.INFO, path=project_folder, compute_target=batch_ai_compute, data_script=project_folder + "/get_data.py") remote_run = experiment.submit(automl_config, show_output=False) remote_run.wait_for_completion(show_output=True) # Retrieve All Child Runs print("Retrieving All Child Runs") children = list(remote_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(1)
automl_config = AutoMLConfig(compute_target=aml_compute,
                             path=os.path.realpath(scripts_folder),
                             data_script='get_data.py',
                             **automl_settings)

train_step = AutoMLStep(name='AutoML_Classification',
                        automl_config=automl_config,
                        inputs=[output_split_train_x, output_split_train_y],
                        allow_reuse=True)

print("Building pipeline")
pipeline_steps = [train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

print("Submitting pipeline")
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)

print("Waiting for pipeline completion")
pipeline_run.wait_for_completion()


def get_download_path(download_path, output_name):
    output_folder = os.listdir(download_path + '/azureml')[0]
    path = download_path + '/azureml/' + output_folder + '/' + output_name
    return path


def fetch_df(step, output_name):
    output_data = step.get_output_data(output_name)
    download_path = './outputs/' + output_name
def run(workspace, config, args):
    compute_target_name = config['train']['compute_target_name']
    data_folder = config['train']['data_folder']

    try:
        compute_target = ComputeTarget(workspace=workspace,
                                       name=compute_target_name)
        print('found existing:', compute_target.name)
    except ComputeTargetException:
        print('creating new.')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=config['train']['vm_size'], min_nodes=0, max_nodes=1)
        compute_target = ComputeTarget.create(workspace, compute_target_name,
                                              compute_config)
        compute_target.wait_for_completion(show_output=True)

    # ds = Datastore.register_azure_blob_container(
    #     workspace,
    #     datastore_name=config['train']['datastore_name'],
    #     account_name=config['train']['account_name'],
    #     account_key=config['train']['account_key'],
    #     container_name=config['train']['container_name'],
    #     overwrite=True)
    #
    # # Upload local "data" folder (incl. files) as "tfdata" folder
    # ds.upload(
    #     src_dir=config['train']['local_directory'],
    #     target_path=data_folder,
    #     overwrite=True)

    ds = Datastore.get(workspace,
                       datastore_name=config['train']['datastore_name'])

    # Generate data reference configuration
    dr_conf = DataReferenceConfiguration(
        datastore_name=ds.name,
        path_on_datastore=data_folder,
        mode='mount')  # set 'download' if you copy all files instead of mounting

    run_config = RunConfiguration(
        framework="python",
        conda_dependencies=CondaDependencies.create(
            conda_packages=ast.literal_eval(config['train']['conda_packages'])))
    run_config.target = compute_target.name
    run_config.data_references = {ds.name: dr_conf}
    run_config.environment.docker.enabled = True
    # run_config.environment.docker.gpu_support = True
    run_config.environment.docker.base_image = DEFAULT_GPU_IMAGE

    src = ScriptRunConfig(
        source_directory='./script',
        script='train.py',
        run_config=run_config,
        arguments=[
            '--datadir', str(ds.as_mount()),
            '--step', args.step,
            '--train_on', args.train_on,
            '--fold', args.fold,
            '--epochs', args.epochs,
            '--experiment', args.experiment,
            '--reference', args.reference,
            '--batchsize', args.batchsize,
            '--optimizertype', args.optimizertype,
            '--convrnn_filters', args.convrnn_filters,
            '--learning_rate', args.learning_rate,
            '--pix250m', args.pix250m
        ])

    # exp = Experiment(workspace=ws, name='test20181210-09')
    exp = Experiment(workspace=workspace,
                     name=config['train']['experiment_name'])
    run = exp.submit(config=src)
    run.wait_for_completion(show_output=True)
#
# - Create an experiment to run.
# - Submit the experiment.
# - Wait for the run to complete.

# ### Create the experiment

# In[ ]:


experiment = Experiment(ws, experiment_name)


# ### Submit the experiment

# In[ ]:


run = experiment.submit(keras_est)


# Wait for the run to complete by executing the following cell. Note that this process will perform the following:
# - Build and deploy the container to Azure Machine Learning compute (~8 minutes)
# - Execute the training script (~2 minutes)
#
# If you change only the training script and re-submit, it will run faster the second time because the necessary container is already prepared, so the time required is just that for executing the training script.

# In[ ]:


run.wait_for_completion(show_output=True)


# ## Download the model files from the run
# In the training script, the Keras model is saved into two files, model.json and model.h5, in the outputs/models folder on the GPU cluster AmlCompute node. Azure ML automatically uploads anything written to the ./outputs folder into the run history file store. Subsequently, we can use the run object to download the model files. They are under the outputs/model folder in the run history file store, and are downloaded into a local folder named model.
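# The cell below is an illustrative sketch of that download step, assuming the
# run wrote model.json and model.h5 under outputs/model as described above.

# In[ ]:


import os

os.makedirs('model', exist_ok=True)
for file_name in ['model.json', 'model.h5']:
    run.download_file(name='outputs/model/' + file_name,
                      output_file_path=os.path.join('model', file_name))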
def peptide_identification(args):
    print(datetime.now(), ': Peptide identification starts...')
    print('Settings: ')
    print(args)

    # PLATO settings
    subclusterCount = args.subclusterCount
    spy = args.spy
    spy_portion = args.spy_portion
    RN = args.RN
    rnd_all = args.rnd_all          # If random method, include all decoys
    rnd_portion = args.rnd_portion  # If random method, include rnd_portion of the positive set; default 1: pos set = neg set
    replicates_cnt = args.replicates_cnt
    include_label = args.include_label
    AML_preprocess = args.AML_preprocess
    output_folder = args.output_folder

    # AutoML parameter settings
    autoML_best_model_selection = args.autoML_best_model_selection
    autoML_iterations = args.autoML_iterations
    metric = args.metric  # Other metrics: azureml.train.automl.utilities.get_primary_metrics('classification')
    cv_fold = args.cv_fold

    # Input, output
    file_name = args.sample_name
    input_path = args.input_folder
    output_path = output_folder + '/' + file_name
    log_file = output_path + '_autoML_errors_log.html'

    # Instantiate the AutoML config and create an experiment in the AutoML workspace
    ws = Workspace.from_config()
    experiment_name = file_name
    experiment = Experiment(ws, experiment_name)
    print(datetime.now(), ': Assigned experiment ' + experiment_name + ' on the Azure portal')

    output = {}
    output['SDK version'] = azureml.core.VERSION
    output['Workspace Name'] = ws.name
    output['Resource Group'] = ws.resource_group
    output['Location'] = ws.location
    outputDf = pd.DataFrame(data=output, index=[''])
    print(outputDf)

    print(datetime.now(), ': Reading inputs')
    # Read POSITIVES and ALL inputs
    positives_path = glob.glob(input_path + file_name + '*POSITIVES*')
    raw_positives = pd.read_csv(positives_path[0], sep='\t')
    if AML_preprocess == True:
        all_path = glob.glob(input_path + file_name + '-ALL.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')
        # Extract new features:
        # first and last three amino acids of the peptide sequences as features - if NA, then B category
        raw_all['Peptide'] = raw_all.Peptide.str.replace(
            r'([\(\[]).*?([\)\]])', r'B', regex=True)
        raw_all['P1'] = raw_all['Peptide'].str[0]
        raw_all['P2'] = raw_all['Peptide'].str[2]
        raw_all['P3'] = raw_all['Peptide'].str[3]
        raw_all['P4'] = raw_all['Peptide'].str[-4]
        raw_all['P5'] = raw_all['Peptide'].str[-3]
        raw_all['P6'] = raw_all['Peptide'].str[-1]
    else:
        all_path = glob.glob(input_path + file_name + '_percolator_feature.txt')
        raw_all = pd.read_csv(all_path[0], sep='\t')

    raw_all['Class'] = 0
    # Make positive and test sets
    test_data = raw_all.drop(['ScanNr', 'Proteins'], axis=1)
    positive_set = pd.merge(left=pd.DataFrame(raw_positives['SpecId']),
                            right=pd.DataFrame(test_data),
                            how='left', left_on='SpecId', right_on='SpecId')
    positive_set['Class'] = 1
    # Remove decoys in the positive set, if there are any
    decoys_in_positive_idx = positive_set.index[positive_set['Label'] == -1].tolist()
    positive_set = positive_set[positive_set['Label'] != -1]

    # DataFrame to store predictions
    all_predictions = pd.DataFrame({
        'SpecId': list(test_data['SpecId']),
        'Peptide': list(test_data['Peptide']),
        'Label': list(test_data['Label'])
    })
    prediction_summary = all_predictions

    # Prepare the test set for modeling
    y_test = test_data['Class']
    if include_label == True:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
    else:
        X_test = test_data.drop(['SpecId', 'Peptide', 'Label', 'Class'], axis=1)

    # Prepare the positive set for modeling
    positive_set_idx = [
        test_data['SpecId'].tolist().index(x)
        for x in positive_set['SpecId'].tolist()
        if x in test_data['SpecId'].tolist()
    ]

    # Used to create the negative set
    decoys_idx = np.setdiff1d(
        test_data.index[test_data['Label'] == -1].tolist(),
        decoys_in_positive_idx).tolist()

    global gower_dist_avg
    if RN == True:
        if os.path.exists(input_path + file_name + 'gower_dist_avg.npy') == False:
            print(datetime.now(), ': Calculating Gower distance')
            gower_dist = gower.gower_matrix(test_data)
            selected_rows = gower_dist[positive_set_idx]
            gower_dist_avg = np.mean(selected_rows, axis=0)
            print(datetime.now(), ': Saving Gower distance matrix')
            np.save(input_path + '/' + file_name + 'gower_dist_avg.npy',
                    gower_dist_avg)  # save
        else:
            print(datetime.now(), ': Loading Gower distance matrix from ',
                  input_path + file_name + 'gower_dist_avg.npy')
            gower_dist_avg = np.load(input_path + file_name + 'gower_dist_avg.npy')  # load

    if spy == True:
        all_spies = pd.DataFrame()

    '''
    Create the train set by concatenating the positive and negative sets,
    build model(s) using AutoML, and store predictions based on the best model.
    '''
    for rep in range(0, replicates_cnt):
        print(datetime.now(), ': Replicate #', rep + 1)
        if spy == True:
            # Exclude spy_portion of the training data to be the spies
            positive_set = positive_set.sample(n=len(positive_set),
                                               random_state=rep * 100).reset_index(drop=True)
            spySet_size = round(len(positive_set) * spy_portion)
            spies_ID = positive_set.loc[1:spySet_size, ['SpecId']]
            positive_set_wSpy = positive_set.iloc[spySet_size + 1:len(positive_set)]

        if RN == False:
            if rnd_all == True:
                # Negative set includes all decoys
                negative_set_idx = decoys_idx
            else:
                # Negative set idx includes rnd_portion times |positive_set| indices
                random.seed(rep)
                random.shuffle(decoys_idx)
                negative_set_idx = decoys_idx[0:rnd_portion * len(positive_set)]
        else:
            print(datetime.now(), ': Starts estimating RNs')
            negative_set_idx = reliable_negative(test_data, positive_set,
                                                 subclusterCount, rep)
            print(datetime.now(), ': Ends estimating RNs')

        negative_set = test_data.iloc[negative_set_idx]
        if spy == True:
            train_data = pd.concat([positive_set_wSpy, negative_set], axis=0)
        else:
            train_data = pd.concat([positive_set, negative_set], axis=0)

        y_train = train_data['Class']
        if include_label == True:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class'], axis=1)
        else:
            X_train = train_data.drop(['SpecId', 'Peptide', 'Class', 'Label'], axis=1)
        print('Training set size:', len(y_train), '\nTest set size:', len(y_test))

        automl_config = AutoMLConfig(task='classification',
                                     debug_log=log_file,
                                     primary_metric=metric,
                                     iteration_timeout_minutes=200,
                                     iterations=autoML_iterations,
                                     verbosity=logging.INFO,
                                     preprocess=AML_preprocess,
                                     X=X_train,
                                     y=y_train,
                                     n_cross_validations=cv_fold,
                                     model_explainability=True)
        print(datetime.now(), ': modeling replicate #' + str(rep + 1) + '...')
        local_run = experiment.submit(automl_config, show_output=True)

        if autoML_best_model_selection == False:
            # Retrieve the best model based on a set of metrics
            children = list(local_run.get_children())
            metricslist = {}
            for run in children:
                properties = run.get_properties()
                metrics = {k: v for k, v in run.get_metrics().items()
                           if isinstance(v, float)}
                metricslist[int(properties['iteration'])] = metrics
            rundata = pd.DataFrame(metricslist).sort_index(axis=1)
            tmp = rundata.T.sort_values(['AUC_weighted', 'f1_score_weighted',
                                         'precision_score_weighted',
                                         'recall_score_weighted',
                                         'weighted_accuracy'],
                                        ascending=False)
            rundata = tmp.sort_values('log_loss', ascending=True).T
            best_run_iteration = rundata.columns.values[0]
            rundata.to_csv(output_path + '_metrics_list_' + str(rep) + '.txt')
            best_run, fitted_model = local_run.get_output(iteration=best_run_iteration)
        else:
            best_run, fitted_model = local_run.get_output()

        print('Best run: ', best_run)
        print(datetime.now(), ': Saving best model and predictions')
        # Save the best model, prediction value and probability
        modelname = output_path + '_model_' + str(rep) + '.sav'
        joblib.dump(fitted_model, modelname)
        y_pred_val = fitted_model.predict(X_test)
        y_pred_prob = fitted_model.predict_proba(X_test)
        # Add the results of the replicate to the all-predictions table
        all_predictions['pred_rep' + str(rep)] = list(y_pred_val)
        all_predictions['prob_rep' + str(rep)] = list(
            [item[1] for item in y_pred_prob])

        # Overwrite prediction values based on the spies cutoff
        if spy == True:
            threshold = min(pd.merge(spies_ID, all_predictions,
                                     on='SpecId')['prob_rep' + str(rep)])
            all_predictions['pred_rep' + str(rep)] = np.where(
                all_predictions['prob_rep' + str(rep)] >= threshold, 1, 0)
            all_spies['SpecId' + str(rep)] = spies_ID['SpecId']
            all_spies['Prob_rep' + str(rep)] = list(
                pd.merge(spies_ID, all_predictions,
                         on=['SpecId'])['prob_rep' + str(rep)])
        print(datetime.now(), ': Replicate #' + str(rep + 1) + ' processed!')

    all_predictions.to_csv(output_path + '_all_predictions.csv', index=False)
    if spy == True:
        all_spies.to_csv(output_path + '_all_spies.csv', index=False)

    print(datetime.now(), ': Generate prediction summary of all replicates')
    pred_col_indices = [col for col in all_predictions.columns if 'pred' in col]
    prob_col_indices = [col for col in all_predictions.columns if 'prob' in col]
    prediction_summary['Std'] = all_predictions[prob_col_indices].std(skipna=True, axis=1)
    prediction_summary['Min'] = all_predictions[prob_col_indices].min(skipna=True, axis=1)
    prediction_summary['Max'] = all_predictions[prob_col_indices].max(skipna=True, axis=1)
    prediction_summary['Avg'] = all_predictions[prob_col_indices].mean(skipna=True, axis=1)
    prediction_summary['Median'] = all_predictions[prob_col_indices].median(skipna=True, axis=1)
    prediction_summary['Vote'] = all_predictions[pred_col_indices].sum(skipna=True, axis=1)
    prediction_summary.to_csv(output_path + '_prediction_summary.txt',
                              sep='\t', index=False)

    # Feature importance
    print(datetime.now(), ': Output feature importance of the best run')
    client = ExplanationClient.from_run(best_run)
    raw_explanations = client.download_model_explanation(top_k=len(X_test.columns))
    print('Raw feature importance')
    print(raw_explanations.get_feature_importance_dict())
    d = raw_explanations.get_feature_importance_dict()
    raw_feature_importance = pd.DataFrame(list(d.items()))
    raw_feature_importance.to_csv(output_path + '_raw_feature_importance.csv',
                                  index=False)

    # Engineered feature importance
    engineered_explanations = client.download_model_explanation(top_k=len(X_test.columns))
    print('Engineered feature importance')
    print(engineered_explanations.get_feature_importance_dict())
    d = engineered_explanations.get_feature_importance_dict()
    engineered_feature_importance = pd.DataFrame(list(d.items()))
    engineered_feature_importance.to_csv(output_path + '_engineered_feature_importance.csv',
                                         index=False)

    now = datetime.now()
    print(now, ': Program end')
################################
#%%
# Step 14 - Create estimator
#############################
from azureml.train.estimator import Estimator

script_params = {
    '--data-folder': ds.as_mount(),
    '--training-set-percentage': 0.3
}

est_config = Estimator(source_directory='./training',
                       script_params=script_params,
                       compute_target=compute_target,
                       entry_script='train.py',
                       conda_packages=['scikit-learn', 'pandas'])

#%%
# Step 15 - Execute the estimator job
#####################################
run = exp.submit(config=est_config)
run

# Poll for job status
run.wait_for_completion(show_output=True)  # value of True will display a verbose, streaming log

# Examine the recorded metrics from the run
print(run.get_metrics())
automl_settings = {
    "iteration_timeout_minutes": 60,
    "iterations": 100,
    "n_cross_validations": 5,
    "primary_metric": 'AUC_weighted',
    "preprocess": True,
    "max_cores_per_iteration": 2
}

automl_config = AutoMLConfig(task='classification',
                             path=project_folder,
                             run_configuration=conda_run_config,
                             data_script=project_folder + "/get_data.py",
                             **automl_settings)

remote_run = experiment.submit(automl_config)

# Canceling runs
#
# You can cancel ongoing remote runs using the cancel() and cancel_iteration() functions
print(remote_run.id)
time.sleep(180)

# Cancel the ongoing experiment and stop scheduling new iterations
remote_run.cancel()
print('run cancelled')

# Wait for the run to complete. It should complete soon because it has been canceled.
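# A minimal sketch of the per-iteration variant mentioned above (assumes
# `remote_run` is a live AutoMLRun): cancel only iteration 1 and let the
# remaining iterations continue.
remote_run.cancel_iteration(1)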
def RunAutoMLForecast():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']
    time_column_name = request.json['time_column_name']
    max_horizon = request.json['max_horizon']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    compute_target = AmlCompute(ws, cluster_name)
    print('Found existing AML compute context.')

    dataset_name = file_name
    # Get a dataset by name
    dataset = Dataset.get_by_name(workspace=ws, name=dataset_name).with_timestamp_columns(
        fine_grain_timestamp=time_column_name)
    print(dataset)
    #df_ts = Dataset.Tabular.from_delimited_files(df_ts)
    dataset.to_pandas_dataframe().describe()
    dataset.take(3).to_pandas_dataframe()
    print(dataset)
    #y_df = df_ts[target_var].values
    #x_df = df_ts.drop([target_var], axis=1)
    print('file successfully received.')
    #stock_dataset_df.head()

    # Create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.80'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    #max_concurrent_iterations = request.json['max_concurrent_iterations']

    automl_settings = {
        'time_column_name': time_column_name,
        'max_horizon': max_horizon,
        "iterations": iterations,
    }
    automl_config = AutoMLConfig(
        task=tasks,
        primary_metric=primary_metric,
        #blacklist_models=['ExtremeRandomTrees', 'AutoArima', 'Prophet'],
        experiment_timeout_minutes=iteration_timeout_minutes,
        training_data=dataset,
        label_column_name=target_var,
        compute_target=compute_target,
        enable_early_stopping=True,
        n_cross_validations=n_cross_validations,
        #verbosity=logging.INFO,
        **automl_settings)
    print("AutoML config created.")

    experiment = Experiment(ws, ExperimentName)
    remote_run = experiment.submit(automl_config, show_output=True)

    children = list(remote_run.get_children())
    metricslist = {}
    for run in children:
        properties = run.get_properties()
        metrics = {k: v for k, v in run.get_metrics().items()
                   if isinstance(v, float)}
        metricslist[int(properties['iteration'])] = metrics

    # Order the iteration columns by the primary metric
    # (sort_index(axis=1, by=...) is not valid pandas; sort_values does this)
    rundata = pd.DataFrame(metricslist).sort_values(by=primary_metric, axis=1)
    rundata.rename(columns={0: "one", 1: "two", 2: "three", 3: "four",
                            4: "five", 5: "six", 6: "seven", 7: "eight",
                            8: "nine", 9: "ten"},
                   inplace=True)
    iterations_toJson = rundata.to_json(orient='columns')
    print(iterations_toJson)

    best_run, fitted_model = remote_run.get_output()
    #best_run_toJson = best_run.get_metrics()
    #dict = {}
    #dict['iterations_toJson'] = iterations_toJson
    #dict['best_run_toJson'] = best_run_toJson
    #print(best_run.get_file_names())

    # Register the model
    #from datetime import date
    model = remote_run.register_model(model_name=best_model,
                                      description='AutoML Model')
    print(model.name, model.id, model.version, sep='\t')
    best_model = model.name
    var1 = "@"
    var2 = var1 + best_model
    return '{} {}'.format(iterations_toJson, var2)
def RunAutoMLReg():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']
    cluster_name = request.json['cluster_name']
    best_model = request.json['best_model']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')
    #compute_target = AmlCompute(ws, cluster_name)
    compute_target = ws.compute_targets[cluster_name]
    print('Found existing AML compute context.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    #stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    #stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    X = df.drop_columns(columns=[target_var])
    y = df.keep_columns(columns=[target_var], validate=True)
    #y_df = stock_dataset_df[target_var].values
    #x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y)

    # Create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")
    conda_run_config.environment.docker.enabled = True
    conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE
    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
                                  conda_packages=['numpy', 'py-xgboost<=0.90'])
    conda_run_config.environment.python.conda_dependencies = cd
    print('run config is ready')

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "featurization": 'auto',
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations
            #"verbosity": logging.INFO
        }
        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            blacklist_models=['XGBoost'],
            #path=os.getcwd(),
            compute_target=compute_target,
            #run_configuration=conda_run_config,
            X=X,
            y=y,
            **automl_settings)
        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        remote_run.flush(timeout_seconds=400)

        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {k: v for k, v in run.get_metrics().items()
                       if isinstance(v, float)}
            metricslist[int(properties['iteration'])] = metrics

        # Order the iteration columns by the primary metric
        # (sort_index(axis=1, by=...) is not valid pandas; sort_values does this)
        rundata = pd.DataFrame(metricslist).sort_values(by=primary_metric, axis=1)
        rundata = rundata.drop([
            'mean_absolute_percentage_error',
            'normalized_median_absolute_error',
            'normalized_root_mean_squared_log_error',
            'root_mean_squared_log_error'
        ])
        rundata.rename(columns={0: "one", 1: "two", 2: "three", 3: "four",
                                4: "five", 5: "six", 6: "seven", 7: "eight",
                                8: "nine", 9: "ten"},
                       inplace=True)
        iterations_toJson = rundata.to_json(orient='columns')
        print(iterations_toJson)
        best_run, fitted_model = remote_run.get_output()
        best_run_toJson = best_run.get_metrics()
        cwd = 'D:/DCSAIAUTOML/BestModels/Azure'
        best_model_name = best_run.name
        model = remote_run.register_model(description=best_model)
        print(model.name, model.id, model.version, sep='\t')
        model_path = os.path.join(cwd, best_model, best_model_name)
        print(model_path)
        #print("Model DownLoad Complete")
        #model = Model(workspace=ws, name=model.name)
        #model.download_files(target_dir=model_path)
        #dict = {}
        #dict['iterations_toJson'] = iterations_toJson
        #dict['best_run_toJson'] = best_run_toJson
        #print(best_run.get_file_names())

        # Register the model
        #from datetime import date
        best_model_id = best_run.name
        var1 = "@"
        var2 = var1 + best_model_id
        Reg_model_name = model.name
        var4 = var1 + Reg_model_name

        best_run.flush(timeout_seconds=3600)
        best_run.download_files(output_directory=model_path)

        # importing required modules
        #import shutil
        #output_path = os.path.join(model_path, best_model_id)
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best"
        #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best\\my_azure_best"
        #shutil.make_archive(model_path, 'zip', model_path)
        #zipf = zipfile.ZipFile(best_model_id + '.zip', 'w', zipfile.ZIP_DEFLATED)
        #for root, dirs, files in os.walk(model_path):
        #    for file in files:
        #        zipf.write(os.path.join(root, file))
        #def zipdir(path, ziph):
        #    # ziph is a zipfile handle
        #    for root, dirs, files in os.walk(path):
        #        for file in files:
        #            ziph.write(os.path.join(root, file))
        #zipdir(model_path, zipf)
        #remote_run.clean_preprocessor_cache()

        print("ready to return")
        var5 = "no exception"
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1, var5)
        #return iterations_toJson
    except Exception as e:
        error_statement = str(e)
        print("Error statement: ", error_statement)
        # Note: model_path and the variables in the return below are only
        # defined if the failure happened after they were assigned; an early
        # exception here would raise a NameError.
        model_path1 = os.path.join(model_path, 'outputs')
        file_name = 'model.pkl'
        print("in exception: ", model_path1)
        src = 'D:\\Final Script_dev'
        full_file_name = os.path.join(src, file_name)
        import shutil
        #remote_run.download_file('model.pkl', output_file_path=model_path1)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, model_path1)
        return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1,
                                       error_statement)
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name
    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
    print(y_df)

    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }
        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=os.getcwd(),
            #compute_target='Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings)
        experiment = Experiment(ws, 'automl_local_v2')
        remote_run = experiment.submit(automl_config, show_output=True)

        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {k: v for k, v in run.get_metrics().items()
                       if isinstance(v, float)}
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata_toJson = rundata.to_json(orient='columns')
        return rundata_toJson
    except Exception:
        return 'error'
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')

    # interactive_auth = InteractiveLoginAuthentication(tenant_id="b88f1ff4-e3ab-4adb-83e6-4ea99d41c665")
    sp = ServicePrincipalAuthentication(
        tenant_id='b88f1ff4-e3ab-4adb-83e6-4ea99d41c665',
        service_principal_id='2e90efa1-d53f-45d4-96d8-7adde8a02cdc',
        service_principal_password='******')

    query = req.params.get('query')
    if not query:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            query = req_body.get('query')

    if query == 'run':
        try:
            ws = Workspace.get(name="vrd-ml",
                               subscription_id="b9301f45-7da5-41f6-9125-1331de94f262",
                               resource_group="vrd-dev-asia",
                               auth=sp)

            compute_name = 'automl-compute'
            if compute_name in ws.compute_targets:
                compute_target = ws.compute_targets[compute_name]
                if compute_target and type(compute_target) is AmlCompute:
                    print('found compute target. just use it. ' + compute_name)
            else:
                print('creating a new compute target...')
                provisioning_config = AmlCompute.provisioning_configuration(
                    vm_size='STANDARD_D2_V2', min_nodes=0, max_nodes=4)
                compute_target = ComputeTarget.create(ws, compute_name,
                                                      provisioning_config)
                compute_target.wait_for_completion(show_output=True,
                                                   min_node_count=None,
                                                   timeout_in_minutes=20)

            dataset = Dataset.get_by_name(ws, name='datasetfunc')
            train_data, test_data = dataset.random_split(percentage=0.8, seed=223)
            label = "ERP"

            automl_config = AutoMLConfig(task='regression',
                                         compute_target=compute_name,
                                         training_data=train_data,
                                         label_column_name=label,
                                         validation_data=test_data,
                                         # n_cross_validations=3,
                                         primary_metric='r2_score',
                                         enable_early_stopping=True,
                                         experiment_timeout_hours=0.3,
                                         max_concurrent_iterations=4,
                                         max_cores_per_iteration=-1,
                                         verbosity=logging.INFO)

            experiment_name = 'expfunc'
            experiment = Experiment(workspace=ws, name=experiment_name)
            run = experiment.submit(automl_config, show_output=True)
            run.wait_for_completion()
        except ValueError:
            pass
        return func.HttpResponse("AutoML Run Completed")
    else:
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.",
            status_code=200)
automl_config = AutoMLConfig(
    max_concurrent_iterations=9,
    max_cores_per_iteration=-1,
    forecasting_parameters=forecasting_parameters,
)

# COMMAND ----------

# DBTITLE 1,Train
# Submit a new training run
from azureml.train.automl.run import AutoMLRun

try:
    if new_training == "True":
        print("New Training Run")
        remote_run = experiment.submit(
            automl_config, show_output=False)  # Story No. 3018 modified Mukesh Dutta 9/3/2021
    else:
        # If you need to retrieve a run that already started, use the following code
        print("Existing Training Run")
        remote_run = AutoMLRun(experiment=experiment, run_id=runid)
except Exception as error:
    print(error)
    log_error("{} {}".format(notebook, error))  # log error in sentry
    # raise dbutils.notebook.exit(error)  # raise the exception
    raise error  # raise the exception

remote_run

# COMMAND ----------
cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
cpu_cluster.wait_for_completion(show_output=True)

aml_run_config = RunConfiguration()
aml_run_config.target = cpu_cluster_name

# AmlCompute is created in the same region as your workspace.
# Set the VM size for AmlCompute from the list of supported_vmsizes.
aml_run_config.amlcompute.vm_size = 'STANDARD_D2_V2'
aml_run_config.amlcompute._cluster_max_node_count = 2

# Specify the CondaDependencies object, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies(
    "./../localscripts/turbofan.yml")
#CondaDependencies.create(conda_packages=['scikit-learn'])

############# Experiment remote-gbr-turbofan ######################
experiment_name = 'gbr-turbofan'
exp = Experiment(workspace=ws, name=experiment_name)

src = ScriptRunConfig(source_directory='./',
                      script='01-train.py',
                      run_config=aml_run_config)
run = exp.submit(src, tags={"python version": sys.version[0:6]})
run.wait_for_completion(show_output=True)
# Automated machine learning trains multiple machine learning pipelines. Each pipeline's training is known as an iteration.
# * You can specify a maximum number of iterations using the `iterations` parameter.
# * You can specify a maximum time for the run using the `experiment_timeout_minutes` parameter.
# * If you specify neither `iterations` nor `experiment_timeout_minutes`, automated ML keeps running iterations while it continues to see improvements in the scores.
#
# The following example doesn't specify `iterations` or `experiment_timeout_minutes` and so runs until the scores stop improving.

# In[43]:


## run remote train
from azureml.core.experiment import Experiment

experiment = Experiment(ws, 'automl_remote')
remote_run = experiment.submit(automl_config, show_output=True)


# In[4]:


# Configure the experiment to run on the local machine
automl_config = AutoMLConfig(task='classification',
                             primary_metric='AUC_weighted',
                             X=X_train,
                             y=y_train,
                             n_cross_validations=3)


# Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.
# In this example, we specify `show_output=True` to print currently running iterations to the console, as sketched in the cell below.

# In[5]:
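# A minimal sketch of the local submit described above (hypothetical cell body;
# assumes `ws` and the `automl_config` defined in the previous cell).
local_experiment = Experiment(ws, 'automl_local')  # hypothetical experiment name
local_run = local_experiment.submit(automl_config, show_output=True)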
import azureml.core
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.train.dnn import PyTorch

subscription_id = ""                                 # The ID of the Azure Subscription
resource_group = "AdvanceAnalytics.Aml.Experiments"  # Name of a logical resource group
workspace_name = "aa-ml-aml-workspace"               # The name of the workspace to look for or to create
workspace_region = 'eastus'                          # Location of the workspace
computetarget_vm = 'Standard_NC6'                    # Size of the VM to use
experiment_name = 'azureml-gpubenchmark'
train_script = 'train_and_track.py'

ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

src = PyTorch(source_directory=r'.\fastai',
              compute_target='amlcompute',
              vm_size=computetarget_vm,
              entry_script=train_script,
              use_gpu=True,
              pip_packages=['fastai', 'azureml-sdk'])

experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(src)
run.wait_for_completion(show_output=True)
automl_config = AutoMLConfig(
    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
        "ExtremeRandomTrees",
        "LightGBM",
    ],
    blacklist_models=["ensemble"],
    X=x,
    y=y,
    path=project_folder,
)

experiment = Experiment(ws, "host-ml-nt-ai-meetup")
db_run = experiment.submit(automl_config, show_output=True)

# Pick the best non-ensemble child run by score
sub_runs = list(db_run.get_children())
best_run = None
best_score = 0
for sub_run in sub_runs:
    props = sub_run.get_properties()
    if props["run_algorithm"] != "Ensemble":
        if float(props["score"]) > best_score:
            best_run = sub_run
            best_score = float(props["score"])  # track the best score seen so far

model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
best_run.register_model(model_name=model_name, model_path="outputs/model.pkl")
# best_run, fitted_model = local_run.get_output()
automl_config = AutoMLConfig(
    #spark_context=sc,
    training_data=training_data,
    label_column_name=label,
    **automl_settings,
    featurization='auto',
    experiment_exit_score=.98)

# COMMAND ----------

# MAGIC %md Submit the experiment to the Automated ML service. This step can take longer depending on the settings. AutoML will give us updates as models are trained and evaluated by the metric we specified above. The information from each ML model training will be stored in the Experiment section of the Azure ML Workspace in the Azure Portal.

# COMMAND ----------

# DBTITLE 1,Submit run to your Databricks cluster
local_run = experiment.submit(
    automl_config, show_output=True
)  # for longer runs, please use show_output=False and use the monitoring link below

# COMMAND ----------

# DBTITLE 1,Monitor progress in the portal
displayHTML(
    "<a href={} target='_blank'>Your experiment in Azure Portal: {}</a>".
    format(local_run.get_portal_url(), local_run.id))

# COMMAND ----------

# MAGIC %md **Run After AutoML Experiment is Complete**

# COMMAND ----------
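# DBTITLE 1,Retrieve the best run (sketch)
# A minimal sketch, assuming `local_run` has finished: fetch the best child run
# and its fitted model for later registration or scoring.
best_run, fitted_model = local_run.get_output()
print(best_run.id)

# COMMAND ----------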