def init(): global g_tf_sess, probabilities, label_dict, input_images parser = argparse.ArgumentParser(description="Start a tensorflow model serving") parser.add_argument('--model_name', dest="model_name", required=True) parser.add_argument('--labels_name', dest="labels_name", required=True) args, _ = parser.parse_known_args() workspace = Run.get_context(allow_offline=False).experiment.workspace label_ds = Dataset.get_by_name(workspace=workspace, name=args.labels_name) label_ds.download(target_path='.', overwrite=True) label_dict = get_class_label_dict() classes_num = len(label_dict) with slim.arg_scope(inception_v3.inception_v3_arg_scope()): input_images = tf.placeholder(tf.float32, [1, image_size, image_size, num_channel]) logits, _ = inception_v3.inception_v3(input_images, num_classes=classes_num, is_training=False) probabilities = tf.argmax(logits, 1) config = tf.ConfigProto() config.gpu_options.allow_growth = True g_tf_sess = tf.Session(config=config) g_tf_sess.run(tf.global_variables_initializer()) g_tf_sess.run(tf.local_variables_initializer()) model_path = Model.get_model_path(args.model_name) saver = tf.train.Saver() saver.restore(g_tf_sess, model_path)
def download_file(self, dataset_name, dataset_type='pandas', *args, **kwargs): """ Downloads a file from file storage :param dataset_name: name of the dataset :param dataset_type: name of the dataset type :param args: other arguments containing additional information :param kwargs: other keyword arguments containing additional information """ try: run = Run.get_context(allow_offline=False) except RunEnvironmentException as e: raise SkipServiceException('Skip AmlModelStorageHandler handler') ws = run.experiment.workspace ds = Dataset.get_by_name(workspace=ws, name=dataset_name) # Get a Dataset by name if dataset_type == 'spark': df = ds.to_spark_dataframe( ) # Load a Tabular Dataset into a Spark DataFrame else: df = ds.to_pandas_dataframe( ) # Load a Tabular Dataset into a pandas DataFrame return df
def main(): parser = argparse.ArgumentParser() #params = {"objective": "binary:logistic", "max_depth": 3} run = Run.get_context() parser.add_argument('--num_boost_round', type=int, default=5, help="Number of boosting rounds") parser.add_argument('--max_depth', type=int, default=3, help="Maximum depth of the trees to be boosted") parser.add_argument('--learning_rate', type=float, default=0.001, help="Learning rate, xgb's eta") parser.add_argument('--gamma', type=float, default=0.1, help="Minimum loss reduction") parser.add_argument('--reg_lambda', type=float, default=0.1, help="L2 regularization term on weights") parser.add_argument('--scale_pos_weight', type=float, default=1.0, help="Balancing of positive and negative weights") args = parser.parse_args() workspace = run.experiment.workspace key = "heart-failure" #"Heart failure" description_text = "Heart failure dataset for udacity capstone" if key in workspace.datasets.keys(): found = True dataset = Dataset.get_by_name(workspace, name='heart-failure') df = dataset.to_pandas_dataframe() X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:, "DEATH_EVENT"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123) params = {"scale_pos_weight": np.float(args.scale_pos_weight), "reg_lambda": np.float(args.reg_lambda), "gamma": np.float(args.gamma), "learning_rate": np.float(args.learning_rate), "max_depth": np.int(args.max_depth), "num_boost_round": np.int(args.num_boost_round)} run.log("scale_pos_weight:", np.float(args.scale_pos_weight)) run.log("Max depth:", np.int(args.max_depth)) run.log("Learning rate:", np.float(args.learning_rate)) run.log("Boosting rounds:", np.int(args.num_boost_round)) run.log("Gamma (minimum loss reduction):", np.float(args.gamma)) run.log("Lambda (L2 regularization):", np.float(args.reg_lambda)) xgb_model = xgb.XGBClassifier(objective="binary:logistic", n_estimators = np.int(args.num_boost_round), max_depth = np.int(args.max_depth), learning_rate=np.float(args.learning_rate), gamma= np.float(args.gamma), reg_lambda=np.float(args.reg_lambda), scale_pos_weight=np.float(args.scale_pos_weight), random_state=123) xgb_model.fit(X_train, y_train) y_pred = xgb_model.predict(X_test) fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1) xgb_auc = auc(fpr, tpr) run.log("AUC", np.float(xgb_auc)) os.makedirs('outputs', exist_ok=True) joblib.dump(value=xgb_model, filename='outputs/model.pkl')
def unregister_dataset(workspace=None, dataset_name=None, logger=None): try: dataset = Dataset.get_by_name(workspace, dataset_name) except: print('There is no dataset registered with name "{}".'.format( dataset_name)) return dataset.unregister_all_versions() print('Successfully unregistered datasets with name "{}".'.format( dataset_name))
def RunAutoML(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] #location = request.json['location'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) stock_dataset_df = df.to_pandas_dataframe() print('file successfully received.') stock_dataset_df.head() #stock_dataset_json = stock_dataset_df.to_json(orient='split') #print(stock_dataset_json) y_df = stock_dataset_df['ActionTaken'].values x_df = stock_dataset_df.drop(['ActionTaken'], axis=1) ExperimentName = request.json['ExperimentName'] tasks = request.json['tasks'] iterations = request.json['iterations'] iteration_timeout_minutes = request.json['iteration_timeout_minutes'] primary_metric = request.json['primary_metric'] #n_cross_validations = request.json['n_cross_validations'] try: automl_config = AutoMLConfig( task=tasks, X=x_df, y=y_df, iterations=iterations, iteration_timeout_minutes=iteration_timeout_minutes, primary_metric=primary_metric, #n_cross_validations=n_cross_validations, preprocess=True, ) experiment = Experiment(ws, ExperimentName) run = experiment.submit(config=automl_config, show_output=True) best_model, fitted_model = run.get_output() return 'ok' except: return 'error'
def get_df_from_dataset(dataset_path, dataset_name, dataset_is_remote=False): """ Return a DataFrame by reading the dataset from either a directory containing csv files or Azure Dataset """ if dataset_is_remote: workspace = package_utils.get_workspace() df = Dataset.get_by_name(workspace=workspace, name=dataset_name).to_pandas_dataframe() else: df = get_df_from_directory(pathlib.Path(dataset_path, dataset_name)) return df
def main(): parser = argparse.ArgumentParser() parser.add_argument('--n_estimators', type=int, default=50, help="Number of trees") parser.add_argument('--max_depth', type=int, default=3, help="Maximum depth of the trees to be used") parser.add_argument('--min_samples_split', type=int, default=2, help="Minimum number of samples required to split a node") run = Run.get_context() workspace = run.experiment.workspace key = "heart-failure" #"Heart failure" description_text = "Heart failure dataset for udacity capstone" if key in workspace.datasets.keys(): found = True dataset = Dataset.get_by_name(workspace, name='heart-failure') df = dataset.to_pandas_dataframe() X, y = df.loc[:, ~df.columns.isin(["DEATH_EVENT"])], df.loc[:, "DEATH_EVENT"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123) args = parser.parse_args() run.log("n_estimators:", np.int(args.n_estimators)) run.log("max_depth:", np.int(args.max_depth)) run.log("min_samples_split:", np.int(args.min_samples_split)) rf_model = RandomForestClassifier(n_estimators=np.int(args.n_estimators), max_depth=np.int(args.max_depth), min_samples_split=np.int( args.min_samples_split), random_state=123) rf_model.fit(X_train, y_train) y_pred = rf_model.predict(X_test) fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1) rf_auc = auc(fpr, tpr) run.log("AUC", np.float(rf_auc)) os.makedirs('outputs', exist_ok=True) joblib.dump(value=rf_model, filename='outputs/model.pkl')
def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument( '--C', type=float, default=1.0, help= "Inverse of regularization strength. Smaller values cause stronger regularization" ) parser.add_argument('--max_iter', type=int, default=100, help="Maximum number of iterations to converge") args = parser.parse_args() run = Run.get_context() workspace = run.experiment.workspace run.log("Regularization Strength:", np.float(args.C)) run.log("Max iterations:", np.int(args.max_iter)) #The dataset is registered using Python SDK in the notebook dataset_name = 'Framingham-Prepared' # Get a dataset by name ds = Dataset.get_by_name(workspace=workspace, name=dataset_name) x, y = clean_data(ds) # TODO: Split data into train and test sets. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=223) model = LogisticRegression(C=args.C, max_iter=args.max_iter).fit(x_train, y_train) accuracy = model.score(x_test, y_test) run.log("Accuracy", np.float(accuracy)) #save the best model os.makedirs('outputs', exist_ok=True) joblib.dump(value=model, filename='outputs/model.joblib')
def main(): parser = argparse.ArgumentParser() parser.add_argument('--kernel', type=str, default='linear', help='Kernel type to be used in the algorithm') parser.add_argument('--penalty', type=float, default=1.0, help='Penalty parameter of the error term') args = parser.parse_args() run = Run.get_context() ws = run.experiment.workspace run.log('Kernel type', np.str(args.kernel)) run.log('Penalty', np.float(args.penalty)) heart_dataset = Dataset.get_by_name(workspace=ws, name='Heart-Failure') df = heart_dataset.to_pandas_dataframe() y = df[df.columns[-1]] X = df.drop(df.columns[-1],axis=1) scaler = StandardScaler() scaled_X = scaler.fit_transform(X) # dividing X, y into train and test data X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, random_state=0) # training a linear SVM classifier from sklearn.svm import SVC svm_model_linear = SVC(kernel=args.kernel, C=args.penalty).fit(X_train, y_train) svm_predictions = svm_model_linear.predict(X_test) # model accuracy for X_test accuracy = accuracy_score(y_test, svm_predictions) #print('Accuracy of SVM classifier on test set: {:.2f}'.format(accuracy)) run.log('Accuracy', np.float(accuracy)) os.makedirs('outputs', exist_ok=True) # files saved in the "outputs" folder are automatically uploaded into run history joblib.dump(svm_model_linear, 'outputs/model.joblib')
def DataBlob(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] #location = request.json['location'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') ds = ws.get_default_datastore() print(ds.datastore_type, ds.account_name, ds.container_name) try: stock_ds = Dataset.Tabular.from_delimited_files( path=ds.path(file_name)) stock_ds = stock_ds.register(workspace=ws, name=file_name, description='stock training data') print('Found existing file name') return "This file name exists. Please rename or upload a new file" except: print('Uploading new file, please wait') stock_dataset = Dataset.Tabular.from_delimited_files( path=ds.path(file_name)) stock_dataset = stock_dataset.register(workspace=ws, name=file_name, description='stock training data') #file_name = json.loads(file_name) print(type(file_name)) new_data = Dataset.get_by_name(ws, file_name, version='latest') print(new_data.name) print(type(new_data.name)) stock_dataset_df = new_data.to_pandas_dataframe() print('file successfully received.') stock_dataset_json = stock_dataset_df.to_json(orient='split') return stock_dataset_json
def get_dataset(workspace=None, dataset_name=None, dataset_version=None, dataset_id=None, logger=None): if dataset_name is None and dataset_id is None: raise UserErrorException('Argument {} or {} must be specified'.format( DATASET_NAME.long_form, DATASET_ID.long_form)) if dataset_name is not None and dataset_id is not None: raise UserErrorException( 'Arguments {} and {} cannot be specified at the same time'.format( DATASET_NAME.long_form, DATASET_ID.long_form)) if dataset_version != DATASET_VERSION.default and dataset_name is None: raise UserErrorException( 'Argument {} must be specified with {}'.format( DATASET_VERSION.long_form, DATASET_NAME.long_form)) dataset_version = dataset_version or DATASET_VERSION.default if dataset_name is not None: dataset = Dataset.get_by_name(workspace, dataset_name, dataset_version) else: dataset = Dataset.get_by_id(workspace, dataset_id) return _dataset_to_printable(dataset)
def main(): print(azureml.core.VERSION) dataset_name = getRuntimeArgs() run = Run.get_context() ws = run.experiment.workspace ds = Dataset.get_by_name(workspace=ws, name=dataset_name) automl_settings = { "task": 'classification', "verbosity": logging.INFO, "primary_metric": 'accuracy', "experiment_timeout_hours": 0.05, "n_cross_validations": 3, "enable_stack_ensemble": False, "enable_voting_ensemble": False, "model_explainability": True, "preprocess": True, "max_cores_per_iteration": -1, "max_concurrent_iterations": 4, "training_data": ds, "drop_column_names": ['Sno'], "label_column_name": 'Risk' } automl_config = AutoMLConfig(**automl_settings) run = run.submit_child(automl_config, show_output=True) best_run, fitted_model = run.get_output() output_dir = './outputs/' os.makedirs(output_dir, exist_ok=True) shutil.copy2('automl.log', output_dir) with open(output_dir + 'best_run.json', 'w') as f: json.dump(best_run.get_details(), f, default=str)
def _setup_dataset(self, ds_name, data_paths): """ registers datasets with azureml workspace :param str ds_name: [required] name to give the dataset in azureml. :param str data_paths: [required] list of paths to your data on the datastore. """ self.named_ds = [] count = 1 for data_path in data_paths: curr_name = ds_name + str(count) path_on_datastore = self.blob_ds.path(data_path) input_ds = Dataset.File.from_files(path=path_on_datastore, validate=False) try: registered_ds = input_ds.register(workspace=self.ws, name=curr_name, create_new_version=True) except Exception as e: n, v = self._parse_exception(e) registered_ds = Dataset.get_by_name(self.ws, name=n, version=v) self.named_ds.append(registered_ds.as_named_input(curr_name)) count = count + 1
ds.upload(src_dir=file_path, target_path=None, overwrite=True, show_progress=True) stock_ds = Dataset.Tabular.from_delimited_files(path=datastore.path(file_name)) stock_ds = stock_ds.register(workspace=ws, name=file_name, description='Introact Owner Data') compute_target = AmlCompute(ws, cluster_name) print('Found existing AML compute context.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) X = df.drop_columns(columns=[target_var]) y = df.keep_columns(columns=[target_var], validate=True) print(y) #y = diabetes.pop('Y') #X_train, X_test, y_train, y_test = train_test_split(diabetes, y, test_size=0.2, random_state=0) #data = {"train": {"X": X_train, "y": y_train}, "test": {"X": X_test, "y": y_test}} conda_run_config = RunConfiguration(framework="python") conda_run_config.environment.docker.enabled = True conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy', 'py-xgboost<=0.80']) conda_run_config.environment.python.conda_dependencies = cd print('run config is ready')
def main(req: func.HttpRequest) -> func.HttpResponse: logging.info('Python HTTP trigger function processed a request.') # interactive_auth = InteractiveLoginAuthentication(tenant_id="b88f1ff4-e3ab-4adb-83e6-4ea99d41c665") sp = ServicePrincipalAuthentication(tenant_id='b88f1ff4-e3ab-4adb-83e6-4ea99d41c665', service_principal_id='2e90efa1-d53f-45d4-96d8-7adde8a02cdc', service_principal_password='******' ) query = req.params.get('query') if not query: try: req_body = req.get_json() except ValueError: pass else: query = req_body.get('query') if query == 'run': try: ws = Workspace.get(name="vrd-ml", subscription_id="b9301f45-7da5-41f6-9125-1331de94f262", resource_group="vrd-dev-asia", auth=sp ) compute_name = 'automl-compute' if compute_name in ws.compute_targets: compute_target = ws.compute_targets[compute_name] if compute_target and type(compute_target) is AmlCompute: print('found compute target. just use it. ' + compute_name) else: print('creating a new compute target...') provisioning_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_D2_V2', min_nodes = 0, max_nodes = 4) compute_target = ComputeTarget.create(ws, compute_name, provisioning_config) compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20) dataset = Dataset.get_by_name(ws, name='datasetfunc') train_data, test_data = dataset.random_split(percentage=0.8, seed=223) label = "ERP" automl_config = AutoMLConfig(task = 'regression', compute_target = compute_name, training_data = train_data, label_column_name = label, validation_data = test_data, # n_cross_validations= 3, primary_metric= 'r2_score', enable_early_stopping= True, experiment_timeout_hours= 0.3, max_concurrent_iterations= 4, max_cores_per_iteration= -1, verbosity= logging.INFO ) experiment_name = 'expfunc' experiment = Experiment(workspace = ws, name = experiment_name) run = experiment.submit(automl_config, show_output = True) run run.wait_for_completion() except ValueError: pass return func.HttpResponse("AutoML Run Completed") else: return func.HttpResponse( "This HTTP triggered function executed successfully. Pass a name in the query string or in the request body for a personalized response.", status_code=200 )
return sum(edge_lengths) else: raise ValueError('Path not found') GRAPH_FILE_PATH = "https://grab5033896937.blob.core.windows.net/azureml/Dataset/grab/singapore.graphml" try: # load workspace configuration from the config.json file in the current folder. #ws = Workspace.from_config() ws = Workspace.get(name="<<Insert Name>>", subscription_id="<<Insert Subscription Id>>", resource_group="<<Insert Resource Group>>") dataset = Dataset.get_by_name(ws, 'sg_graphml') # list the files referenced by sg_graphml dataset GRAPH_FILE_PATH = dataset.to_path() G = ox.load_graphml(GRAPH_FILE_PATH) except: G = ox.graph_from_place('Singapore', network_type='drive') ox.save_graphml(G, filepath=GRAPH_FILE_PATH) def init(): global model # Get the path where the deployed model can be found. model_path = Model.get_model_path('grab-model-reg') model = joblib.load(model_path)
def main(): e = Env() # Get Azure machine learning workspace aml_workspace = Workspace.get(name=e.workspace_name, subscription_id=e.subscription_id, resource_group=e.resource_group) print(f"get_workspace: {aml_workspace}") # Get Azure machine learning cluster aml_compute = get_compute(aml_workspace, e.compute_name, e.vm_size) if aml_compute is not None: print(f"aml_compute: {aml_compute}") # Prepare the dataset input data_store = aml_workspace.get_default_datastore() print("data_store: %s" % data_store.name) train_ds_name = e.dataset_name train_data_path = e.datafile_path sources_directory_train = e.sources_directory_train pipeline_name = e.pipeline_name build_id = e.build_id # Register the train dataset if (train_ds_name not in aml_workspace.datasets): train_path_on_datastore = train_data_path # +'/*.csv' train_ds_data_path = [(data_store, train_path_on_datastore)] train_ds = Dataset.File.from_files(path=train_ds_data_path, validate=False) train_ds = train_ds.register(workspace=aml_workspace, name=train_ds_name, description='train data', tags={'format': 'CSV'}, create_new_version=True) else: train_ds = Dataset.get_by_name(aml_workspace, train_ds_name) train_input = train_ds.as_named_input('train_input') # Conda environment environment = Environment.from_conda_specification( "myenv", os.path.join(sources_directory_train, "conda_dependencies.yml")) # Logging into Azure Application Insights env = { "APPLICATIONINSIGHTS_CONNECTION_STRING": e.applicationinsights_connection_string } env['AZUREML_FLUSH_INGEST_WAIT'] = '' env['DISABLE_ENV_MISMATCH'] = True environment.environment_variables = env from ff.util.helper import build_parallel_run_config # PLEASE MODIFY the following three settings based on your compute and # experiment timeout. process_count_per_node = 6 node_count = 3 # this timeout(in seconds) is inline with AutoML experiment timeout or (no # of iterations * iteration timeout) run_invocation_timeout = 3700 parallel_run_config = build_parallel_run_config(sources_directory_train, environment, aml_compute, node_count, process_count_per_node, run_invocation_timeout) from azureml.pipeline.core import PipelineData output_dir = PipelineData(name="training_output", datastore=data_store) #from azureml.contrib.pipeline.steps import ParallelRunStep from azureml.pipeline.steps import ParallelRunStep parallel_run_step = ParallelRunStep( name="many-models-training", parallel_run_config=parallel_run_config, allow_reuse=False, inputs=[train_input], output=output_dir # models=[], # arguments=[] ) pipeline = Pipeline(workspace=aml_workspace, steps=parallel_run_step) pipeline._set_experiment_name pipeline.validate() published_pipeline = pipeline.publish(name=pipeline_name, description="FF AutomML pipeline", version=build_id) print(f'Published pipeline: {published_pipeline.name}') print(f'for build {published_pipeline.version}')
def main(): run = Run.get_context() ws = run.experiment.workspace # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument("--learning_rate", type=float, default=0.3, help="Boosting learning rate (xgb's 'eta')") parser.add_argument("--n_estimators", type=int, default=100, help="Number of boosting rounds") parser.add_argument("--max_depth", type=int, default=6, help="Maximum tree depth for base learners") parser.add_argument( "--min_child_weight", type=int, default=1, help="Minimum sum of instance weight(hessian) needed in a child") parser.add_argument( "--gamma", type=float, default=0, help= "Minimum loss reduction required to make a further partition on a leaf node of the tree" ) parser.add_argument("--subsample", type=float, default=1.0, help="Subsample ratio of the training instance") parser.add_argument( "--colsample_bytree", type=float, default=1.0, help="Subsample ratio of columns when constructing each tree") parser.add_argument("--reg_lambda", type=float, default=1.0, help="L2 regularization term on weights") parser.add_argument("--reg_alpha", type=float, default=0, help="L1 regularization term on weights") args = parser.parse_args() run.log("learning_rate:", np.float(args.learning_rate)) run.log("n_estimators:", int(args.n_estimators)) run.log("max_depth:", int(args.max_depth)) run.log("min_child_weight:", int(args.min_child_weight)) run.log("gamma:", np.float(args.gamma)) run.log("subsample:", np.float(args.subsample)) run.log("colsample_bytree:", np.float(args.colsample_bytree)) run.log("reg_lambda:", np.float(args.reg_lambda)) run.log("reg_alpha:", np.float(args.reg_alpha)) dataset = Dataset.get_by_name(ws, name="attrition_train") df = dataset.to_pandas_dataframe() X_train, X_val, y_train, y_val = data_prep(df) clf = XGBClassifier(learning_rate=args.learning_rate, n_estimators=args.n_estimators, max_depth=args.max_depth, min_child_weight=args.min_child_weight, gamma=args.gamma, subsample=args.subsample, colsample_bytree=args.colsample_bytree, reg_lambda=args.reg_lambda, reg_alpha=args.reg_alpha) clf.fit(X_train, y_train) accuracy = np.round(clf.score(X_val, y_val), 3) run.log("accuracy", np.float(accuracy)) auc_weighted = np.round( roc_auc_score(y_val, clf.predict(X_val), average='weighted'), 3) run.log("AUC_weighted", np.float(auc_weighted)) os.makedirs("outputs", exist_ok=True) # files saved in the "outputs" folder are automatically uploaded into run history joblib.dump(clf, "outputs/hyperdrive_model.pkl")
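The script above is written to be driven by a hyperparameter sweep: it logs each sampled value and saves outputs/hyperdrive_model.pkl. The following is a minimal sketch of how such a sweep could submit it; the compute target name, environment name, experiment name, and search ranges are illustrative assumptions, not part of the original script.

# Hedged sketch: driving the training script above with a HyperDrive sweep.
# 'cpu-cluster', 'attrition-env', 'attrition-hyperdrive', and the ranges are assumptions.
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.train.hyperdrive import (HyperDriveConfig, PrimaryMetricGoal,
                                      RandomParameterSampling, choice, uniform)

ws = Workspace.from_config()
src = ScriptRunConfig(source_directory=".",
                      script="train.py",
                      compute_target="cpu-cluster",
                      environment=Environment.get(ws, "attrition-env"))

# Sample a subset of the arguments the script exposes
sampling = RandomParameterSampling({
    "--learning_rate": uniform(0.01, 0.3),
    "--max_depth": choice(3, 4, 5, 6),
    "--n_estimators": choice(50, 100, 200),
})

hd_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=sampling,
                             primary_metric_name="AUC_weighted",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=20,
                             max_concurrent_runs=4)

run = Experiment(ws, "attrition-hyperdrive").submit(hd_config)
run.wait_for_completion(show_output=True)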
else: print( 'Current model does NOT perform better and thus will NOT be deployed!' ) eval_info = {} eval_info["model_acc"] = latest_model_accuracy eval_info["deployed_model_acc"] = current_model_accuracy eval_info["deploy_model"] = deploy_model eval_info['train_run_id'] = latest_model_run_id eval_info['eval_run_id'] = run.id if deploy_model: os.chdir(args.input) surge_ds_name = 'Surge Dataset' surge_ds = Dataset.get_by_name(workspace=ws, name=surge_ds_name) model_description = 'Machine learning model to classify the surge price category' # Create model datasheet from datetime import datetime from pytz import timezone etz = 'US/Eastern' time_stamp = datetime.now(timezone(etz)) time_stamp_str = time_stamp.strftime('%A %m/%d/%Y %I:%M:%S%p') model_tags = {} model_tags['title'] = 'Surge category classifier' model_tags[ 'datasheet_description'] = 'Data sheet last updated: ' + time_stamp_str model_tags[
def RunAutoML(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] location = request.json['location'] target_var = request.json['target_var'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) stock_dataset_df = df.to_pandas_dataframe() print('file successfully received.') stock_dataset_df.head() #stock_dataset_json = stock_dataset_df.to_json(orient='split') #print(stock_dataset_json) y_df = stock_dataset_df[target_var].values x_df = stock_dataset_df.drop([target_var], axis=1) print(y_df) ExperimentName = request.json['ExperimentName'] tasks = request.json['tasks'] iterations = request.json['iterations'] n_cross_validations = request.json['n_cross_validations'] iteration_timeout_minutes = request.json['iteration_timeout_minutes'] primary_metric = request.json['primary_metric'] max_concurrent_iterations = request.json['max_concurrent_iterations'] best_model = request.json['best_model'] #n_cross_validations = request.json['n_cross_validations'] try: automl_settings = { "name": ExperimentName, "iteration_timeout_minutes": iteration_timeout_minutes, "iterations": iterations, "n_cross_validations": n_cross_validations, "primary_metric": primary_metric, "preprocess": True, "max_concurrent_iterations": max_concurrent_iterations, "verbosity": logging.INFO } automl_config = AutoMLConfig( task=tasks, debug_log='automl_errors.log', path= 'D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log', #compute_target = 'Automlvm', X=x_df, y=y_df, **automl_settings, ) experiment = Experiment(ws, ExperimentName) remote_run = experiment.submit(automl_config, show_output=True) best_run, fitted_model = remote_run.get_output() #print(best_run) print(best_run.get_file_names()) #Register the model from datetime import date model = best_run.register_model(model_name=best_model + str(date.today()), model_path='outputs/model.pkl') print(model.name, model.id, model.version, sep='\t') children = list(remote_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(1) rundata.rename(columns={ 0: "one", 1: "two", 2: "three", 3: "four", 4: "five", 5: "six", 6: "seven", 7: "eight", 8: "nine", 9: "ten", }, inplace=True) rundata_toJson = rundata.to_json(orient='columns') print(rundata_toJson) return rundata_toJson except: return 'error'
from azureml.core.experiment import Experiment from azureml.core import Run experiment = Experiment(ws, 'Myexp2_v1_test21') best_run = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8') fitted_model = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8') #print(best_run.register_model() print(fitted_model) # Get a dataset by name from azureml.core.dataset import Dataset file_name = '2018Q4PredictionTrainedSet101.csv' stock_dataset = Dataset.get_by_name(ws, '2018Q4PredictionTrainedSet101.csv') #stock_dataset #dataset = Dataset.Tabular.from_delimited_files(stock_dataset) stock_dataset.to_pandas_dataframe().describe() stock_dataset.take(3).to_pandas_dataframe() X = stock_dataset.drop_columns(columns=['ActionTaken']) y = stock_dataset.keep_columns(columns=['ActionTaken'], validate=True) print(y) #print('X and y are ready!') stock_dataset_df = stock_dataset.to_pandas_dataframe() y_df = stock_dataset_df['ActionTaken'].values x_df = stock_dataset_df.drop(['ActionTaken'], axis=1) y_predict = fitted_model.predict(x_df) print(y_predict)
def main( workspace=None, dataset_trainandvalidate_name=config.get_default_dataset_name( "trainandvalidate"), ): """ Return AutoMLConfig """ if not workspace: workspace = package_utils.get_workspace() args = aml_compute.parse_args() cluster_max_nodes = 5 args.cluster_max_nodes = cluster_max_nodes args.cluster_sku = "Standard_D12_v2" compute_target = aml_compute.main(args) logger.info(msg="main", extra={"compute_target": compute_target.serialize()}) trainandvalidate = Dataset.get_by_name( workspace=workspace, name=dataset_trainandvalidate_name, ) model_settings = { "task": "classification", "primary_metric": "norm_macro_recall", } ensemble_settings = { "iterations": 15, "allowed_models": ["LightGBM", "LogisticRegression", "SGD", "XGBoostClassifier"], "enable_voting_ensemble": True, "enable_stack_ensemble": False, } dataset_settings = { "validation_size": 0.3, "featurization": "auto", "training_data": trainandvalidate, "label_column_name": "Label", } compute_settings = { "compute_target": compute_target, "max_cores_per_iteration": -1, "max_concurrent_iterations": cluster_max_nodes, "experiment_timeout_hours": 1.5, } automl_config = AutoMLConfig( **model_settings, **ensemble_settings, **dataset_settings, **compute_settings, ) return automl_config
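A minimal usage sketch for the config builder above, assuming it lives in a module that can be imported alongside package_utils (which the snippet already uses); the experiment name is an assumption.

# Hedged usage sketch: submit the AutoMLConfig returned by main().
# 'automl-classification' is an assumed experiment name.
from azureml.core import Experiment

workspace = package_utils.get_workspace()
automl_config = main(workspace=workspace)
experiment = Experiment(workspace, "automl-classification")
remote_run = experiment.submit(automl_config, show_output=True)
best_run, fitted_model = remote_run.get_output()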
def RunAutoML(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] #location = request.json['location'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) stock_dataset_df = df.to_pandas_dataframe() print('file successfully recieved.') stock_dataset_df.head() #stock_dataset_json = stock_dataset_df.to_json(orient='split') #print(stock_dataset_json) y_df = stock_dataset_df['ActionTaken'].values x_df = stock_dataset_df.drop(['ActionTaken'], axis=1) print(y_df) ExperimentName = request.json['ExperimentName'] tasks = request.json['tasks'] iterations = request.json['iterations'] n_cross_validations = request.json['n_cross_validations'] iteration_timeout_minutes = request.json['iteration_timeout_minutes'] primary_metric = request.json['primary_metric'] max_concurrent_iterations = request.json['max_concurrent_iterations'] #n_cross_validations = request.json['n_cross_validations'] try: automl_settings = { "name": ExperimentName, "iteration_timeout_minutes": iteration_timeout_minutes, "iterations": iterations, "n_cross_validations": n_cross_validations, "primary_metric": primary_metric, "preprocess": True, "max_concurrent_iterations": max_concurrent_iterations, "verbosity": logging.INFO } automl_config = AutoMLConfig( task=tasks, debug_log='automl_errors.log', path=os.getcwd(), #compute_target = 'Automlvm', X=x_df, y=y_df, **automl_settings, ) experiment = Experiment(ws, 'automl_local_v2') remote_run = experiment.submit(automl_config, show_output=True) children = list(remote_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(1) rundata_toJson = rundata.to_json(orient='columns') return rundata_toJson except: return 'error'
def Prediction(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] location = request.json['location'] file_name = request.json['file_name'] target_var = request.json['target_var'] best_model = request.json['best_model'] Model_path = request.json['Model_path'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) stock_dataset_df = df.to_pandas_dataframe() print('file successfully recieved.') X = df.drop_columns(columns=[target_var]) y = df.keep_columns(columns=[target_var], validate=True) y_df = stock_dataset_df[target_var].values x_df = stock_dataset_df.drop([target_var], axis=1) print(y) #from azureml.core import Run #experiment=Experiment(ws, workspace_name) #from azureml.core.model import Model #model = Model(ws, name=Model_path) #model.download(exist_ok=True) from sklearn.externals import joblib cwd = 'D:\DCSAIAUTOML\BestModels\Azure' model_path = os.path.join(cwd, Model_path, best_model, "outputs") #model_path1 = os.path.join(model_path, "outputs", "model.pkl") print(model_path) os.chdir(model_path) model = joblib.load('model.pkl') #best_run = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8') #fitted_model = Run(experiment=experiment, run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8') print(model) try: y_predict = model.predict(x_df) print(y_predict) #prediction_toJson = y_predict.to_json(orient='columns') #print(prediction_toJson) df = pd.DataFrame(y_predict) df.rename(columns={0: "Prediction"}, inplace=True) #stock_df = stock_dataset_df[['SepalLengthCm','SepalWidthCm','Species']] result = pd.concat([stock_dataset_df, df], axis=1) result.to_csv( 'D:\\PredictionResult\\Azure\\prediction_azure_health.csv', index=False, date_format='%Y%m%d') result.head() prediction_toJson = result.to_json(orient='records') return prediction_toJson except Exception as e: error_statement = str(e) print("Error statement: ", error_statement) return error_statement
deploy_model = True print('Current model performs better and will be deployed!') else: print('Current model does NOT perform better and thus will NOT be deployed!') eval_info = {} eval_info["model_acc"] = latest_model_accuracy eval_info["deployed_model_acc"] = current_model_accuracy eval_info["deploy_model"] = deploy_model eval_info['train_run_id'] = latest_model_run_id eval_info['eval_run_id'] = run.id if deploy_model: os.chdir(args.input) cardata_ds_name = 'connected_car_components' cardata_ds = Dataset.get_by_name(workspace=ws, name=cardata_ds_name) glove_ds_name = 'glove_6B_100d' glove_ds = Dataset.get_by_name(workspace=ws, name=glove_ds_name) model_description = 'Deep learning model to classify the descriptions of car components as compliant or non-compliant.' # Create model datasheet from datetime import datetime from pytz import timezone etz = 'US/Eastern' time_stamp = datetime.now(timezone(etz)) time_stamp_str = time_stamp.strftime('%A %m/%d/%Y %I:%M:%S%p') model_tags = {} model_tags['title'] = 'Connected car components classifier' model_tags['datasheet_description'] = 'Data sheet last updated: ' + time_stamp_str model_tags['details'] = 'This model was developed for automatically classifying car components as compliant or not compliant. The model leverages deep learning technologies with Natural Language Processing techniques to scan through vehicle specification documents to find compliance issues with new regulations.'
def RunAutoMLReg(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] location = request.json['location'] target_var = request.json['target_var'] cluster_name = request.json['cluster_name'] best_model = request.json['best_model'] #best_model = request.json['best_model'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') #compute_target = AmlCompute(ws, cluster_name) compute_target = ws.compute_targets[cluster_name] print('Found existing AML compute context.') dataset_name = file_name # Get a dataset by name df = Dataset.get_by_name(workspace=ws, name=dataset_name) #stock_dataset_df = df.to_pandas_dataframe() print('file successfully recieved.') #stock_dataset_df.head() #stock_dataset_json = stock_dataset_df.to_json(orient='split') #print(stock_dataset_json) X = df.drop_columns(columns=[target_var]) y = df.keep_columns(columns=[target_var], validate=True) #y_df = stock_dataset_df[target_var].values #x_df = stock_dataset_df.drop([target_var], axis=1) print(y) # create a new RunConfig object conda_run_config = RunConfiguration(framework="python") conda_run_config.environment.docker.enabled = True conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy', 'py-xgboost<=0.90']) conda_run_config.environment.python.conda_dependencies = cd print('run config is ready') ExperimentName = request.json['ExperimentName'] tasks = request.json['tasks'] iterations = request.json['iterations'] n_cross_validations = request.json['n_cross_validations'] iteration_timeout_minutes = request.json['iteration_timeout_minutes'] primary_metric = request.json['primary_metric'] max_concurrent_iterations = request.json['max_concurrent_iterations'] try: automl_settings = { "name": ExperimentName, "iteration_timeout_minutes": iteration_timeout_minutes, "featurization": 'auto', "iterations": iterations, "n_cross_validations": n_cross_validations, "primary_metric": primary_metric, "preprocess": True, "max_concurrent_iterations": max_concurrent_iterations #"verbosity": logging.INFO } automl_config = AutoMLConfig( task=tasks, debug_log='automl_errors.log', blacklist_models=['XGBoost'], #path=os.getcwd(), compute_target=compute_target, #run_configuration=conda_run_config, X=X, y=y, **automl_settings, ) experiment = Experiment(ws, ExperimentName) remote_run = experiment.submit(automl_config, show_output=True) remote_run.flush(timeout_seconds=400) children = list(remote_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(axis=1, by=primary_metric) rundata = rundata.drop([ 'mean_absolute_percentage_error', 'normalized_median_absolute_error', 'normalized_root_mean_squared_log_error', 'root_mean_squared_log_error' ]) rundata.rename(columns={ 0: "one", 1: "two", 2: "three", 3: "four", 4: "five", 5: "six", 6: "seven", 7: "eight", 8: "nine", 9: "ten", }, inplace=True) iterations_toJson = rundata.to_json(orient='columns') print(iterations_toJson) best_run, fitted_model = remote_run.get_output() 
best_run_toJson = best_run.get_metrics() cwd = 'D:/DCSAIAUTOML/BestModels/Azure' best_model_name = best_run.name model = remote_run.register_model(description=best_model) print(model.name, model.id, model.version, sep='\t') model_path = os.path.join(cwd, best_model, best_model_name) print(model_path) #print("Model DownLoad Complete") #model = Model(workspace=ws, name=model.name) #model.download_files(target_dir=model_path) #dict = {} #dict['iterations_toJson'] = iterations_toJson #dict['best_run_toJson'] = best_run_toJson #print(best_run.get_file_names()) #Register the model #from datetime import date best_model_id = best_run.name var1 = "@" var2 = var1 + best_model_id Reg_model_name = model.name var4 = var1 + Reg_model_name best_run.flush(timeout_seconds=3600) best_run.download_files(output_directory=model_path) # importing required modules #import shutil #output_path = os.path.join(model_path, best_model_id) #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best" #dir_name1 = "D:\\DCSAIAUTOML\\BestModels\\Azure\\my_azure_best\\my_azure_best" #shutil.make_archive(model_path,'zip',model_path) #zipf = zipfile.ZipFile(best_model_id+'.zip', 'w', zipfile.ZIP_DEFLATED) #for root, dirs, files in os.walk(model_path): #for file in files: #zipf.write(os.path.join(root, file)) #def zipdir(path, ziph): # ziph is zipfile handle #import os #for root, dirs, files in os.walk(path): #for file in files: #ziph.write(os.path.join(root, file)) #zipdir(model_path, zipf) #remote_run.clean_preprocessor_cache() print("ready to return") var5 = "no exception" return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1, var5) #return iterations_toJson except Exception as e: error_statement = str(e) print("Error statement: ", error_statement) model_path1 = os.path.join(model_path, 'outputs') file_name = 'model.pkl' print("in exception: ", model_path1) src = 'D:\\Final Script_dev' full_file_name = os.path.join(src, file_name) import shutil #remote_run.download_file('model.pkl', output_file_path=model_path1) if os.path.isfile(full_file_name): shutil.copy(full_file_name, model_path1) return '{} {} {} {} {}'.format(iterations_toJson, var2, var4, var1, error_statement)
workspace = Workspace.from_config(auth=AzureCliAuthentication()) # Define the conda dependencies cd = CondaDependencies(conda_dependencies_file_path=os.path.join( os.path.dirname(os.path.realpath(__file__)), 'conda_dependencies_sklearn.yml')) # define compute compute_target = '20cpucluster' # define data set names input_name_train = 'newsgroups_train' input_name_test = 'newsgroups_test' # Retrieve datasets dataset_train = Dataset.get_by_name(workspace, name=input_name_train) dataset_test = Dataset.get_by_name(workspace, name=input_name_test) # Runconfig amlcompute_run_config = RunConfiguration( script="train.py", conda_dependencies=cd, framework='Python', ) amlcompute_run_config.environment.docker.enabled = True amlcompute_run_config.environment.spark.precache_packages = False amlcompute_run_config.target = compute_target amlcompute_run_config.data = { input_name_train: load_data(dataset_train, input_name_train), input_name_test: load_data(dataset_test, input_name_test)
def RunAutoMLForecast(): subscription_id = request.json['subscription_id'] resource_group = request.json['resource_group'] workspace_name = request.json['workspace_name'] file_name = request.json['file_name'] location = request.json['location'] target_var = request.json['target_var'] cluster_name = request.json['cluster_name'] best_model = request.json['best_model'] time_column_name = request.json['time_column_name'] max_horizon = request.json['max_horizon'] ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name) print("Found workspace {} at location {}".format(ws.name, ws.location)) print('Found existing Workspace.') compute_target = AmlCompute(ws, cluster_name) print('Found existing AML compute context.') dataset_name = file_name time_column_name = time_column_name # Get a dataset by name dataset = Dataset.get_by_name(workspace=ws, name=dataset_name).with_timestamp_columns( fine_grain_timestamp=time_column_name) print(dataset) #df_ts = Dataset.Tabular.from_delimited_files(df_ts) dataset.to_pandas_dataframe().describe() dataset.take(3).to_pandas_dataframe() print(dataset) #y_df = df_ts[target_var].values #x_df = df_ts.drop([target_var], axis=1) print('file successfully recieved.') #stock_dataset_df.head() # create a new RunConfig object conda_run_config = RunConfiguration(framework="python") conda_run_config.environment.docker.enabled = True conda_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'], conda_packages=['numpy', 'py-xgboost<=0.80']) conda_run_config.environment.python.conda_dependencies = cd print('run config is ready') ExperimentName = request.json['ExperimentName'] tasks = request.json['tasks'] iterations = request.json['iterations'] n_cross_validations = request.json['n_cross_validations'] iteration_timeout_minutes = request.json['iteration_timeout_minutes'] primary_metric = request.json['primary_metric'] #max_concurrent_iterations = request.json['max_concurrent_iterations'] automl_settings = { 'time_column_name': time_column_name, 'max_horizon': max_horizon, "iterations": iterations, } automl_config = AutoMLConfig( task=tasks, primary_metric=primary_metric, #blacklist_models = ['ExtremeRandomTrees', 'AutoArima', 'Prophet'], experiment_timeout_minutes=iteration_timeout_minutes, training_data=dataset, label_column_name=target_var, compute_target=compute_target, enable_early_stopping=True, n_cross_validations=n_cross_validations, #verbosity=logging.INFO, **automl_settings) print("AutoML config created.") experiment = Experiment(ws, ExperimentName) remote_run = experiment.submit(automl_config, show_output=True) children = list(remote_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(axis=1, by=primary_metric) rundata.rename(columns={ 0: "one", 1: "two", 2: "three", 3: "four", 4: "five", 5: "six", 6: "seven", 7: "eight", 8: "nine", 9: "ten", }, inplace=True) iterations_toJson = rundata.to_json(orient='columns') print(iterations_toJson) best_run, fitted_model = remote_run.get_output() #best_run_toJson = best_run.get_metrics() #dict = {} #dict['iterations_toJson'] = iterations_toJson #dict['best_run_toJson'] = best_run_toJson #print(best_run.get_file_names()) #Register the model #from datetime import date model 
= remote_run.register_model(model_name=best_model, description='AutoML Model') print(model.name, model.id, model.version, sep='\t') best_model = model.name best_model var1 = "@" var2 = var1 + best_model return '{} {}'.format(iterations_toJson, var2)
experiment = Experiment(ws, '<<experiment_name>>') automl_run = Run(experiment=experiment, run_id='<<run_id>>') # Check if this AutoML model is explainable if not automl_check_model_if_explainable(automl_run): raise Exception("Model explanations are currently not supported for " + automl_run.get_properties().get('run_algorithm')) # Download the best model from the artifact store automl_run.download_file(name=MODEL_PATH, output_file_path='model.pkl') # Load the AutoML model into memory fitted_model = joblib.load('model.pkl') # Get the train dataset from the workspace train_dataset = Dataset.get_by_name(workspace=ws, name='<<train_dataset_name>>') # Drop the labelled column to get the training set. X_train = train_dataset.drop_columns(columns=['<<target_column_name>>']) y_train = train_dataset.keep_columns(columns=['<<target_column_name>>'], validate=True) # Get the test dataset from the workspace test_dataset = Dataset.get_by_name(workspace=ws, name='<<test_dataset_name>>') # Drop the labelled column to get the testing set. X_test = test_dataset.drop_columns(columns=['<<target_column_name>>']) # Setup the class for explaining the AutoML models automl_explainer_setup_obj = automl_setup_model_explanations(fitted_model, '<<task>>', X=X_train, X_test=X_test,
from sklearn.linear_model import LogisticRegression import argparse import os import numpy as np from sklearn.metrics import mean_squared_error import joblib from sklearn.model_selection import train_test_split import pandas as pd from azureml.core.run import Run from azureml.core.dataset import Dataset from azureml.core import Experiment, Workspace from azureml.data.dataset_factory import TabularDatasetFactory # Load dataset into data variable: ws = Workspace.get("quick-starts-ws-126078") ds = Dataset.get_by_name(ws, name='Heart-Failure') data = ds.to_pandas_dataframe() #Split Target and Features in y and x respectively in the clean_data function def clean_data(data): y = data['DEATH_EVENT'] x = data.drop(['DEATH_EVENT'], axis=1) return x, y x, y = clean_data(data) # Split data into train and test sets.
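The snippet stops at the comment announcing the split. A minimal continuation sketch is below; the test size, regularization strength, max_iter, and output path are assumptions that mirror the other training scripts in this collection.

# Hedged continuation sketch: split, fit, score, and save (hyperparameters are assumptions).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LogisticRegression(C=1.0, max_iter=100).fit(x_train, y_train)
accuracy = model.score(x_test, y_test)
print("Accuracy:", accuracy)

# Files written to "outputs" are automatically uploaded into run history.
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/model.joblib')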