def main(unused_argv):
    """Distributed MNIST training entry point (TF parameter-server style).

    Reads the cluster layout from the TF_CONFIG environment variable, has the
    chief worker download MNIST while the other tasks wait on a sentinel file,
    builds a one-hidden-layer softmax classifier under a replica device
    setter, and trains it through a tf.train.Supervisor-managed session,
    optionally with synchronous replica aggregation.  The chief worker logs
    the final validation cross entropy to the AzureML run.

    Relies on module-level FLAGS, IMAGE_PIXELS, input_data, and Run — all
    defined outside this block.
    """
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    # Cluster topology is injected by the launcher via TF_CONFIG.
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config or tf_config == "":
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    # AzureML names the first worker "master"; fold it into the worker pool.
    job_name = "worker" if job_name == "master" else job_name

    # Only the chief worker downloads; every other task busy-waits on the
    # sentinel file, then re-reads the already-downloaded dataset.
    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(
            cluster_spec, job_name=job_name, task_index=task_index)
        if job_name == "ps":
            # Parameter servers block here forever serving variable state.
            server.join()

    is_chief = (task_index == 0)

    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # NOTE(review): if FLAGS.num_gpus is negative, worker_device is unbound
    # and the tf.device call below raises NameError — confirm flag validation
    # happens elsewhere.

    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [FLAGS.hidden_units, 10],
                stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        # Clip predictions away from 0 so the log never produces -inf.
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the
            # sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        # Restrict placement to this worker and the ps tasks only.
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to
        # complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            # NOTE(review): task_index is an int, so "grpc://" + task_index
            # raises TypeError — presumably this should be the worker's
            # host:port looked up from the cluster spec. Confirm before
            # running with --existing_servers.
            server_grpc_url = "grpc://" + task_index
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the
            # init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step],
                               feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            # Stop once the *global* step count reaches the target; with
            # multiple workers each one performs only a share of the steps.
            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        # Only the chief worker reports the metric to AzureML.
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
from sklearn.externals import joblib
import os
import numpy as np
import mylib

# AzureML uploads everything written under ./outputs as run artifacts.
os.makedirs('./outputs', exist_ok=True)

# Standard sklearn diabetes regression dataset (X: features, y: target).
X, y = load_diabetes(return_X_y=True)

# Handle to the current AzureML run (offline stub when run locally).
run = Run.get_context()

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test}
}

# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = mylib.get_alphas()

# Sweep the regularization strength; each iteration fits a fresh model.
# NOTE(review): the loop body visible here ends at fit() — evaluation /
# logging presumably follows in the part of the file beyond this chunk.
for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])
def run(args):
    """Train and evaluate a multi-head pill-image classifier for one fold.

    Rewrites the path arguments on ``args`` to be rooted at
    ``args.data_root_dir``, fits or loads a LabelEncoder, splits the image
    list into reference/consumer train and val/test sets, trains via
    ``multihead_trainer.train``, evaluates on the holdout set with both a
    metric-embedding and a logit evaluator, and logs metrics, predictions
    and the model weights to AzureML / disk.

    Returns a tuple ``(run_metrics_df, all_predictions_df)``.

    Relies on module-level ``logger``, ``pd``, ``torch``, ``warnings``,
    ``datetime``, ``os``, ``MetricEmbeddingEvaluator``, ``LogitEvaluator``
    and ``model`` — NOTE(review): ``model`` is never assigned in this block;
    presumably it is a module-level global set by ``train`` or by the caller.
    Confirm, otherwise the evaluator construction below raises NameError.
    """
    if args.supress_warnings:
        warnings.simplefilter("ignore")

    # All path-like args are given relative to the data root; absolutize once.
    def adjust_path(p):
        return os.path.join(args.data_root_dir, p)

    args.label_encoder = adjust_path(args.label_encoder)
    args.all_imgs_csv = adjust_path(args.all_imgs_csv)
    args.val_imgs_csv = adjust_path(args.val_imgs_csv)
    args.test_imgs_csv = adjust_path(args.test_imgs_csv)
    args.results_dir = adjust_path(args.results_dir)
    print(args)

    from multihead_trainer import train
    from multihead_trainer import torch_transform

    # TODO: consolidate logid
    def build_logid_string(args, add_timestamp=True):
        # Human-readable run id built from the key hyperparameters.
        param_str = "lr{}_dr{}_lrpatience{}_lrfactor{}_{}".format(
            args.init_lr, args.dropout, args.lr_patience, args.lr_factor,
            args.appearance_network)

        if add_timestamp:
            param_str += "_" + datetime.datetime.now().strftime("%Y%m%d%H%M")
        return param_str

    param_str = build_logid_string(args)

    # Azure ML
    from azureml.core.run import Run
    run = Run.get_context()

    # log arguments if it's not called by train_cv
    if not hasattr(args, 'folds_csv_dir'):
        for k, v in vars(args).items():
            run.tag(k, str(v))

    save_path = os.path.join(args.results_dir, param_str)
    os.makedirs(save_path, exist_ok=True)
    print("save_path", save_path)

    logger.info(
        f"cuda.is_available={torch.cuda.is_available()}, n_gpu={torch.cuda.device_count()}"
    )

    # encode the classes
    from sklearn.preprocessing import LabelEncoder
    import pickle

    # Fit a new encoder only when no pickled one exists; otherwise reuse it so
    # class indices stay stable across runs/folds.
    if not os.path.exists(args.label_encoder):
        logger.warning(f"Fitting a new label encoder at {args.label_encoder}")
        all_imgs_df = pd.read_csv(args.all_imgs_csv)

        label_encoder = LabelEncoder()
        label_encoder.fit(all_imgs_df['label'])

        pickle.dump(label_encoder, open(args.label_encoder, "wb"))
    else:
        logger.info(f"Loading label encoder: {args.label_encoder}")

        with open(args.label_encoder, 'rb') as pickle_file:
            label_encoder = pickle.load(pickle_file)

    logger.info(f"label_encoder.classes_={label_encoder.classes_}")
    logger.info("The label encoder has {} classes.".format(
        len(label_encoder.classes_)))

    # Load image list
    all_images_df = pd.read_csv(args.all_imgs_csv)
    val_df = pd.read_csv(args.val_imgs_csv)
    test_df = pd.read_csv(args.test_imgs_csv)

    for df in [all_images_df, val_df, test_df]:
        df['image_path'] = df['image_path'].apply(
            lambda x: os.path.join(args.data_root_dir, args.img_dir, x))

    # Training set = everything not present in val or test (split by path).
    val_test_image_paths = list(val_df['image_path'].values) + list(
        test_df['image_path'].values)
    train_df = all_images_df[~all_images_df['image_path'].
                             isin(val_test_image_paths)]

    # Reference images vs. consumer-captured images are handled separately.
    ref_only_df = train_df[train_df['is_ref']]
    cons_train_df = train_df[train_df['is_ref'] == False]
    cons_val_df = val_df

    print("all_images", len(all_images_df), "train", len(train_df), "val",
          len(val_df), "test", len(test_df))
    run.log("all_images_size", len(all_images_df))
    run.log("train_size", len(train_df))
    run.log("val_size", len(val_df))
    run.log("test_size", len(test_df))

    print("ref_only_df", len(ref_only_df), "cons_train_df",
          len(cons_train_df), "cons_val_df", len(cons_val_df))

    import classif_utils
    classif_utils.ClassificationDataset.set_datadir(
        os.path.join(args.data_root_dir, args.img_dir))

    def plot_pr_curve(plt, dataset_name):
        # Upload the PR-curve figure to the AzureML run, then free the figure.
        run.log_image(name='{}_{}_{}'.format(
            dataset_name,
            datetime.datetime.now().strftime("%H:%M:%S"), 'PR-curve'),
                      plot=plt)
        plt.close()

    def log_metrics(metrics_results, dataset_name):
        # Push scalar metrics (and precision-indices strings) to AzureML and
        # collect them as rows for the summary dataframe returned by run().
        from metrics import create_prec_inds_str
        import matplotlib
        matplotlib.use('Agg')  # backend that doesn't display to the user
        import matplotlib.pyplot as plt
        import matplotlib.image as mpimg

        run_metrics = []
        for k, v in metrics_results.items():
            if ('p_indices' in k) and not ('sanity' in dataset_name):
                pind_str = create_prec_inds_str(v, label_encoder)
                run.log("{}_{}".format(dataset_name, k), pind_str)
                run_metrics.append([
                    os.path.split(args.val_imgs_csv)[1], dataset_name, k,
                    pind_str
                ])
            elif isinstance(v, (int, float)):
                run.log("{}_{}".format(dataset_name, k), v)
                run_metrics.append(
                    [os.path.split(args.val_imgs_csv)[1], dataset_name, k, v])
        return run_metrics

    #if da_train, models is actually a dictionary with F1, F2 and G model,
    val_metrics = train(ref_only_df,
                        cons_train_df,
                        cons_val_df,
                        label_encoder,
                        torch_transform,
                        'label',
                        args.batch_size,
                        len(label_encoder.classes_),
                        args,
                        args.max_epochs,
                        results_dir=save_path,
                        add_perspective=args.add_persp_aug)
    print('completed train()')
    print('val_metrics', val_metrics)

    run_metrics_list = log_metrics(val_metrics, 'val')
    predictions_dfs_list = []

    from sanitytest_eval import create_eval_dataloaders
    # NOTE(review): `model` is not defined anywhere in this function — see
    # the docstring; verify where it comes from.
    evaluator = MetricEmbeddingEvaluator(
        model,
        args.metric_simul_sidepairs_eval,
        sidepairs_agg_method=args.sidepairs_agg,
        metric_evaluator_type=args.metric_evaluator_type)
    logit_evaluator = LogitEvaluator(model,
                                     args.metric_simul_sidepairs_eval,
                                     sidepairs_agg_method=args.sidepairs_agg)

    #figures out label column for sanity test
    def get_labelcol_eval(de_imgs_df):
        #figuring out if it is a pilltype_id or label_prod_code encoder
        #to set the label column of the sanity test set
        labels_df = pd.DataFrame({'label': label_encoder.classes_})
        img_df = pd.merge(de_imgs_df,
                          labels_df,
                          left_on=['label_prod_code'],
                          right_on=['label'],
                          how='inner')
        if len(img_df) > 1:
            labelcol = 'label_prod_code'
        else:
            labelcol = 'pilltype_id'
        print('Selecting {} for sanity test label'.format(labelcol))
        return de_imgs_df[labelcol]

    def test_model(de_imgs_df,
                   evaluator,
                   dataset_name,
                   run_metrics_list,
                   predictions_dfs_list,
                   rotate_aug=None):
        # Evaluate `evaluator` on de_imgs_df against the reference images,
        # log the metrics, and append the per-image predictions (mutates the
        # two list arguments in place).
        if rotate_aug is not None:
            dataset_name += "_rotate_aug{}".format(rotate_aug)
        print("Evaluating", dataset_name)
        eval_dataloader, eval_dataset = create_eval_dataloaders(
            de_imgs_df,
            label_encoder,
            torch_transform,
            'label',
            24,
            rotate_aug=rotate_aug)
        ref_dataloader, _ = create_eval_dataloaders(ref_only_df,
                                                    label_encoder,
                                                    torch_transform,
                                                    'label',
                                                    24,
                                                    rotate_aug=rotate_aug)
        dataloader = {'ref': ref_dataloader, 'eval': eval_dataloader}
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print("Eval {}: {} images from {} total images".format(
            dataset_name, len(eval_dataset), len(de_imgs_df)))
        metrics_results, predictions = evaluator.eval_model(
            device, dataloader, do_pr_metrics=True, add_single_side_eval=True)
        plot_pr_curve(metrics_results['PR-curve'], dataset_name)
        run_metrics_list += log_metrics(metrics_results, dataset_name)
        predictions['dataset'] = dataset_name
        predictions['val_imgs_csv'] = os.path.split(args.val_imgs_csv)[1]
        predictions_dfs_list.append(predictions)
        return metrics_results, predictions

    # Holdout evaluation with both evaluator types.
    test_model(test_df, logit_evaluator, 'holdout-logit', run_metrics_list,
               predictions_dfs_list)
    test_model(test_df, evaluator, 'holdout', run_metrics_list,
               predictions_dfs_list)

    run_metrics_df = pd.DataFrame(
        run_metrics_list, columns=['val_imgs_csv', 'dataset', 'name', 'value'])
    all_predictions_df = pd.concat(predictions_dfs_list, ignore_index=True)

    # make sure to save both
    for target_save_dir in [save_path, 'outputs']:
        print(f'saving predictions {target_save_dir}')
        # TODO: this csv can be large. Update the format for the numpy array of prediction scores.
        os.makedirs(target_save_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(
                target_save_dir, 'eval_predictions_{}'.format(
                    os.path.basename(args.val_imgs_csv))))

    torch.save(
        model.state_dict(),
        os.path.join(save_path,
                     '{}.pth'.format(os.path.basename(args.val_imgs_csv))))

    return run_metrics_df, all_predictions_df
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math
import tensorflow as tf
from azureml.core.run import Run  ##### Modified

# Get run when running in remote ##### Modified
# Only fetch the AzureML run handle if a `run` binding was not already
# created earlier in the script (e.g. by a local/offline code path).
if 'run' not in locals():  ##### Modified
    run = Run.get_context()  ##### Modified

# Populated later by the argument parser.
FLAGS = None
batch_size = 100

#
# define functions for Estimator
#


def _my_input_fn(filepath, num_epochs):
    # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
    # label - digit (0, 1, ..., 9)
    # NOTE(review): the body of this function continues beyond this chunk;
    # only the queue construction is visible here.
    data_queue = tf.train.string_input_producer(
        [filepath],
        num_epochs=num_epochs
    )  # data is repeated and it raises OutOfRange when data is over
def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
    """Materialize training data for an AutoML run from a dataprep payload.

    ``dataprep_json`` is either (a) serialized dataflows (key ``activities``)
    which are resolved into X/y/weights/cv-split arrays, or (b) a dict of
    CSV-reading options, from which a Dataflow is built against a workspace
    Datastore.  Returns a dict of fit-iteration parameters
    (X, y, X_valid, y_valid, sample weights, cv_splits_indices,
    x_raw_column_names).

    On any failure the exception is classified (User vs Client error) and a
    RuntimeError is raised.  Relies on module-level ``Run``,
    ``_get_parent_run_id``, ``ErrorTypes``, ``log_traceback`` and ``json``.
    """
    current_run = Run.get_submitted_run()
    parent_run_id = _get_parent_run_id(current_run._run_id)
    print("[ParentRunId:{}]: Start getting data using dataprep.".format(
        parent_run_id))
    logger.info(
        "[ParentRunId:{}]: Start getting data using dataprep.".format(
            parent_run_id))
    try:
        import azureml.train.automl._dataprep_utilities as dataprep_utilities
    except Exception as e:
        # Missing/broken SDK installation — not attributable to user or data.
        e.error_type = ErrorTypes.Unclassified
        log_traceback(e, logger)
        logger.error(e)
        raise e

    fit_iteration_parameters_dict = dict()

    # Sentinel exception so an IndexError from label retrieval can be told
    # apart from any other failure and classified as a user error below.
    class RetrieveNumpyArrayError(Exception):
        def __init__(self):
            super().__init__()

    try:
        print("Resolving Dataflows...")
        logger.info("Resolving Dataflows...")
        dataprep_json_obj = json.loads(dataprep_json)
        if 'activities' in dataprep_json_obj:  # json is serialized dataflows
            dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                dataprep_json)
            # Feature matrices and sample weights come back as dataframes.
            for k in [
                    'X', 'X_valid', 'sample_weight', 'sample_weight_valid'
            ]:
                fit_iteration_parameters_dict[
                    k] = dataprep_utilities.try_retrieve_pandas_dataframe(
                        dataflow_dict.get(k))
            # Labels come back as numpy arrays; a missing label column
            # surfaces as IndexError.
            for k in ['y', 'y_valid']:
                try:
                    fit_iteration_parameters_dict[
                        k] = dataprep_utilities.try_retrieve_numpy_array(
                            dataflow_dict.get(k))
                except IndexError:
                    raise RetrieveNumpyArrayError()
            # Collect numbered cv_splits_indices_<i> entries until a gap.
            cv_splits_dataflows = []
            i = 0
            while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                cv_splits_dataflows.append(
                    dataflow_dict['cv_splits_indices_{0}'.format(i)])
                i = i + 1
            fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
        else:  # json is dataprep options
            print('Creating Dataflow from options...\r\nOptions:')
            logger.info('Creating Dataflow from options...')
            print(dataprep_json_obj)
            datastore_name = dataprep_json_obj[
                'datastoreName']  # mandatory
            data_path = dataprep_json_obj['dataPath']  # mandatory
            label_column = dataprep_json_obj['label']  # mandatory
            separator = dataprep_json_obj.get('columnSeparator', ',')
            header = dataprep_json_obj.get('promoteHeader', True)
            encoding = dataprep_json_obj.get('encoding', None)
            quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
            skip_rows = dataprep_json_obj.get('skipRows', 0)
            feature_columns = dataprep_json_obj.get('features', [])

            from azureml.core import Datastore
            import azureml.dataprep as dprep
            if header:
                header = dprep.PromoteHeadersMode.CONSTANTGROUPED
            else:
                header = dprep.PromoteHeadersMode.NONE
            # Fall back to UTF-8 when the requested encoding name is unknown
            # (or None).
            try:
                encoding = dprep.FileEncoding[encoding]
            except:
                encoding = dprep.FileEncoding.UTF8

            ws = Run.get_context().experiment.workspace
            datastore = Datastore(ws, datastore_name)
            dflow = dprep.read_csv(path=datastore.path(data_path),
                                   separator=separator,
                                   header=header,
                                   encoding=encoding,
                                   quoting=quoting,
                                   skip_rows=skip_rows)

            # No explicit feature list means "all columns except the label".
            if len(feature_columns) == 0:
                X = dflow.drop_columns(label_column)
            else:
                X = dflow.keep_columns(feature_columns)

            print('Inferring types for feature columns...')
            logger.info('Inferring types for feature columns...')
            sct = X.builders.set_column_types()
            sct.learn()
            sct.ambiguous_date_conversions_drop()
            X = sct.to_dataflow()

            y = dflow.keep_columns(label_column)
            if automl_settings_obj.task_type.lower() == 'regression':
                y = y.to_number(label_column)

            print('X:')
            print(X)
            logger.info('X:')
            logger.info(X)
            print('y:')
            print(y)
            logger.info('y:')
            logger.info(y)

            # Newer SDKs can also return raw column names; fall back to the
            # plain dataframe retrieval on older versions.
            try:
                from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                _X = try_retrieve_pandas_dataframe_adb(X)
                fit_iteration_parameters_dict['X'] = _X.values
                fit_iteration_parameters_dict[
                    'x_raw_column_names'] = _X.columns.values
            except ImportError:
                logger.info(
                    "SDK version does not support column names extraction, fallback to old path"
                )
                fit_iteration_parameters_dict[
                    'X'] = dataprep_utilities.try_retrieve_pandas_dataframe(
                        X)

            try:
                fit_iteration_parameters_dict[
                    'y'] = dataprep_utilities.try_retrieve_numpy_array(y)
            except IndexError:
                raise RetrieveNumpyArrayError()

        logger.info("Finish getting data using dataprep.")
        return fit_iteration_parameters_dict
    except Exception as e:
        print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
              format(parent_run_id, e.__class__, e))
        logger.error(
            "[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
            format(parent_run_id, e.__class__, e))
        # Classify the failure so telemetry distinguishes user-data problems
        # from SDK/service problems.
        if isinstance(e, RetrieveNumpyArrayError):
            logger.debug("Label column (y) does not exist in user's data.")
            e.error_type = ErrorTypes.User
        elif "The provided path is not valid." in str(e):
            logger.debug("User's data is not accessible from remote run.")
            e.error_type = ErrorTypes.User
        elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(
                e):
            logger.debug(
                "User should use Datastore to data that requires secrets.")
            e.error_type = ErrorTypes.User
        else:
            e.error_type = ErrorTypes.Client
        log_traceback(e, logger)
        raise RuntimeError("Error during extracting Dataflows")
def _set_problem_info_for_setup(fit_iteration_parameters_dict,
                                automl_settings_obj, task_type, preprocess,
                                enable_subsampling, num_iterations, logger):
    """Validate training data and record AutoML problem info on the run.

    Validates the fit parameters (using the newest SDK helper available,
    gated by the module-level ``sdk_has_validate_data_dict`` /
    ``sdk_has_validate_data_splits`` flags), then — when caching is
    supported and a default datastore is reachable — transforms the data and
    calls ``automl.set_problem_info`` with the transformed context, falling
    back to the legacy (untransformed) path on any setup failure.

    ``num_iterations`` is currently unused in the visible body.  Relies on
    module-level ``Run``, ``_get_parent_run_id``, ``time``, ``automl``,
    ``log_traceback`` and the various validate/transform helpers.
    """
    current_run = Run.get_submitted_run()
    logger.info(
        "Start to set problem info for the setup for run id {}.".format(
            current_run._run_id))
    logger.info("Setup experiment.")
    try:
        experiment = current_run.experiment
        parent_run_id = _get_parent_run_id(current_run._run_id)
        data_store = experiment.workspace.get_default_datastore()
        found_data_store = True
        logger.info("Using data store.")
    except Exception as e:
        # No datastore available — caching path below will be skipped.
        logger.warning(
            "Getting data store, fallback to default {}".format(e))
        found_data_store = False

    logger.info("Caching supported {}.".format(sdk_has_cache_capability
                                               and found_data_store))
    print("caching supported {}".format(sdk_has_cache_capability
                                        and found_data_store))

    if sdk_has_validate_data_dict:
        # The newest version of validate_training_data_dict should contains check_x_y
        logger.info("Using validate_training_data_dict now.")
        validate_training_data_dict(
            data_dict=fit_iteration_parameters_dict,
            automl_settings=automl_settings_obj)
    else:
        # Older SDK: validate pieces individually, then check X/y explicitly.
        logger.info("Using validate_training_data now.")
        validate_training_data(
            X=fit_iteration_parameters_dict.get('X'),
            y=fit_iteration_parameters_dict.get('y'),
            X_valid=fit_iteration_parameters_dict.get('X_valid'),
            y_valid=fit_iteration_parameters_dict.get('y_valid'),
            sample_weight=fit_iteration_parameters_dict.get(
                'sample_weight'),
            sample_weight_valid=fit_iteration_parameters_dict.get(
                'sample_weight_valid'),
            cv_splits_indices=fit_iteration_parameters_dict.get(
                'cv_splits_indices'),
            automl_settings=automl_settings_obj)
        check_x_y(fit_iteration_parameters_dict.get('X'),
                  fit_iteration_parameters_dict.get('y'),
                  automl_settings_obj)

    if sdk_has_cache_capability and found_data_store:
        data_splits_validated = True
        try:
            start = time.time()
            transformed_data_context = _get_transformed_data_context(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                X_valid=fit_iteration_parameters_dict.get('X_valid'),
                y_valid=fit_iteration_parameters_dict.get('y_valid'),
                sample_weight=fit_iteration_parameters_dict.get(
                    'sample_weight'),
                sample_weight_valid=fit_iteration_parameters_dict.get(
                    'sample_weight_valid'),
                x_raw_column_names=fit_iteration_parameters_dict.get(
                    'x_raw_column_names'),
                cv_splits_indices=fit_iteration_parameters_dict.get(
                    'cv_splits_indices'),
                automl_settings_obj=automl_settings_obj,
                data_store=data_store,
                run_target='remote',
                parent_run_id=parent_run_id,
                logger=logger)
            end = time.time()
            print("time taken for transform {}".format(end - start))
            logger.info("time taken for transform {}".format(end - start))

            if sdk_has_validate_data_splits:
                try:
                    logger.info("Validating data splits now.")
                    _validate_data_splits(
                        X=transformed_data_context.X,
                        y=transformed_data_context.y,
                        X_valid=transformed_data_context.X_valid,
                        y_valid=transformed_data_context.y_valid,
                        cv_splits=transformed_data_context.cv_splits,
                        automl_settings=automl_settings_obj)
                    data_splits_validated = True
                except Exception as data_split_exception:
                    # Re-raised so the outer handler can tell a validation
                    # failure (re-raise) apart from a setup failure
                    # (fallback to the legacy path).
                    data_splits_validated = False
                    logger.error("Meeting validation errors {}.".format(
                        data_split_exception))
                    log_traceback(data_split_exception, logger)
                    raise data_split_exception

            logger.info("Start setting problem info.")
            automl.set_problem_info(
                transformed_data_context.X,
                transformed_data_context.y,
                automl_settings_obj.task_type,
                current_run=current_run,
                preprocess=automl_settings_obj.preprocess,
                lag_length=automl_settings_obj.lag_length,
                transformed_data_context=transformed_data_context,
                enable_cache=automl_settings_obj.enable_cache,
                subsampling=enable_subsampling)
        except Exception as e:
            if sdk_has_validate_data_splits and not data_splits_validated:
                # Data-split validation failed: surface the error to the user
                # rather than silently falling back.
                logger.error(
                    "sdk_has_validate_data_splits is True and data_splits_validated is False {}."
                    .format(e))
                log_traceback(e, logger)
                raise e
            else:
                # Any other setup failure: fall back to the legacy
                # (untransformed, uncached) problem-info path.
                logger.warning(
                    "Setup failed, fall back to old model {}".format(e))
                print("Setup failed, fall back to old model {}".format(e))
                automl.set_problem_info(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    task_type=task_type,
                    current_run=current_run,
                    preprocess=preprocess,
                    subsampling=enable_subsampling)
    else:
        logger.info("Start setting problem info using old model.")
        if sdk_has_validate_data_splits:
            _validate_data_splits(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                X_valid=fit_iteration_parameters_dict.get('X_valid'),
                y_valid=fit_iteration_parameters_dict.get('y_valid'),
                cv_splits=fit_iteration_parameters_dict.get(
                    'cv_splits_indices'),
                automl_settings=automl_settings_obj)
        automl.set_problem_info(X=fit_iteration_parameters_dict.get('X'),
                                y=fit_iteration_parameters_dict.get('y'),
                                task_type=task_type,
                                current_run=current_run,
                                preprocess=preprocess,
                                subsampling=enable_subsampling)
test_size=0.2, random_state=42) vectorizer = CountVectorizer() vectorizer.fit(x_train) X_train = vectorizer.transform(x_train) X_test = vectorizer.transform(x_test) test_data = X_test[1, :] test_data_array = test_data.toarray() test_data_list = test_data_array.tolist() print("len test_data_list", len(test_data_list)) print("len test_data_list 0", len(test_data_list[0])) with open("test_data.txt", "w") as fp: json.dump(test_data_list, fp) run = Run.get_context(allow_offline=True) def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument( "--C", type=float, default=1.0, help= "Inverse of regularization strength. Smaller values cause stronger regularization" ) parser.add_argument("--max_iter", type=int,
def load_and_clean(dataset_name): dataframe = load_data(dataset_name) dataframe = extract_features(dataframe) features, labels = clean_data(dataframe) return features, labels try: # Get workspace if run locally ws = Workspace.from_config() except: # Get workspace if run remotely ws = Run.get_context().experiment.workspace # Run run = Run.get_context() # Load and clean data features_train, labels_train = load_and_clean('energy-forecast-data-training') features_val, labels_val = load_and_clean('energy-forecast-data-validation') def main(): # Add arguments to script parser = argparse.ArgumentParser() parser.add_argument('--n_estimators', type=int,
def train(config, evaluate_only=False, outdir=".", detail=False,
          azureml=False):
    """Train and/or evaluate a keyword-spotter model from featurized audio.

    Loads the categories file and the featurized training/validation/testing
    .npz lists from ``config.dataset.path``, optionally trains a new model
    (saving mean/std normalization arrays, the .pt weights, an .onnx export
    and a train_results.json), and always evaluates on the test set.

    Parameters
    ----------
    config : TrainingConfig-like object with .model/.dataset/.training groups
    evaluate_only : skip training and load a previously trained model
    outdir : directory for all saved artifacts
    detail : passed through to model.fit (presumably verbose logging —
        confirm in the trainer)
    azureml : when True, fetch the AzureML run context and pass it to fit()

    Returns ``(rate, log)``: test accuracy in [0, 1] and the training log
    (None when evaluate_only).  Relies on module-level ``AudioDataset``,
    ``create_model``, ``TrainingConfig``, ``save_json``, ``np``, ``torch``.
    """
    filename = config.model.filename
    categories_file = config.dataset.categories
    wav_directory = config.dataset.path
    batch_size = config.training.batch_size
    hidden_units = config.model.hidden_units
    architecture = config.model.architecture
    num_layers = config.model.num_layers
    use_gpu = config.training.use_gpu

    run = None

    if azureml:
        from azureml.core.run import Run
        run = Run.get_context()
        if run is None:
            print("### Run.get_context() returned None")
        else:
            print("### Running in Azure Context")

    valid_layers = [1, 2, 3]
    if num_layers not in valid_layers:
        raise Exception(
            "--num_layers can only be one of these values {}".format(
                valid_layers))

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Default model filename derived from architecture + hidden units.
    if not filename:
        filename = "{}{}KeywordSpotter.pt".format(architecture, hidden_units)
        config.model.filename = filename

    # load the featurized data
    if not os.path.isdir(wav_directory):
        print("### Error: please specify valid --dataset folder location: {}".
              format(wav_directory))
        sys.exit(1)

    if not categories_file:
        categories_file = os.path.join(wav_directory, "categories.txt")

    with open(categories_file, "r") as f:
        keywords = [x.strip() for x in f.readlines()]

    training_file = os.path.join(wav_directory, "training_list.npz")
    testing_file = os.path.join(wav_directory, "testing_list.npz")
    validation_file = os.path.join(wav_directory, "validation_list.npz")

    if not os.path.isfile(training_file):
        print("Missing file {}".format(training_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(validation_file):
        print("Missing file {}".format(validation_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(testing_file):
        print("Missing file {}".format(testing_file))
        print("Please run make_datasets.py")
        sys.exit(1)

    model = None

    device = torch.device("cpu")
    if use_gpu:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print("### CUDA not available!!")

    print("Loading {}...".format(testing_file))
    test_data = AudioDataset(testing_file, config.dataset, keywords)

    log = None
    if not evaluate_only:
        print("Loading {}...".format(training_file))
        training_data = AudioDataset(training_file,
                                     config.dataset,
                                     keywords,
                                     training=True)

        print("Loading {}...".format(validation_file))
        validation_data = AudioDataset(validation_file, config.dataset,
                                       keywords)

        if training_data.mean is not None:
            # Persist normalization stats alongside the model so inference
            # can reproduce the same preprocessing.
            fname = os.path.join(outdir, "mean.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.mean)
            fname = os.path.join(outdir, "std.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.std)

            # use the training_data mean and std variation
            test_data.mean = training_data.mean
            test_data.std = training_data.std
            validation_data.mean = training_data.mean
            validation_data.std = training_data.std

        print("Training model {}".format(filename))
        model = create_model(config.model, training_data.input_size,
                             training_data.num_keywords)
        if device.type == 'cuda':
            model.cuda()  # move the processing to GPU

        start = time.time()
        log = model.fit(training_data, validation_data, config.training,
                        config.model.sparsify, device, detail, run)
        end = time.time()

        passed, total, rate = model.evaluate(training_data, batch_size,
                                             device)
        print("Training accuracy = {:.3f} %".format(rate * 100))

        torch.save(model.state_dict(), os.path.join(outdir, filename))

    print(
        "Evaluating {} keyword spotter using {} rows of featurized test audio..."
        .format(architecture, test_data.num_rows))
    if model is None:
        msg = "Loading trained model with input size {}, hidden units {} and num keywords {}"
        print(
            msg.format(test_data.input_size, hidden_units,
                       test_data.num_keywords))
        model = create_model(config.model, test_data.input_size,
                             test_data.num_keywords)
        # NOTE(review): this loads from `filename` (cwd-relative) while
        # training saves to os.path.join(outdir, filename), and `load_dict`
        # is not the standard torch `load_state_dict` — confirm both against
        # the model class before relying on evaluate_only.
        model.load_dict(torch.load(filename))

    if model and device.type == 'cuda':
        model.cuda()  # move the processing to GPU

    results_file = os.path.join(outdir, "results.txt")
    passed, total, rate = model.evaluate(test_data, batch_size, device,
                                         results_file)
    print("Testing accuracy = {:.3f} %".format(rate * 100))

    if not evaluate_only:
        # Export ONNX and record dataset geometry + results for downstream
        # tooling.
        name = os.path.splitext(filename)[0] + ".onnx"
        print("saving onnx file: {}".format(name))
        model.export(os.path.join(outdir, name), device)

        config.dataset.sample_rate = test_data.sample_rate
        config.dataset.input_size = test_data.audio_size
        config.dataset.num_filters = test_data.input_size
        config.dataset.window_size = test_data.window_size
        config.dataset.shift = test_data.shift

        logdata = {
            "accuracy_val": rate,
            "training_time": end - start,
            "log": log
        }
        d = TrainingConfig.to_dict(config)
        logdata.update(d)

        logname = os.path.join(outdir, "train_results.json")
        save_json(logdata, logname)

    return rate, log
def main(
    root_dir: str,
    input_dir: str,
    output_dir: str,
    stop_words: List,
    timestamp_interval: int,
    force: bool,
) -> None:
    """
    Main function for receiving args, and passing them through to form
    recognizer postprocessing function

    Parameters
    ----------
    root_dir: str
        Root datastore being used
    input_dir: str
        Directory containing input data
    output_dir: str
        Path to save outputs to
    stop_words: Array
        stop words that will not be counted when computing character level
        frequency
    timestamp_interval: int
        Interval used to split clapperboards into seperate events
    force: bool
        Flag that specifies whether current run should overwrite outputs
        from previous run
    """
    log.info("Clapperboard Selection Step")

    # Resolve paths
    input_dir = join(root_dir, input_dir)
    output_dir = join(root_dir, output_dir)

    run = Run.get_context()

    # convert tuple to list type
    stop_words = list(stop_words)

    # Idempotency guard: refuse to clobber a previous run unless forced.
    log.info("Checking if output from previous run exists...")
    if os.path.exists(output_dir) and not force:
        log.info("Output path already exists, please use --force to overwrite the results. Skipping...")
        return

    # set ocr recognizer credentials
    # Secrets are pulled from the run's linked Key Vault.
    ocr_credentials = {"key": run.get_secret("ocrkey"),
                       "endpoint": run.get_secret("ocrendpoint")}

    log.info("Beginning process to select best clapperboards..")

    # Create directory to store results in
    os.makedirs(output_dir, exist_ok=True)

    log.info("Running Clapperboard Selection Step")

    # Build dataframe
    # Filenames are expected to embed the timestamp after the last "="
    # in the stem (e.g. frame=12345.jpeg).
    image_obj = []
    for file in os.listdir(input_dir):
        if file.endswith(".jpeg"):
            image = file
            timestamp = int(Path(file).stem.split("=")[-1])
            image_obj.append(dict(image=image, timestamp=timestamp))

    output_file = os.path.join(output_dir, "selected_clapperboards.csv")

    image_df = pd.DataFrame(image_obj)
    image_df = image_df.sort_values(by=["timestamp"])
    image_df["image"] = image_df["image"].apply(
        lambda x: join(input_dir, x)
    )

    # feed video dataframe into the function to select clapperboards
    _ = get_best_clapperboard(
        image_df=image_df,
        ocr_credentials=ocr_credentials,
        output_file=output_file,
        tolerance=timestamp_interval,
        stop_words=stop_words,
    )
    log.info("Finished Running Clapperboard Selection Step")
def init():
    """Cache the current AML run context in the module-global ``current_run``.

    Called once at scoring/startup time so later calls can log against the
    same run without re-fetching the context.
    """
    global current_run
    context = Run.get_context()
    current_run = context
def main():
    """Train a 3-class segmentation model, checkpointing every epoch.

    Reads its configuration (``args``, ``model_choice``, ``device``, data
    loaders, etc.) from module-level globals defined elsewhere in the file.
    Logs train/val metrics to ./logs for TensorBoard and to the AML Run.
    """
    num_classes = 3

    # create checkpoint dir
    out_dir = './outputs' if args.out_dir is None else args.out_dir
    checkpoint_dir = os.path.join(out_dir, experiment_name, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # write logs to ./logs, which AML uploads to Artifact Service and makes
    # available to a TensorBoard instance. Also log some metrics through AML's
    # Run object.
    run = Run.get_context()
    logger_train = Logger('train', './logs', run)
    logger_val = Logger('val', './logs', run)
    log_sample_img_gt(sample_images_train, sample_images_val,
                      logger_train, logger_val)
    logging.info('Logged ground truth image samples')

    # larger model
    if model_choice == 'unet':
        model = Unet(feature_scale=feature_scale,
                     n_classes=num_classes,
                     is_deconv=True,
                     in_channels=3,
                     is_batchnorm=True)
    # year 2 best solution XD_XD's model, as the baseline model
    elif model_choice == 'unet_baseline':
        model = UnetBaseline(feature_scale=feature_scale,
                             n_classes=num_classes,
                             is_deconv=True,
                             in_channels=3,
                             is_batchnorm=True)
    else:
        sys.exit(
            'Invalid model_choice {}, choose unet_baseline or unet'.format(
                model_choice))

    model = model.to(device=device, dtype=dtype)  # move the model parameters to CPU/GPU

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device,
                                                            dtype=dtype)

    # can also use Nesterov momentum in optim.SGD
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate,
    #                       momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0
    if os.path.isfile(starting_checkpoint_path):
        logging.info(
            'Loading checkpoint from {0}'.format(starting_checkpoint_path))
        checkpoint = torch.load(starting_checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info('No valid checkpoint is provided. \n'
                     'Start to train from scratch...')
        model.apply(weights_init)

    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(
            val_loss, val_acc))
        return

    # NOTE(review): step counts starting_epoch * len(dset_train) — presumably
    # len(dset_train) is the number of steps per epoch; confirm against train().
    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, num_epochs):
        logging.info('Epoch {} of {}'.format(epoch, num_epochs))

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step,
                     logger_train)

        # evaluate on val set
        logging.info(
            'Evaluating model on the val set at the end of epoch {}...'.format(
                epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        # BUG FIX: the original passed (epoch, step, val_loss, val_acc) to a
        # 3-placeholder format string, so "val loss" printed the step counter
        # and "val accuracy" printed the loss. Pass the correct 3 values.
        logging.info('\nEpoch {}, val loss is {}, val accuracy is {}\n'.format(
            epoch, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # TODO log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(
            checkpoint_dir,
            'checkpoint_epoch{}_{}.pth.tar'.format(
                epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        # (typo "checkoutpoint" fixed in the log message below)
        logging.info(
            'Saving to checkpoint file at {}. Is it the highest accuracy checkpoint so far: {}'
            .format(checkpoint_path, str(is_best)))

        save_checkpoint(
            {
                'epoch': epoch + 1,  # saved checkpoints are numbered starting from 1
                'arch': model_choice,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_acc': best_acc
            }, is_best, checkpoint_path, checkpoint_dir)
def main():
    """Tune and evaluate a GradientBoostingClassifier for an AML HyperDrive run.

    Parses the four hyperparameters from the command line, loads and splits
    the CAMEL dataset, scales features, trains the model, and logs the
    hyperparameters plus test-set recall to the AML Run context.
    """
    # Prepare parser for parameters to tune
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.1,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument(
        '--n_estimators',
        type=int,
        default=20,
        help=
        "Maximum number of iterations to converge, similar to max_iter in Logistic Regression"
    )
    parser.add_argument(
        '--max_features',
        type=int,
        default=5,
        help=
        "Number of features to consider in one pass, i.e. how large could the tree grow"
    )
    parser.add_argument(
        '--max_depth',
        type=int,
        default=2,
        help="Maximum number of splits, i.e. how bushy could the tree grow")
    args = parser.parse_args()

    # Prepare the dataset to match the expected format
    path = 'https://raw.githubusercontent.com/allaccountstaken/automl_v_hyperdrive/main/data/camel_data_after2010Q3.csv'
    ds = load_data(path)
    X, y = clean_data(ds)

    # Consider for internal datasets:
    # from azureml.data.dataset_factory import TabularDatasetFactory
    # ds = TabularDatasetFactory.from_delimited_files(path)

    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y.ravel(),
                                                        train_size=0.7,
                                                        random_state=123)

    # Additionally consider scaling (this may not be important for tree-based
    # models)
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Prepare Azure run context to log tuning progress.
    # BUG FIX: np.float / np.int were deprecated in NumPy 1.20 and removed in
    # 1.24, so the original raised AttributeError under modern NumPy. The
    # builtins are the documented drop-in replacements.
    run = Run.get_context()
    run.log("Learning rate:", float(args.learning_rate))
    run.log("Number of estimators:", int(args.n_estimators))
    run.log("Number of features:", int(args.max_features))
    run.log("Max tree depth:", int(args.max_depth))

    # Instantiate and fit GBM classifier using sklearn library
    model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                       n_estimators=args.n_estimators,
                                       max_features=args.max_features,
                                       max_depth=args.max_depth,
                                       random_state=123)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Report performance metrics of the trained model using testing subset
    recall = recall_score(y_test, y_pred, average='binary')
    run.log("Recall", round(float(recall), 5))