def main(unused_argv):
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config:
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(
            cluster_spec, job_name=job_name, task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
        tf.train.replica_device_setter(
            worker_device=worker_device,
            ps_device="/job:ps/cpu:0",
            cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [FLAGS.hidden_units, 10],
                stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + task_index
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
from sklearn.externals import joblib
import os
import numpy as np
import mylib

os.makedirs('./outputs', exist_ok=True)

X, y = load_diabetes(return_X_y=True)

run = Run.get_context()

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
data = {"train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test}}

# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = mylib.get_alphas()

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])
def run(args):
    if args.supress_warnings:
        warnings.simplefilter("ignore")

    def adjust_path(p):
        return os.path.join(args.data_root_dir, p)

    args.label_encoder = adjust_path(args.label_encoder)
    args.all_imgs_csv = adjust_path(args.all_imgs_csv)
    args.val_imgs_csv = adjust_path(args.val_imgs_csv)
    args.test_imgs_csv = adjust_path(args.test_imgs_csv)
    args.results_dir = adjust_path(args.results_dir)

    print(args)

    from multihead_trainer import train
    from multihead_trainer import torch_transform

    # TODO: consolidate logid
    def build_logid_string(args, add_timestamp=True):
        param_str = "lr{}_dr{}_lrpatience{}_lrfactor{}_{}".format(
            args.init_lr, args.dropout, args.lr_patience, args.lr_factor,
            args.appearance_network)

        if add_timestamp:
            param_str += "_" + datetime.datetime.now().strftime("%Y%m%d%H%M")

        return param_str

    param_str = build_logid_string(args)

    # Azure ML
    from azureml.core.run import Run
    run = Run.get_context()

    # log arguments if it's not called by train_cv
    if not hasattr(args, 'folds_csv_dir'):
        for k, v in vars(args).items():
            run.tag(k, str(v))

    save_path = os.path.join(args.results_dir, param_str)
    os.makedirs(save_path, exist_ok=True)
    print("save_path", save_path)

    logger.info(
        f"cuda.is_available={torch.cuda.is_available()}, n_gpu={torch.cuda.device_count()}"
    )

    # encode the classes
    from sklearn.preprocessing import LabelEncoder

    import pickle
    if not os.path.exists(args.label_encoder):
        logger.warning(f"Fitting a new label encoder at {args.label_encoder}")

        all_imgs_df = pd.read_csv(args.all_imgs_csv)

        label_encoder = LabelEncoder()
        label_encoder.fit(all_imgs_df['label'])

        pickle.dump(label_encoder, open(args.label_encoder, "wb"))

    else:
        logger.info(f"Loading label encoder: {args.label_encoder}")

        with open(args.label_encoder, 'rb') as pickle_file:
            label_encoder = pickle.load(pickle_file)

    logger.info(f"label_encoder.classes_={label_encoder.classes_}")
    logger.info("The label encoder has {} classes.".format(
        len(label_encoder.classes_)))

    # Load image list
    all_images_df = pd.read_csv(args.all_imgs_csv)
    val_df = pd.read_csv(args.val_imgs_csv)
    test_df = pd.read_csv(args.test_imgs_csv)

    for df in [all_images_df, val_df, test_df]:
        df['image_path'] = df['image_path'].apply(
            lambda x: os.path.join(args.data_root_dir, args.img_dir, x))

    val_test_image_paths = list(val_df['image_path'].values) + list(
        test_df['image_path'].values)
    train_df = all_images_df[~all_images_df['image_path'].
                             isin(val_test_image_paths)]

    ref_only_df = train_df[train_df['is_ref']]
    cons_train_df = train_df[train_df['is_ref'] == False]
    cons_val_df = val_df

    print("all_images", len(all_images_df), "train", len(train_df), "val",
          len(val_df), "test", len(test_df))
    run.log("all_images_size", len(all_images_df))
    run.log("train_size", len(train_df))
    run.log("val_size", len(val_df))
    run.log("test_size", len(test_df))

    print("ref_only_df", len(ref_only_df), "cons_train_df", len(cons_train_df),
          "cons_val_df", len(cons_val_df))

    import classif_utils
    classif_utils.ClassificationDataset.set_datadir(
        os.path.join(args.data_root_dir, args.img_dir))

    def plot_pr_curve(plt, dataset_name):
        run.log_image(name='{}_{}_{}'.format(
            dataset_name,
            datetime.datetime.now().strftime("%H:%M:%S"), 'PR-curve'),
                      plot=plt)
        plt.close()

    def log_metrics(metrics_results, dataset_name):
        from metrics import create_prec_inds_str
        import matplotlib
        matplotlib.use('Agg')  #backend that doesn't display to the user
        import matplotlib.pyplot as plt
        import matplotlib.image as mpimg

        run_metrics = []

        for k, v in metrics_results.items():
            if ('p_indices' in k) and not ('sanity' in dataset_name):
                pind_str = create_prec_inds_str(v, label_encoder)

                run.log("{}_{}".format(dataset_name, k), pind_str)
                run_metrics.append([
                    os.path.split(args.val_imgs_csv)[1], dataset_name, k,
                    pind_str
                ])

            elif isinstance(v, (int, float)):
                run.log("{}_{}".format(dataset_name, k), v)
                run_metrics.append(
                    [os.path.split(args.val_imgs_csv)[1], dataset_name, k, v])

        return run_metrics

    #if da_train, models is actually a dictionary with F1, F2 and G
    model, val_metrics = train(ref_only_df,
                               cons_train_df,
                               cons_val_df,
                               label_encoder,
                               torch_transform,
                               'label',
                               args.batch_size,
                               len(label_encoder.classes_),
                               args,
                               args.max_epochs,
                               results_dir=save_path,
                               add_perspective=args.add_persp_aug)

    print('completed train()')
    print('val_metrics', val_metrics)

    run_metrics_list = log_metrics(val_metrics, 'val')
    predictions_dfs_list = []

    from sanitytest_eval import create_eval_dataloaders

    evaluator = MetricEmbeddingEvaluator(
        model,
        args.metric_simul_sidepairs_eval,
        sidepairs_agg_method=args.sidepairs_agg,
        metric_evaluator_type=args.metric_evaluator_type)

    logit_evaluator = LogitEvaluator(model,
                                     args.metric_simul_sidepairs_eval,
                                     sidepairs_agg_method=args.sidepairs_agg)

    #figures out label column for sanity test
    def get_labelcol_eval(de_imgs_df):

        #figuring out if it is a pilltype_id or label_prod_code encoder
        #to set the label column of the sanity test set
        labels_df = pd.DataFrame({'label': label_encoder.classes_})
        img_df = pd.merge(de_imgs_df,
                          labels_df,
                          left_on=['label_prod_code'],
                          right_on=['label'],
                          how='inner')

        if len(img_df) > 1:
            labelcol = 'label_prod_code'
        else:
            labelcol = 'pilltype_id'
        print('Selecting {} for sanity test label'.format(labelcol))

        return de_imgs_df[labelcol]

    def test_model(de_imgs_df,
                   evaluator,
                   dataset_name,
                   run_metrics_list,
                   predictions_dfs_list,
                   rotate_aug=None):
        if rotate_aug is not None:
            dataset_name += "_rotate_aug{}".format(rotate_aug)

        print("Evaluating", dataset_name)
        eval_dataloader, eval_dataset = create_eval_dataloaders(
            de_imgs_df,
            label_encoder,
            torch_transform,
            'label',
            24,
            rotate_aug=rotate_aug)

        ref_dataloader, _ = create_eval_dataloaders(ref_only_df,
                                                    label_encoder,
                                                    torch_transform,
                                                    'label',
                                                    24,
                                                    rotate_aug=rotate_aug)
        dataloader = {'ref': ref_dataloader, 'eval': eval_dataloader}

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        print("Eval {}: {} images from {} total images".format(
            dataset_name, len(eval_dataset), len(de_imgs_df)))

        metrics_results, predictions = evaluator.eval_model(
            device, dataloader, do_pr_metrics=True, add_single_side_eval=True)

        plot_pr_curve(metrics_results['PR-curve'], dataset_name)

        run_metrics_list += log_metrics(metrics_results, dataset_name)

        predictions['dataset'] = dataset_name
        predictions['val_imgs_csv'] = os.path.split(args.val_imgs_csv)[1]
        predictions_dfs_list.append(predictions)

        return metrics_results, predictions

    test_model(test_df, logit_evaluator, 'holdout-logit', run_metrics_list,
               predictions_dfs_list)
    test_model(test_df, evaluator, 'holdout', run_metrics_list,
               predictions_dfs_list)

    run_metrics_df = pd.DataFrame(
        run_metrics_list, columns=['val_imgs_csv', 'dataset', 'name', 'value'])
    all_predictions_df = pd.concat(predictions_dfs_list, ignore_index=True)

    # make sure to save both
    for target_save_dir in [save_path, 'outputs']:
        print(f'saving predictions {target_save_dir}')
        # TODO: this csv can be large. Update the format for the numpy array of prediction scores.
        os.makedirs(target_save_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(
                target_save_dir, 'eval_predictions_{}'.format(
                    os.path.basename(args.val_imgs_csv))))

    torch.save(
        model.state_dict(),
        os.path.join(save_path,
                     '{}.pth'.format(os.path.basename(args.val_imgs_csv))))

    return run_metrics_df, all_predictions_df
Example #4
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

from azureml.core.run import Run  ##### Modified

# Get run when running in remote ##### Modified
if 'run' not in locals():  ##### Modified
    run = Run.get_context()  ##### Modified

FLAGS = None
batch_size = 100

#
# define functions for Estimator
#


def _my_input_fn(filepath, num_epochs):
    # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
    # label - digit (0, 1, ..., 9)
    data_queue = tf.train.string_input_producer(
        [filepath], num_epochs=num_epochs
    )  # data is repeated and it raises OutOfRange when data is over
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(
            parent_run_id))
        logger.info(
            "[ParentRunId:{}]: Start getting data using dataprep.".format(
                parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            def __init__(self):
                super().__init__()

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj:  # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in [
                        'X', 'X_valid', 'sample_weight', 'sample_weight_valid'
                ]:
                    fit_iteration_parameters_dict[
                        k] = dataprep_utilities.try_retrieve_pandas_dataframe(
                            dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[
                            k] = dataprep_utilities.try_retrieve_numpy_array(
                                dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else:  # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj[
                    'datastoreName']  # mandatory
                data_path = dataprep_json_obj['dataPath']  # mandatory
                label_column = dataprep_json_obj['label']  # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except:
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                       separator=separator,
                                       header=header,
                                       encoding=encoding,
                                       quoting=quoting,
                                       skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict[
                        'x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info(
                        "SDK version does not support column names extraction, fallback to old path"
                    )
                    fit_iteration_parameters_dict[
                        'X'] = dataprep_utilities.try_retrieve_pandas_dataframe(
                            X)

                try:
                    fit_iteration_parameters_dict[
                        'y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
                  format(parent_run_id, e.__class__, e))
            logger.error(
                "[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
                format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(
                    e):
                logger.debug(
                    "User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error while resolving Dataflows")
    def _set_problem_info_for_setup(fit_iteration_parameters_dict,
                                    automl_settings_obj, task_type, preprocess,
                                    enable_subsampling, num_iterations,
                                    logger):
        current_run = Run.get_submitted_run()
        logger.info(
            "Start to set problem info for the setup for run id {}.".format(
                current_run._run_id))
        logger.info("Setup experiment.")
        try:
            experiment = current_run.experiment
            parent_run_id = _get_parent_run_id(current_run._run_id)
            data_store = experiment.workspace.get_default_datastore()
            found_data_store = True
            logger.info("Using data store.")
        except Exception as e:
            logger.warning(
                "Getting data store, fallback to default {}".format(e))
            found_data_store = False

        logger.info("Caching supported {}.".format(sdk_has_cache_capability
                                                   and found_data_store))
        print("caching supported {}".format(sdk_has_cache_capability
                                            and found_data_store))
        if sdk_has_validate_data_dict:
            # The newest version of validate_training_data_dict should contain check_x_y
            logger.info("Using validate_training_data_dict now.")
            validate_training_data_dict(
                data_dict=fit_iteration_parameters_dict,
                automl_settings=automl_settings_obj)
        else:
            logger.info("Using validate_training_data now.")
            validate_training_data(
                X=fit_iteration_parameters_dict.get('X'),
                y=fit_iteration_parameters_dict.get('y'),
                X_valid=fit_iteration_parameters_dict.get('X_valid'),
                y_valid=fit_iteration_parameters_dict.get('y_valid'),
                sample_weight=fit_iteration_parameters_dict.get(
                    'sample_weight'),
                sample_weight_valid=fit_iteration_parameters_dict.get(
                    'sample_weight_valid'),
                cv_splits_indices=fit_iteration_parameters_dict.get(
                    'cv_splits_indices'),
                automl_settings=automl_settings_obj)
            check_x_y(fit_iteration_parameters_dict.get('X'),
                      fit_iteration_parameters_dict.get('y'),
                      automl_settings_obj)
        if sdk_has_cache_capability and found_data_store:
            data_splits_validated = True
            try:
                start = time.time()
                transformed_data_context = _get_transformed_data_context(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    X_valid=fit_iteration_parameters_dict.get('X_valid'),
                    y_valid=fit_iteration_parameters_dict.get('y_valid'),
                    sample_weight=fit_iteration_parameters_dict.get(
                        'sample_weight'),
                    sample_weight_valid=fit_iteration_parameters_dict.get(
                        'sample_weight_valid'),
                    x_raw_column_names=fit_iteration_parameters_dict.get(
                        'x_raw_column_names'),
                    cv_splits_indices=fit_iteration_parameters_dict.get(
                        'cv_splits_indices'),
                    automl_settings_obj=automl_settings_obj,
                    data_store=data_store,
                    run_target='remote',
                    parent_run_id=parent_run_id,
                    logger=logger)
                end = time.time()
                print("time taken for transform {}".format(end - start))
                logger.info("time taken for transform {}".format(end - start))
                if sdk_has_validate_data_splits:
                    try:
                        logger.info("Validating data splits now.")
                        _validate_data_splits(
                            X=transformed_data_context.X,
                            y=transformed_data_context.y,
                            X_valid=transformed_data_context.X_valid,
                            y_valid=transformed_data_context.y_valid,
                            cv_splits=transformed_data_context.cv_splits,
                            automl_settings=automl_settings_obj)
                        data_splits_validated = True
                    except Exception as data_split_exception:
                        data_splits_validated = False
                        logger.error("Meeting validation errors {}.".format(
                            data_split_exception))
                        log_traceback(data_split_exception, logger)
                        raise data_split_exception
                logger.info("Start setting problem info.")
                automl.set_problem_info(
                    transformed_data_context.X,
                    transformed_data_context.y,
                    automl_settings_obj.task_type,
                    current_run=current_run,
                    preprocess=automl_settings_obj.preprocess,
                    lag_length=automl_settings_obj.lag_length,
                    transformed_data_context=transformed_data_context,
                    enable_cache=automl_settings_obj.enable_cache,
                    subsampling=enable_subsampling)
            except Exception as e:
                if sdk_has_validate_data_splits and not data_splits_validated:
                    logger.error(
                        "sdk_has_validate_data_splits is True and data_splits_validated is False {}."
                        .format(e))
                    log_traceback(e, logger)
                    raise e
                else:
                    logger.warning(
                        "Setup failed, fall back to old model {}".format(e))
                    print("Setup failed, fall back to old model {}".format(e))
                    automl.set_problem_info(
                        X=fit_iteration_parameters_dict.get('X'),
                        y=fit_iteration_parameters_dict.get('y'),
                        task_type=task_type,
                        current_run=current_run,
                        preprocess=preprocess,
                        subsampling=enable_subsampling)
        else:
            logger.info("Start setting problem info using old model.")
            if sdk_has_validate_data_splits:
                _validate_data_splits(
                    X=fit_iteration_parameters_dict.get('X'),
                    y=fit_iteration_parameters_dict.get('y'),
                    X_valid=fit_iteration_parameters_dict.get('X_valid'),
                    y_valid=fit_iteration_parameters_dict.get('y_valid'),
                    cv_splits=fit_iteration_parameters_dict.get(
                        'cv_splits_indices'),
                    automl_settings=automl_settings_obj)
            automl.set_problem_info(X=fit_iteration_parameters_dict.get('X'),
                                    y=fit_iteration_parameters_dict.get('y'),
                                    task_type=task_type,
                                    current_run=current_run,
                                    preprocess=preprocess,
                                    subsampling=enable_subsampling)
x_train, x_test, y_train, y_test = train_test_split(x, y,  # assumes features x and labels y prepared upstream
                                                    test_size=0.2,
                                                    random_state=42)
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)
test_data = X_test[1, :]
test_data_array = test_data.toarray()
test_data_list = test_data_array.tolist()
print("len test_data_list", len(test_data_list))
print("len test_data_list 0", len(test_data_list[0]))
with open("test_data.txt", "w") as fp:
    json.dump(test_data_list, fp)

run = Run.get_context(allow_offline=True)


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--C",
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument("--max_iter",
                        type=int,

def load_and_clean(dataset_name):
    dataframe = load_data(dataset_name)
    dataframe = extract_features(dataframe)
    features, labels = clean_data(dataframe)

    return features, labels


try:
    # Get workspace if run locally
    ws = Workspace.from_config()
except:
    # Get workspace if run remotely
    ws = Run.get_context().experiment.workspace

# Run
run = Run.get_context()

# Load and clean data
features_train, labels_train = load_and_clean('energy-forecast-data-training')
features_val, labels_val = load_and_clean('energy-forecast-data-validation')


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators',
                        type=int,
                        default=100)  # assumed default
Example #9
def train(config,
          evaluate_only=False,
          outdir=".",
          detail=False,
          azureml=False):

    filename = config.model.filename
    categories_file = config.dataset.categories
    wav_directory = config.dataset.path
    batch_size = config.training.batch_size
    hidden_units = config.model.hidden_units
    architecture = config.model.architecture
    num_layers = config.model.num_layers
    use_gpu = config.training.use_gpu

    run = None

    if azureml:
        from azureml.core.run import Run
        run = Run.get_context()
        if run is None:
            print("### Run.get_context() returned None")
        else:
            print("### Running in Azure Context")

    valid_layers = [1, 2, 3]
    if num_layers not in valid_layers:
        raise Exception(
            "--num_layers can only be one of these values {}".format(
                valid_layers))

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    if not filename:
        filename = "{}{}KeywordSpotter.pt".format(architecture, hidden_units)
        config.model.filename = filename

    # load the featurized data
    if not os.path.isdir(wav_directory):
        print("### Error: please specify valid --dataset folder location: {}".
              format(wav_directory))
        sys.exit(1)

    if not categories_file:
        categories_file = os.path.join(wav_directory, "categories.txt")

    with open(categories_file, "r") as f:
        keywords = [x.strip() for x in f.readlines()]

    training_file = os.path.join(wav_directory, "training_list.npz")
    testing_file = os.path.join(wav_directory, "testing_list.npz")
    validation_file = os.path.join(wav_directory, "validation_list.npz")

    if not os.path.isfile(training_file):
        print("Missing file {}".format(training_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(validation_file):
        print("Missing file {}".format(validation_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(testing_file):
        print("Missing file {}".format(testing_file))
        print("Please run make_datasets.py")
        sys.exit(1)

    model = None

    device = torch.device("cpu")
    if use_gpu:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print("### CUDA not available!!")

    print("Loading {}...".format(testing_file))
    test_data = AudioDataset(testing_file, config.dataset, keywords)

    log = None
    if not evaluate_only:
        print("Loading {}...".format(training_file))
        training_data = AudioDataset(training_file,
                                     config.dataset,
                                     keywords,
                                     training=True)

        print("Loading {}...".format(validation_file))
        validation_data = AudioDataset(validation_file, config.dataset,
                                       keywords)

        if training_data.mean is not None:
            fname = os.path.join(outdir, "mean.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.mean)
            fname = os.path.join(outdir, "std.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.std)

            # use the training_data mean and std variation
            test_data.mean = training_data.mean
            test_data.std = training_data.std
            validation_data.mean = training_data.mean
            validation_data.std = training_data.std

        print("Training model {}".format(filename))
        model = create_model(config.model, training_data.input_size,
                             training_data.num_keywords)
        if device.type == 'cuda':
            model.cuda()  # move the processing to GPU

        start = time.time()
        log = model.fit(training_data, validation_data, config.training,
                        config.model.sparsify, device, detail, run)
        end = time.time()

        passed, total, rate = model.evaluate(training_data, batch_size, device)
        print("Training accuracy = {:.3f} %".format(rate * 100))

        torch.save(model.state_dict(), os.path.join(outdir, filename))

    print(
        "Evaluating {} keyword spotter using {} rows of featurized test audio..."
        .format(architecture, test_data.num_rows))
    if model is None:
        msg = "Loading trained model with input size {}, hidden units {} and num keywords {}"
        print(
            msg.format(test_data.input_size, hidden_units,
                       test_data.num_keywords))
        model = create_model(config.model, test_data.input_size,
                             test_data.num_keywords)
        model.load_dict(torch.load(filename))
        if model and device.type == 'cuda':
            model.cuda()  # move the processing to GPU

    results_file = os.path.join(outdir, "results.txt")
    passed, total, rate = model.evaluate(test_data, batch_size, device,
                                         results_file)
    print("Testing accuracy = {:.3f} %".format(rate * 100))

    if not evaluate_only:
        name = os.path.splitext(filename)[0] + ".onnx"
        print("saving onnx file: {}".format(name))
        model.export(os.path.join(outdir, name), device)

        config.dataset.sample_rate = test_data.sample_rate
        config.dataset.input_size = test_data.audio_size
        config.dataset.num_filters = test_data.input_size
        config.dataset.window_size = test_data.window_size
        config.dataset.shift = test_data.shift

        logdata = {
            "accuracy_val": rate,
            "training_time": end - start,
            "log": log
        }
        d = TrainingConfig.to_dict(config)
        logdata.update(d)

        logname = os.path.join(outdir, "train_results.json")
        save_json(logdata, logname)

    return rate, log
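
# save_json is assumed to be a small helper along these lines (sketch):
#   def save_json(obj, filename):
#       with open(filename, "w") as f:
#           json.dump(obj, f, indent=2)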
def main(
    root_dir: str,
    input_dir: str,
    output_dir: str,
    stop_words: List,
    timestamp_interval: int,
    force: bool,
) -> None:
    """
    Main function for receiving args and passing them through to the clapperboard selection function

    Parameters
    ----------
    root_dir: str
        Root datastore being used
    input_dir: str
        Directory containing input data
    output_dir: str
        Path to save outputs to
    stop_words: List
        Stop words that will not be counted when computing
        character-level frequency
    timestamp_interval: int
        Interval used to split clapperboards into separate events
    force: bool
        Flag that specifies whether current run
        should overwrite outputs from previous run
    """
    log.info("Clapperboard Selection Step")

    # Resolve paths
    input_dir = join(root_dir, input_dir)
    output_dir = join(root_dir, output_dir)

    run = Run.get_context()

    # convert tuple to list type
    stop_words = list(stop_words)

    log.info("Checking if output from previous run exists...")
    if os.path.exists(output_dir) and not force:
        log.info("Output path already exists, please use --force to overwrite the results. Skipping...")
        return

    # set ocr recognizer credentials
    ocr_credentials = {"key": run.get_secret("ocrkey"),
                       "endpoint": run.get_secret("ocrendpoint")}

    log.info("Beginning process to select best clapperboards..")

    # Create directory to store results in
    os.makedirs(output_dir, exist_ok=True)

    log.info("Running Clapperboard Selection Step")
    # Build dataframe
    image_obj = []
    for file in os.listdir(input_dir):
        if file.endswith(".jpeg"):
            image = file
            timestamp = int(Path(file).stem.split("=")[-1])
            image_obj.append(dict(image=image, timestamp=timestamp))

    output_file = os.path.join(output_dir, "selected_clapperboards.csv")

    image_df = pd.DataFrame(image_obj)
    image_df = image_df.sort_values(by=["timestamp"])
    image_df["image"] = image_df["image"].apply(
            lambda x: join(input_dir, x)
        )

    # feed the image dataframe into the function to select the best clapperboards
    _ = get_best_clapperboard(
        image_df=image_df,
        ocr_credentials=ocr_credentials,
        output_file=output_file,
        tolerance=timestamp_interval,
        stop_words=stop_words,
    )

    log.info("Finished Running Clapperboard Selection Step")
Example #11
def init():
    global current_run
    current_run = Run.get_context()
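

# A batch-scoring entry script like this typically pairs init() with a run(mini_batch)
# function (sketch, assuming the ParallelRunStep contract of returning one result per input):
def run(mini_batch):
    results = []
    for file_path in mini_batch:
        # score the file here, using the run handle captured in init() for logging
        current_run.log("processed_file", file_path)
        results.append(file_path)
    return results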
Example #12
def main():
    num_classes = 3

    # create checkpoint dir
    out_dir = './outputs' if args.out_dir is None else args.out_dir
    checkpoint_dir = os.path.join(out_dir, experiment_name, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # write logs to ./logs, which AML uploads to Artifact Service and makes available to a TensorBoard instance.
    # also log some metrics through AML's Run object
    run = Run.get_context()
    logger_train = Logger('train', './logs', run)
    logger_val = Logger('val', './logs', run)
    log_sample_img_gt(sample_images_train, sample_images_val, logger_train,
                      logger_val)
    logging.info('Logged ground truth image samples')

    # larger model
    if model_choice == 'unet':
        model = Unet(feature_scale=feature_scale,
                     n_classes=num_classes,
                     is_deconv=True,
                     in_channels=3,
                     is_batchnorm=True)
    # year 2 best solution XD_XD's model, as the baseline model
    elif model_choice == 'unet_baseline':
        model = UnetBaseline(feature_scale=feature_scale,
                             n_classes=num_classes,
                             is_deconv=True,
                             in_channels=3,
                             is_batchnorm=True)
    else:
        sys.exit(
            'Invalid model_choice {}, choose unet_baseline or unet'.format(
                model_choice))

    model = model.to(device=device,
                     dtype=dtype)  # move the model parameters to CPU/GPU

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device,
                                                            dtype=dtype)

    # can also use Nesterov momentum in optim.SGD
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate,
    #                     momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0

    if os.path.isfile(starting_checkpoint_path):
        logging.info(
            'Loading checkpoint from {0}'.format(starting_checkpoint_path))
        checkpoint = torch.load(starting_checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info(
            'No valid checkpoint is provided. Start to train from scratch...')
        model.apply(weights_init)

    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(
            val_loss, val_acc))
        return

    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, num_epochs):
        logging.info('Epoch {} of {}'.format(epoch, num_epochs))

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step,
                     logger_train)

        # evaluate on val set
        logging.info(
            'Evaluating model on the val set at the end of epoch {}...'.format(
                epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        logging.info('\nEpoch {}, step {}, val loss is {}, val accuracy is {}\n'.format(
            epoch, step, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # TODO log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(
            checkpoint_dir, 'checkpoint_epoch{}_{}.pth.tar'.format(
                epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        logging.info(
            'Saving checkpoint to {}. Is it the highest accuracy checkpoint so far: {}'
            .format(checkpoint_path, str(is_best)))
        save_checkpoint(
            {
                'epoch':
                epoch + 1,  # saved checkpoints are numbered starting from 1
                'arch': model_choice,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_acc': best_acc
            },
            is_best,
            checkpoint_path,
            checkpoint_dir)
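
# The save_checkpoint helper used above is assumed to follow the common PyTorch pattern
# (sketch; assumes torch, shutil and os are imported): save the state dict every epoch and
# copy the best-performing checkpoint aside.
def save_checkpoint(state, is_best, checkpoint_path, checkpoint_dir):
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path,
                        os.path.join(checkpoint_dir, 'model_best.pth.tar'))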
def main():

    # Prepare parser for parameters to tune
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--learning_rate',
        type=float,
        default=0.1,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument(
        '--n_estimators',
        type=int,
        default=20,
        help=
        "Maximum number of iterations to converge, similar to max_iter in Logistic Regression"
    )
    parser.add_argument(
        '--max_features',
        type=int,
        default=5,
        help=
        "Number of features to consider in one pass, i.e. how large could the tree grow"
    )
    parser.add_argument(
        '--max_depth',
        type=int,
        default=2,
        help="Maximum number of splits, i.e. how bushy could the tree grow")
    args = parser.parse_args()

    # Prepare the dataset to match the expected format
    path = 'https://raw.githubusercontent.com/allaccountstaken/automl_v_hyperdrive/main/data/camel_data_after2010Q3.csv'
    ds = load_data(path)
    X, y = clean_data(ds)
    #Consider for internal datasets:
    #from azureml.data.dataset_factory import TabularDatasetFactory
    #ds = TabularDatasetFactory.from_delimited_files(path)

    # Perform train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y.ravel(),
                                                        train_size=0.7,
                                                        random_state=123)

    # Additionally consider scaling (this may not be important for tree-based models)
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Prepare the Azure run context to log tuning progress
    run = Run.get_context()
    run.log("Learning rate:", np.float(args.learning_rate))
    run.log("Number of estimators:", np.int(args.n_estimators))
    run.log("Number of features:", np.int(args.max_features))
    run.log("Max tree depth:", np.int(args.max_depth))

    # Instantiate and fit a GBM classifier using the sklearn library
    model = GradientBoostingClassifier(learning_rate=args.learning_rate,
                                       n_estimators=args.n_estimators,
                                       max_features=args.max_features,
                                       max_depth=args.max_depth,
                                       random_state=123)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Report performance metrics of the trained model using testing subset
    recall = recall_score(y_test, y_pred, average='binary')
    run.log("Recall", np.round(np.float(recall), 5))