Example #1
# Imports needed by this excerpt (they are omitted in the original snippet).
import glob
import logging
import os
import shutil
from pathlib import Path

from azureml.core import Run

from config import CONFIG
# from config import CONFIG_DEV as CONFIG  # Only for development.
from constants import REPO_DIR

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s - %(pathname)s: line %(lineno)d'
    ))
logger.addHandler(handler)

# Get the current run.
run = Run.get_context()
offline_run = run.id.startswith("OfflineRun")

if offline_run:
    utils_dir_path = REPO_DIR / "cgmml/common/model_utils"
    utils_paths = glob.glob(os.path.join(utils_dir_path, "*.py"))
    temp_model_util_dir = Path(__file__).parent / "tmp_model_util"

    # Remove old temp_path
    if os.path.exists(temp_model_util_dir):
        shutil.rmtree(temp_model_util_dir)

    # Copy
    os.mkdir(temp_model_util_dir)
    os.system(f'touch {temp_model_util_dir}/__init__.py')
    for p in utils_paths:
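        # The excerpt is truncated at this loop; a minimal completion, assuming the
        # intent is simply to copy each helper module into the temporary package:
        shutil.copy(p, temp_model_util_dir)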
Example #2
def main():
    print("Running train.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )

    parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step")
    )

    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)

    model_name = args.model_name
    build_id = args.build_id
    step_output_path = args.step_output

    print("Getting training parameters")

    alpha = 0.5

    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()

    # Get the dataset
    dataset = run.input_datasets['training_data']
    if (dataset):
        df = dataset.to_pandas_dataframe()
        X = df.values
        y = df.Y
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    data = {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}

    reg = train_model(run, data, alpha)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=reg, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=reg, filename=output_path)

    # Add properties to identify this specific training run
    run.parent.tag("BuildId", value=build_id)
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if (builduri_base is not None):
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #3
    def _get_data_from_dataprep(dataprep_json, automl_settings_obj, logger):
        current_run = Run.get_submitted_run()
        parent_run_id = _get_parent_run_id(current_run._run_id)
        print("[ParentRunId:{}]: Start getting data using dataprep.".format(
            parent_run_id))
        logger.info(
            "[ParentRunId:{}]: Start getting data using dataprep.".format(
                parent_run_id))
        try:
            import azureml.train.automl._dataprep_utilities as dataprep_utilities
        except Exception as e:
            e.error_type = ErrorTypes.Unclassified
            log_traceback(e, logger)
            logger.error(e)
            raise e

        fit_iteration_parameters_dict = dict()

        class RetrieveNumpyArrayError(Exception):
            def __init__(self):
                super().__init__()

        try:
            print("Resolving Dataflows...")
            logger.info("Resolving Dataflows...")
            dataprep_json_obj = json.loads(dataprep_json)
            if 'activities' in dataprep_json_obj:  # json is serialized dataflows
                dataflow_dict = dataprep_utilities.load_dataflows_from_json(
                    dataprep_json)
                for k in [
                        'X', 'X_valid', 'sample_weight', 'sample_weight_valid'
                ]:
                    fit_iteration_parameters_dict[
                        k] = dataprep_utilities.try_retrieve_pandas_dataframe(
                            dataflow_dict.get(k))
                for k in ['y', 'y_valid']:
                    try:
                        fit_iteration_parameters_dict[
                            k] = dataprep_utilities.try_retrieve_numpy_array(
                                dataflow_dict.get(k))
                    except IndexError:
                        raise RetrieveNumpyArrayError()

                cv_splits_dataflows = []
                i = 0
                while 'cv_splits_indices_{0}'.format(i) in dataflow_dict:
                    cv_splits_dataflows.append(
                        dataflow_dict['cv_splits_indices_{0}'.format(i)])
                    i = i + 1
                fit_iteration_parameters_dict['cv_splits_indices'] = None if len(cv_splits_dataflows) == 0 \
                    else dataprep_utilities.try_resolve_cv_splits_indices(cv_splits_dataflows)
            else:  # json is dataprep options
                print('Creating Dataflow from options...\r\nOptions:')
                logger.info('Creating Dataflow from options...')
                print(dataprep_json_obj)
                datastore_name = dataprep_json_obj[
                    'datastoreName']  # mandatory
                data_path = dataprep_json_obj['dataPath']  # mandatory
                label_column = dataprep_json_obj['label']  # mandatory
                separator = dataprep_json_obj.get('columnSeparator', ',')
                header = dataprep_json_obj.get('promoteHeader', True)
                encoding = dataprep_json_obj.get('encoding', None)
                quoting = dataprep_json_obj.get('ignoreNewlineInQuotes', False)
                skip_rows = dataprep_json_obj.get('skipRows', 0)
                feature_columns = dataprep_json_obj.get('features', [])

                from azureml.core import Datastore
                import azureml.dataprep as dprep
                if header:
                    header = dprep.PromoteHeadersMode.CONSTANTGROUPED
                else:
                    header = dprep.PromoteHeadersMode.NONE
                try:
                    encoding = dprep.FileEncoding[encoding]
                except:
                    encoding = dprep.FileEncoding.UTF8

                ws = Run.get_context().experiment.workspace
                datastore = Datastore(ws, datastore_name)
                dflow = dprep.read_csv(path=datastore.path(data_path),
                                       separator=separator,
                                       header=header,
                                       encoding=encoding,
                                       quoting=quoting,
                                       skip_rows=skip_rows)

                if len(feature_columns) == 0:
                    X = dflow.drop_columns(label_column)
                else:
                    X = dflow.keep_columns(feature_columns)

                print('Inferring types for feature columns...')
                logger.info('Inferring types for feature columns...')
                sct = X.builders.set_column_types()
                sct.learn()
                sct.ambiguous_date_conversions_drop()
                X = sct.to_dataflow()

                y = dflow.keep_columns(label_column)
                if automl_settings_obj.task_type.lower() == 'regression':
                    y = y.to_number(label_column)

                print('X:')
                print(X)
                logger.info('X:')
                logger.info(X)

                print('y:')
                print(y)
                logger.info('y:')
                logger.info(y)

                try:
                    from azureml.train.automl._dataprep_utilities import try_retrieve_pandas_dataframe_adb
                    _X = try_retrieve_pandas_dataframe_adb(X)
                    fit_iteration_parameters_dict['X'] = _X.values
                    fit_iteration_parameters_dict[
                        'x_raw_column_names'] = _X.columns.values
                except ImportError:
                    logger.info(
                        "SDK version does not support column names extraction, fallback to old path"
                    )
                    fit_iteration_parameters_dict[
                        'X'] = dataprep_utilities.try_retrieve_pandas_dataframe(
                            X)

                try:
                    fit_iteration_parameters_dict[
                        'y'] = dataprep_utilities.try_retrieve_numpy_array(y)
                except IndexError:
                    raise RetrieveNumpyArrayError()

            logger.info("Finish getting data using dataprep.")
            return fit_iteration_parameters_dict
        except Exception as e:
            print("[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
                  format(parent_run_id, e.__class__, e))
            logger.error(
                "[ParentRunId:{0}]: Error from resolving Dataflows: {1} {2}".
                format(parent_run_id, e.__class__, e))
            if isinstance(e, RetrieveNumpyArrayError):
                logger.debug("Label column (y) does not exist in user's data.")
                e.error_type = ErrorTypes.User
            elif "The provided path is not valid." in str(e):
                logger.debug("User's data is not accessible from remote run.")
                e.error_type = ErrorTypes.User
            elif "Required secrets are missing. Please call use_secrets to register the missing secrets." in str(
                    e):
                logger.debug(
                    "User should use Datastore to data that requires secrets.")
                e.error_type = ErrorTypes.User
            else:
                e.error_type = ErrorTypes.Client
            log_traceback(e, logger)
            raise RuntimeError("Error during extracting Dataflows")
Example #4
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="porto_seguro_safe_driver_model.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"),
                        default="outputs")

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"),
                        default=1)

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['train_dataset'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    train_df = dataset.to_pandas_dataframe()
    train_data, valid_data = split_data(train_df)

    # Train the model
    model = train_model(train_data, valid_data, train_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, train_data, valid_data)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)
    joblib.dump(value=model, filename=model_output_path)

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #5
def main():
    num_classes = 3

    # create checkpoint dir
    out_dir = './outputs' if args.out_dir is None else args.out_dir
    checkpoint_dir = os.path.join(out_dir, experiment_name, 'checkpoints')
    os.makedirs(checkpoint_dir, exist_ok=True)

    # write logs to ./logs, which AML uploads to Artifact Service and makes available to a TensorBoard instance.
    # also log some metrics through AML's Run object
    run = Run.get_context()
    logger_train = Logger('train', './logs', run)
    logger_val = Logger('val', './logs', run)
    log_sample_img_gt(sample_images_train, sample_images_val, logger_train, logger_val)
    logging.info('Logged ground truth image samples')

    # larger model
    if model_choice == 'unet':
        model = Unet(feature_scale=feature_scale, n_classes=num_classes, is_deconv=True,
                     in_channels=3, is_batchnorm=True)
    # year 2 best solution XD_XD's model, as the baseline model
    elif model_choice == 'unet_baseline':
        model = UnetBaseline(feature_scale=feature_scale, n_classes=num_classes, is_deconv=True,
                             in_channels=3, is_batchnorm=True)
    else:
        sys.exit('Invalid model_choice {}, choose unet_baseline or unet'.format(model_choice))

    model = model.to(device=device, dtype=dtype)  # move the model parameters to CPU/GPU

    criterion = nn.CrossEntropyLoss(weight=loss_weights).to(device=device, dtype=dtype)

    # can also use Nesterov momentum in optim.SGD
    # optimizer = optim.SGD(model.parameters(), lr=learning_rate,
    #                     momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # resume from a checkpoint if provided
    starting_epoch = 0
    best_acc = 0.0

    if os.path.isfile(starting_checkpoint_path):
        logging.info('Loading checkpoint from {0}'.format(starting_checkpoint_path))
        checkpoint = torch.load(starting_checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        starting_epoch = checkpoint['epoch']
        best_acc = checkpoint.get('best_acc', 0.0)
    else:
        logging.info('No valid checkpoint is provided. Start to train from scratch...')
        model.apply(weights_init)

    if evaluate_only:
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        print('Evaluated on val set, loss is {}, accuracy is {}'.format(val_loss, val_acc))
        return

    step = starting_epoch * len(dset_train)

    for epoch in range(starting_epoch, num_epochs):
        logging.info('Epoch {} of {}'.format(epoch, num_epochs))

        # train for one epoch
        step = train(loader_train, model, criterion, optimizer, epoch, step, logger_train)

        # evaluate on val set
        logging.info('Evaluating model on the val set at the end of epoch {}...'.format(epoch))
        val_loss, val_acc = evaluate(loader_val, model, criterion)
        logging.info('\nEpoch {}, step {}, val loss is {}, val accuracy is {}\n'.format(epoch, step, val_loss, val_acc))
        logger_val.scalar_summary('val_loss', val_loss, step + 1)
        logger_val.scalar_summary('val_acc', val_acc, step + 1)
        # TODO log the val images too

        # record the best accuracy; save checkpoint for every epoch
        is_best = val_acc > best_acc
        best_acc = max(val_acc, best_acc)

        checkpoint_path = os.path.join(checkpoint_dir,
                                       'checkpoint_epoch{}_{}.pth.tar'.format(epoch, strftime("%Y-%m-%d-%H-%M-%S", localtime())))
        logging.info(
            'Saving to checkpoint file at {}. Is it the highest accuracy checkpoint so far: {}'.format(
                checkpoint_path, str(is_best)))
        save_checkpoint({
            'epoch': epoch + 1,  # saved checkpoints are numbered starting from 1
            'arch': model_choice,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'best_acc': best_acc
        }, is_best, checkpoint_path, checkpoint_dir)
Example #6
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="COVID19Articles_model_github.pkl",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used while the pipeline creation"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    datastore_name = os.environ.get("DATASTORE_NAME")
    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    # Get the dataset
    if (dataset_name):
        if (data_file_path == ""):
            if (dataset_name
                    in Dataset.get_all(run.experiment.workspace).keys()):
                dataset = Dataset.get_by_name(run.experiment.workspace,
                                              dataset_name,
                                              version=dataset_version)
            else:
                create_sample_data_csv(run.experiment.workspace,
                                       datastore_name)
                dataset = register_dataset(run.experiment.workspace,
                                           dataset_name, datastore_name)
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       datastore_name, data_file_path)
    else:
        if (data_file_path == ""):
            data_file_path = "COVID19Articles.csv"
            create_sample_data_csv(run.experiment.workspace, datastore_name)
        dataset_name = "COVID19Articles_Training_githubactions"
        dataset = register_dataset(run.experiment.workspace, dataset_name,
                                   datastore_name, data_file_path)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    data = split_data(df)

    class_args = {"max_depth": 5}
    # Train the model
    model = train_model(data, class_args)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, data)
    for (k, v) in metrics.items():
        run.log(k, v)

    # files saved in the "outputs" folder are automatically uploaded into run history
    model_file_name = "COVID19Articles_model.pkl"
    joblib.dump(model, os.path.join('outputs', model_file_name))
    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #7
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = {}) -> None:
    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    data_path = RichPath.create(args.data_path, args.azure_info)
    #second path
    data_path_2 = RichPath.create(
        os.path.split(args.data_path)[0] + '/test2', args.azure_info)

    data_path_3 = RichPath.create(
        os.path.split(args.data_path)[0] + '/test3', args.azure_info)

    ##new_inputs

    try:
        dataset, model = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=
            hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
        #second
        dataset2, model_2 = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path_2,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=
            hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )

        ##new_inputs
        dataset3, model_3 = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path_3,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=
            hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
    except ValueError as err:
        print(err.args)

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None
    if not args.load_trained_model:
        trained_model_path = train(
            model,
            dataset,
            dataset2,
            dataset3,
            log_fun=log,
            run_id=run_id,
            max_epochs=args.max_epochs,
            patience=args.patience,
            save_dir=args.save_dir,
            quiet=args.quiet,
            aml_run=aml_run,
        )
    else:
        trained_model_path = args.load_trained_model

    #new_inputs
    if args.run_test:
        data_path = RichPath.create(args.data_path, args.azure_info)
        data_path_2 = RichPath.create(
            os.path.split(args.data_path)[0] + '/test2', args.azure_info)
        data_path_3 = RichPath.create(
            os.path.split(args.data_path)[0] + '/test3', args.azure_info)
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        dataset2.load_data(data_path_2, {DataFold.TEST})
        dataset3.load_data(data_path_3, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)
        test_data_1 = dataset.get_tensorflow_dataset(DataFold.TEST)
        test_data_2 = dataset2.get_tensorflow_dataset(DataFold.TEST)
        test_data_3 = dataset3.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch_new(test_data_1,
                                                     test_data_2,
                                                     test_data_3,
                                                     training=False,
                                                     quiet=args.quiet)
        test_metric, test_metric_string = model.compute_epoch_metrics(
            test_results)
        log(test_metric_string)
        nni.report_final_result(float(test_metric_string.split(" ")[-1]))
        if aml_run is not None:
            aml_run.log("task_test_metric", float(test_metric))
Example #8
def main(unused_argv):
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config or tf_config == "":
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(
            cluster_spec, job_name=job_name, task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
        tf.train.replica_device_setter(
            worker_device=worker_device,
            ps_device="/job:ps/cpu:0",
            cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(
            tf.truncated_normal(
                [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
                stddev=1.0 / IMAGE_PIXELS),
            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(
            tf.truncated_normal(
                [FLAGS.hidden_units, 10],
                stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
            name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                recovery_wait_secs=1,
                global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + task_index
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url, config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
Example #9
def main(unused_argv):
    data_root = os.path.join("outputs", "MNIST")
    mnist = None
    tf_config = os.environ.get("TF_CONFIG")
    if not tf_config or tf_config == "":
        raise ValueError("TF_CONFIG not found.")
    tf_config_json = json.loads(tf_config)
    cluster = tf_config_json.get('cluster')
    job_name = tf_config_json.get('task', {}).get('type')
    task_index = tf_config_json.get('task', {}).get('index')
    job_name = "worker" if job_name == "master" else job_name
    sentinel_path = os.path.join(data_root, "complete.txt")
    if job_name == "worker" and task_index == 0:
        mnist = input_data.read_data_sets(data_root, one_hot=True)
        with open(sentinel_path, 'w+') as f:
            f.write("download complete")
    else:
        while not os.path.exists(sentinel_path):
            time.sleep(0.01)
        mnist = input_data.read_data_sets(data_root, one_hot=True)

    if FLAGS.download_only:
        sys.exit(0)

    print("job name = %s" % job_name)
    print("task index = %d" % task_index)
    print("number of GPUs = %d" % FLAGS.num_gpus)

    # Construct the cluster and start the server
    cluster_spec = tf.train.ClusterSpec(cluster)

    # Get the number of workers.
    num_workers = len(cluster_spec.task_indices("worker"))

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster_spec,
                                 job_name=job_name,
                                 task_index=task_index)
        if job_name == "ps":
            server.join()

    is_chief = (task_index == 0)
    if FLAGS.num_gpus > 0:
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(worker_device=worker_device,
                                           ps_device="/job:ps/cpu:0",
                                           cluster=cluster)):
        global_step = tf.Variable(0, name="global_step", trainable=False)

        # Variables of the hidden layer
        hid_w = tf.Variable(tf.truncated_normal(
            [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
            stddev=1.0 / IMAGE_PIXELS),
                            name="hid_w")
        hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

        # Variables of the softmax layer
        sm_w = tf.Variable(tf.truncated_normal([FLAGS.hidden_units, 10],
                                               stddev=1.0 /
                                               math.sqrt(FLAGS.hidden_units)),
                           name="sm_w")
        sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

        # Ops: located on the worker specified with task_index
        x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
        y_ = tf.placeholder(tf.float32, [None, 10])

        hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
        hid = tf.nn.relu(hid_lin)

        y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
        cross_entropy = -tf.reduce_sum(
            y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

        opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizer(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="mnist_sync_replicas")

        train_step = opt.minimize(cross_entropy, global_step=global_step)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp()

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=False,
            device_filters=["/job:ps",
                            "/job:worker/task:%d" % task_index])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + task_index
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        while True:
            # Training feed
            batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
            train_feed = {x: batch_xs, y_: batch_ys}

            _, step = sess.run([train_step, global_step], feed_dict=train_feed)
            local_step += 1

            now = time.time()
            print("%f: Worker %d: training step %d done (global step: %d)" %
                  (now, task_index, local_step, step))

            if step >= FLAGS.train_steps:
                break

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)

        # Validation feed
        val_feed = {x: mnist.validation.images, y_: mnist.validation.labels}
        val_xent = sess.run(cross_entropy, feed_dict=val_feed)
        print("After %d training step(s), validation cross entropy = %g" %
              (FLAGS.train_steps, val_xent))
        if job_name == "worker" and task_index == 0:
            run = Run.get_context()
            run.log("CrossEntropy", val_xent)
Example #10
def main(root_dir: str, model_info_dir: str, val_dir: str, output_dir: str,
         labels: str) -> None:
    """
    Main function for receiving args, and passing them through to form recognizer postprocessing function

    Parameters
    ----------
    root_dir: str
        Root datastore being used
    model_info_dir: str
        Directory containing trained custom model information
    val_dir: str
        Directory containing test images
    output_dir: str
        Path to save outputs to
    labels: str
        Labels or fields to extract results from
    """
    log.info("Evaluation step")

    # get context of current run
    run = Run.get_context()

    # set form recognizer credentials
    form_credentials = {
        "key": run.get_secret("formkey"),
        "endpoint": run.get_secret("formendpoint")
    }

    # process labels string to array
    labels = [label.strip() for label in labels.split(",")]

    model_info_dir = join(root_dir, model_info_dir)
    val_dir = join(root_dir, val_dir)
    output_dir = join(root_dir, output_dir)

    # read in model information
    log.info("Compile model information")
    model_fname = "model.json"
    with open(join(model_info_dir, model_fname), "r") as model_info_file:
        model_info = json.load(model_info_file)
        log.info(model_info)

    # Processing image files
    images = []
    for file in os.listdir(val_dir):
        images.append({"image": file})

    # convert array of dict objects to a pandas dataframe
    image_df = pd.DataFrame(images)
    # use lambda function to apply full path to each image file
    image_df["image"] = image_df["image"].apply(lambda x: join(val_dir, x))

    log.info("Evaluate Form Recognizer Model")
    detection_rates = get_detection_rates(form_credentials=form_credentials,
                                          model_id=model_info["modelId"],
                                          image_df=image_df,
                                          output_dir=output_dir,
                                          labels=labels)

    # Log metrics
    for metric_info in detection_rates:
        # extract and log detection rate for each object per video
        scene_rate = metric_info["scene_detection_rate"]
        take_rate = metric_info["take_detection_rate"]
        run.parent.log(name="scene_detection_rate", value=scene_rate)
        run.parent.log(name="take_detection_rate", value=take_rate)

    log.info("Finished model evaluation")
Example #11
def main():
    print("Running train.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--build_id",
        type=str,
        help="The build ID of the build triggering this pipeline run",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sklearn_regression_model.pkl",
    )

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset with the training data"))
    args = parser.parse_args()

    print("Argument [build_id]: %s" % args.build_id)
    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    build_id = args.build_id
    dataset_name = args.dataset_name

    print("Getting training parameters")

    with open("config.json") as f:
        pars = json.load(f)
    try:
        alpha = pars["training"]["alpha"]
    except KeyError:
        alpha = 0.5

    print("Parameter alpha: %s" % alpha)

    run = Run.get_context()
    ws = run.experiment.workspace

    if (dataset_name):
        dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
        df = dataset.to_pandas_dataframe()
        X = df.values
        y = df.Y
    else:
        X, y = load_diabetes(return_X_y=True)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    data = {
        "train": {
            "X": X_train,
            "y": y_train
        },
        "test": {
            "X": X_test,
            "y": y_test
        }
    }

    reg = train_model(run, data, alpha)

    joblib.dump(value=reg, filename=model_name)

    # upload model file explicitly into artifacts for parent run
    run.parent.upload_file(name="./outputs/" + model_name,
                           path_or_stream=model_name)
    print("Uploaded the model {} to experiment {}".format(
        model_name, run.experiment.name))
    dirpath = os.getcwd()
    print(dirpath)
    print("Following files are uploaded ")
    print(run.parent.get_file_names())

    run.parent.tag("BuildId", value=build_id)

    # Add properties to identify this specific training run
    run.tag("BuildId", value=build_id)
    run.tag("run_type", value="train")
    builduri_base = os.environ.get("BUILDURI_BASE")
    if (builduri_base is not None):
        build_uri = builduri_base + build_id
        run.tag("BuildUri", value=build_uri)
        run.parent.tag("BuildUri", value=build_uri)
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #12
from __future__ import division
from __future__ import print_function

import sys
import os
import shutil
import argparse
import math

import tensorflow as tf

from azureml.core.run import Run  ##### Modified

# Get run when running in remote ##### Modified
if 'run' not in locals():  ##### Modified
    run = Run.get_context()  ##### Modified

FLAGS = None
batch_size = 100

#
# define functions for Estimator
#


def _my_input_fn(filepath, num_epochs):
    # image - 784 (=28 x 28) elements of grey-scaled integer value [0, 1]
    # label - digit (0, 1, ..., 9)
    data_queue = tf.train.string_input_producer(
        [filepath], num_epochs=num_epochs
    )  # data is repeated and it raises OutOfRange when data is over
Example #13
def run(args):
    if args.supress_warnings:
        warnings.simplefilter("ignore")

    def adjust_path(p):
        return os.path.join(args.data_root_dir, p)

    args.label_encoder = adjust_path(args.label_encoder)
    args.all_imgs_csv = adjust_path(args.all_imgs_csv)
    args.val_imgs_csv = adjust_path(args.val_imgs_csv)
    args.test_imgs_csv = adjust_path(args.test_imgs_csv)
    args.results_dir = adjust_path(args.results_dir)

    print(args)

    from multihead_trainer import train
    from multihead_trainer import torch_transform

    # TODO: consolidate logid
    def build_logid_string(args, add_timestamp=True):
        param_str = "lr{}_dr{}_lrpatience{}_lrfactor{}_{}".format(
            args.init_lr, args.dropout, args.lr_patience, args.lr_factor,
            args.appearance_network)

        if add_timestamp:
            param_str += "_" + datetime.datetime.now().strftime("%Y%m%d%H%M")

        return param_str

    param_str = build_logid_string(args)

    # Azure ML
    from azureml.core.run import Run
    run = Run.get_context()

    # log arguments if it's not called by train_cv
    if not hasattr(args, 'folds_csv_dir'):
        for k, v in vars(args).items():
            run.tag(k, str(v))

    save_path = os.path.join(args.results_dir, param_str)
    os.makedirs(save_path, exist_ok=True)
    print("save_path", save_path)

    logger.info(
        f"cuda.is_available={torch.cuda.is_available()}, n_gpu={torch.cuda.device_count()}"
    )

    # encode the classes
    from sklearn.preprocessing import LabelEncoder

    import pickle
    if not os.path.exists(args.label_encoder):
        logger.warning(f"Fitting a new label encoder at {args.label_encoder}")

        all_imgs_df = pd.read_csv(args.all_imgs_csv)

        label_encoder = LabelEncoder()
        label_encoder.fit(all_imgs_df['label'])

        pickle.dump(label_encoder, open(args.label_encoder, "wb"))

    else:
        logger.info(f"Loading label encoder: {args.label_encoder}")

        with open(args.label_encoder, 'rb') as pickle_file:
            label_encoder = pickle.load(pickle_file)

    logger.info(f"label_encoder.classes_={label_encoder.classes_}")
    logger.info("The label encoder has {} classes.".format(
        len(label_encoder.classes_)))

    # Load image list
    all_images_df = pd.read_csv(args.all_imgs_csv)
    val_df = pd.read_csv(args.val_imgs_csv)
    test_df = pd.read_csv(args.test_imgs_csv)

    for df in [all_images_df, val_df, test_df]:
        df['image_path'] = df['image_path'].apply(
            lambda x: os.path.join(args.data_root_dir, args.img_dir, x))

    val_test_image_paths = list(val_df['image_path'].values) + list(
        test_df['image_path'].values)
    train_df = all_images_df[~all_images_df['image_path'].
                             isin(val_test_image_paths)]

    ref_only_df = train_df[train_df['is_ref']]
    cons_train_df = train_df[train_df['is_ref'] == False]
    cons_val_df = val_df

    print("all_images", len(all_images_df), "train", len(train_df), "val",
          len(val_df), "test", len(test_df))
    run.log("all_images_size", len(all_images_df))
    run.log("train_size", len(train_df))
    run.log("val_size", len(val_df))
    run.log("test_size", len(test_df))

    print("ref_only_df", len(ref_only_df), "cons_train_df", len(cons_train_df),
          "cons_val_df", len(cons_val_df))

    import classif_utils
    classif_utils.ClassificationDataset.set_datadir(
        os.path.join(args.data_root_dir, args.img_dir))

    def plot_pr_curve(plt, dataset_name):
        run.log_image(name='{}_{}_{}'.format(
            dataset_name,
            datetime.datetime.now().strftime("%H:%M:%S"), 'PR-curve'),
                      plot=plt)
        plt.close()

    def log_metrics(metrics_results, dataset_name):
        from metrics import create_prec_inds_str
        import matplotlib
        matplotlib.use('Agg')  #backend that doesn't display to the user
        import matplotlib.pyplot as plt
        import matplotlib.image as mpimg

        run_metrics = []

        for k, v in metrics_results.items():
            if ('p_indices' in k) and not ('sanity' in dataset_name):
                pind_str = create_prec_inds_str(v, label_encoder)

                run.log("{}_{}".format(dataset_name, k), pind_str)
                run_metrics.append([
                    os.path.split(args.val_imgs_csv)[1], dataset_name, k,
                    pind_str
                ])

            elif isinstance(v, (int, float)):
                run.log("{}_{}".format(dataset_name, k), v)
                run_metrics.append(
                    [os.path.split(args.val_imgs_csv)[1], dataset_name, k, v])

        return run_metrics

    #if da_train, models is actually a dictionary with F1, F2 and G
    model, val_metrics = train(ref_only_df,
                               cons_train_df,
                               cons_val_df,
                               label_encoder,
                               torch_transform,
                               'label',
                               args.batch_size,
                               len(label_encoder.classes_),
                               args,
                               args.max_epochs,
                               results_dir=save_path,
                               add_perspective=args.add_persp_aug)

    print('completed train()')
    print('val_metrics', val_metrics)

    run_metrics_list = log_metrics(val_metrics, 'val')
    predictions_dfs_list = []

    from sanitytest_eval import create_eval_dataloaders

    evaluator = MetricEmbeddingEvaluator(
        model,
        args.metric_simul_sidepairs_eval,
        sidepairs_agg_method=args.sidepairs_agg,
        metric_evaluator_type=args.metric_evaluator_type)

    logit_evaluator = LogitEvaluator(model,
                                     args.metric_simul_sidepairs_eval,
                                     sidepairs_agg_method=args.sidepairs_agg)

    #figures out label column for sanity test
    def get_labelcol_eval(de_imgs_df):

        #figuring out if it is a pilltype_id or label_prod_code encoder
        #to set the label column of the sanity test set
        labels_df = pd.DataFrame({'label': label_encoder.classes_})
        img_df = pd.merge(de_imgs_df,
                          labels_df,
                          left_on=['label_prod_code'],
                          right_on=['label'],
                          how='inner')

        if len(img_df) > 1:
            labelcol = 'label_prod_code'
        else:
            labelcol = 'pilltype_id'
        print('Selecting {} for sanity test label'.format(labelcol))

        return de_imgs_df[labelcol]

    def test_model(de_imgs_df,
                   evaluator,
                   dataset_name,
                   run_metrics_list,
                   predictions_dfs_list,
                   rotate_aug=None):
        if rotate_aug is not None:
            dataset_name += "_rotate_aug{}".format(rotate_aug)

        print("Evaluating", dataset_name)
        eval_dataloader, eval_dataset = create_eval_dataloaders(
            de_imgs_df,
            label_encoder,
            torch_transform,
            'label',
            24,
            rotate_aug=rotate_aug)

        ref_dataloader, _ = create_eval_dataloaders(ref_only_df,
                                                    label_encoder,
                                                    torch_transform,
                                                    'label',
                                                    24,
                                                    rotate_aug=rotate_aug)
        dataloader = {'ref': ref_dataloader, 'eval': eval_dataloader}

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        print("Eval {}: {} images from {} total images".format(
            dataset_name, len(eval_dataset), len(de_imgs_df)))

        metrics_results, predictions = evaluator.eval_model(
            device, dataloader, do_pr_metrics=True, add_single_side_eval=True)

        plot_pr_curve(metrics_results['PR-curve'], dataset_name)

        run_metrics_list += log_metrics(metrics_results, dataset_name)

        predictions['dataset'] = dataset_name
        predictions['val_imgs_csv'] = os.path.split(args.val_imgs_csv)[1]
        predictions_dfs_list.append(predictions)

        return metrics_results, predictions

    test_model(test_df, logit_evaluator, 'holdout-logit', run_metrics_list,
               predictions_dfs_list)
    test_model(test_df, evaluator, 'holdout', run_metrics_list,
               predictions_dfs_list)

    run_metrics_df = pd.DataFrame(
        run_metrics_list, columns=['val_imgs_csv', 'dataset', 'name', 'value'])
    all_predictions_df = pd.concat(predictions_dfs_list, ignore_index=True)

    # make sure to save both
    for target_save_dir in [save_path, 'outputs']:
        print(f'saving predictions {target_save_dir}')
        # TODO: this csv can be large. Update the format for the numpy array of prediction scores.
        os.makedirs(target_save_dir, exist_ok=True)
        all_predictions_df.to_csv(
            os.path.join(
                target_save_dir, 'eval_predictions_{}'.format(
                    os.path.basename(args.val_imgs_csv))))
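        # Sketch for the TODO above (assumption: a hypothetical 'scores' column holds
        # the large per-class numpy arrays). Writing those arrays to a compressed .npz
        # and keeping only scalar columns in the CSV would shrink the file considerably:
        # np.savez_compressed(os.path.join(target_save_dir, 'eval_scores.npz'),
        #                     scores=np.stack(all_predictions_df['scores'].values))
        # all_predictions_df.drop(columns=['scores']).to_csv(...)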

    torch.save(
        model.state_dict(),
        os.path.join(save_path,
                     '{}.pth'.format(os.path.basename(args.val_imgs_csv))))

    return run_metrics_df, all_predictions_df

def load_and_clean(dataset_name):
    dataframe = load_data(dataset_name)
    dataframe = extract_features(dataframe)
    features, labels = clean_data(dataframe)

    return features, labels


try:
    # Get workspace if run locally
    ws = Workspace.from_config()
except Exception:
    # Get workspace if run remotely
    ws = Run.get_context().experiment.workspace

# Run
run = Run.get_context()

# Load and clean data
features_train, labels_train = load_and_clean('energy-forecast-data-training')
features_val, labels_val = load_and_clean('energy-forecast-data-validation')


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument('--n_estimators',
                        type=int,
                                                    test_size=0.2,
                                                    random_state=42)
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)
test_data = X_test[1, :]
test_data_array = test_data.toarray()
test_data_list = test_data_array.tolist()
print("len test_data_list", len(test_data_list))
print("len test_data_list 0", len(test_data_list[0]))
with open("test_data.txt", "w") as fp:
    json.dump(test_data_list, fp)

run = Run.get_context(allow_offline=True)


def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--C",
        type=float,
        default=1.0,
        help=
        "Inverse of regularization strength. Smaller values cause stronger regularization"
    )
    parser.add_argument("--max_iter",
                        type=int,
def plot(y_true, y_pred, output_eval_dir):
    run = Run.get_context()
    # # Confusion matrix
    # skplt.metrics.plot_confusion_matrix(convert_sentence_to_token(y_true),
    #                                     convert_sentence_to_token(y_pred), normalize=True)
    # run.log_image("metrics/confusion_matrix", plot=plt)
    # plt.savefig(os.path.join(output_eval_dir, 'confusion_matrix.png'))
    # # plt.show()

    # Metric
    df_metrics = get_metrics(y_true, y_pred)
    type_name_list = df_metrics['type_name'].tolist()
    ps = df_metrics['precision'].tolist()
    rs = df_metrics['recall'].tolist()
    f1s = df_metrics['f1-score'].tolist()
    s = df_metrics['support'].tolist()

    # Metric F1-Score
    f1_plt = plt.figure(2)
    plt.title('F1-Score')
    plt.bar(range(len(type_name_list)), f1s, tick_label=type_name_list, fc='b')
    for x, y in zip(range(len(type_name_list)), f1s):
        plt.text(x, y, "%0.4f" % y, ha='center', va='bottom')
    plt.ylim([0, 1.1])
    plt.ylabel('F1-Score')
    plt.xlabel('Named Entity Type')
    run.log_image("metrics/f1_score", plot=f1_plt)
    f1_plt.savefig(os.path.join(output_eval_dir, 'f1_score.png'))
    # plt.show()

    # Metric Precision
    precision_plt = plt.figure(3)
    plt.title('Precision')
    plt.bar(range(len(type_name_list)), ps, tick_label=type_name_list, fc='y')
    for x, y in zip(range(len(type_name_list)), ps):
        plt.text(x, y, "%0.4f" % y, ha='center', va='bottom')
    plt.ylim([0, 1.1])
    plt.ylabel('Precision')
    plt.xlabel('Named Entity Type')
    run.log_image("metrics/precision", plot=precision_plt)
    precision_plt.savefig(os.path.join(output_eval_dir, 'precision.png'))
    # plt.show()

    # Metric Recall
    recall_plt = plt.figure(4)
    plt.title('Recall')
    plt.bar(range(len(type_name_list)), rs, tick_label=type_name_list, fc='y')
    for x, y in zip(range(len(type_name_list)), rs):
        plt.text(x, y, "%0.4f" % y, ha='center', va='bottom')
    plt.ylim([0, 1.1])
    plt.ylabel('Recall')
    plt.xlabel('Named Entity Type')
    run.log_image("metrics/recall", plot=recall_plt)
    recall_plt.savefig(os.path.join(output_eval_dir, 'recall.png'))
    # plt.show()

    # Metric AllTrueInstanceCnt
    gt_plt = plt.figure(5)
    plt.title('AllTrueInstanceCnt')
    plt.bar(range(len(type_name_list)), s, tick_label=type_name_list, fc='g')
    for x, y in zip(range(len(type_name_list)), s):
        plt.text(x, y, "%d" % y, ha='center', va='bottom')
    plt.ylabel('AllTrueInstanceCnt')
    plt.xlabel('Named Entity Type')
    run.log_image("metrics/ground_truth", plot=gt_plt)
    gt_plt.savefig(os.path.join(output_eval_dir, 'ground_truth.png'))
Example #17
0
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sales_model.h5",
    )

    parser.add_argument("--step_output",
                        type=str,
                        help=("output for passing data to next step"))

    parser.add_argument("--dataset_version",
                        type=str,
                        help=("dataset version"))

    parser.add_argument("--data_file_path",
                        type=str,
                        help=("data file path, if specified,\
               a new version of the dataset will be registered"))

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id"))

    parser.add_argument("--dataset_name",
                        type=str,
                        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used when the pipeline was created"))

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}
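    # For reference, a minimal parameters.json could look like this (hypothetical
    # values -- the "training" section may hold any hyperparameters to be logged):
    # {
    #     "training": {
    #         "epochs": 10,
    #         "batch_size": 32
    #     }
    # }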

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace,
                                          dataset_name,
                                          dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace, dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    global original_df
    original_df = dataset.to_pandas_dataframe()
    #print("Original_df")
    (train, test) = tts(original_df)

    (X_train, y_train, X_test, y_test, scaler_object) = \
        scale_data(train, test)
    #print("inside main x_test", X_test)
    model = lstm_model(train, test)

    # Log the metrics for the model
    metrics = get_model_metrics()
    #metrics = {"mse": mse}
    print(metrics)
    #for (k, v) in metrics.items():
    #print(f"{k}: {v}")
    # Train the model
    model = lstm_model(train, test)

    #Saving the model
    #   model.save("sales_forecast_model.h5")

    # Evaluate and log the metrics returned from the train function
    #metrics = get_model_metrics(model, train, test)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step

#   model_output_path = "outputs/sales_forecast_model.pkl"
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)

    keras.models.save_model(model, model_output_path)

    print("Saved model in model_output_path")

    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)

    keras.models.save_model(model, output_path)
    print("Model saved")
    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
Example #18
0
def display_instances(image,
                      out_folder,
                      out_filename,
                      boxes,
                      masks,
                      class_ids,
                      class_names,
                      scores=None,
                      title="",
                      figsize=(16, 16),
                      ax=None,
                      show_mask=True,
                      show_bbox=True,
                      colors=None,
                      captions=None):
    """
    boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
    masks: [height, width, num_instances]
    class_ids: [num_instances]
    class_names: list of class names of the dataset
    scores: (optional) confidence scores for each box
    title: (optional) Figure title
    show_mask, show_bbox: To show masks and bounding boxes or not
    figsize: (optional) the size of the image
    colors: (optional) An array of colors to use with each object
    captions: (optional) A list of strings to use as captions for each object
    """
    run = Run.get_context()
    # img_plt = plt.figure(1)
    # Number of instances
    N = boxes.shape[0]
    if not N:
        print("\n*** No instances to display *** \n")
    else:
        assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]

    # If no axis is passed, create one and automatically call show()
    # auto_show = False
    if not ax:
        # img_ax = plt.figure(1)
        _, ax = plt.subplots(1, figsize=figsize)
        # img_ax = ax.figure(1)
        # auto_show = True

    # Generate random colors
    colors = colors or random_colors(N)

    # Show area outside image boundaries.
    height, width = image.shape[:2]
    ax.set_ylim(height + 10, -10)
    ax.set_xlim(-10, width + 10)
    ax.axis('off')
    ax.set_title(title)

    masked_image = image.astype(np.uint32).copy()
    for i in range(N):
        color = colors[i]

        # Bounding box
        if not np.any(boxes[i]):
            # Skip this instance. Has no bbox. Likely lost in image cropping.
            continue
        y1, x1, y2, x2 = boxes[i]
        if show_bbox:
            p = patches.Rectangle((x1, y1),
                                  x2 - x1,
                                  y2 - y1,
                                  linewidth=2,
                                  alpha=0.7,
                                  linestyle="dashed",
                                  edgecolor=color,
                                  facecolor='none')
            ax.add_patch(p)

        # Label
        if not captions:
            class_id = class_ids[i]
            score = scores[i] if scores is not None else None
            label = class_names[class_id]
            caption = "{} {:.3f}".format(label, score) if score else label
        else:
            caption = captions[i]
        ax.text(x1,
                y1 + 8,
                caption,
                color='w',
                size=11,
                backgroundcolor="none")

        # Mask
        mask = masks[:, :, i]
        if show_mask:
            masked_image = apply_mask(masked_image, mask, color)

        # Mask Polygon
        # Pad to ensure proper polygons for masks that touch image edges.
        padded_mask = np.zeros((mask.shape[0] + 2, mask.shape[1] + 2),
                               dtype=np.uint8)
        padded_mask[1:-1, 1:-1] = mask
        contours = find_contours(padded_mask, 0.5)
        for verts in contours:
            # Subtract the padding and flip (y, x) to (x, y)
            verts = np.fliplr(verts) - 1
            p = Polygon(verts, facecolor="none", edgecolor=color)
            ax.add_patch(p)
    ax.imshow(masked_image.astype(np.uint8))
    fig = ax.get_figure()
    run.log_image("prediction/" + out_filename, plot=fig)
    fig.savefig(os.path.join(out_folder, '{}.jpg'.format(out_filename)))
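# A hedged usage sketch (not part of the original source): assuming Mask R-CNN style
# detection output, display_instances could be called per image roughly like this,
# where 'r' and CLASS_NAMES are hypothetical names:
# r = results[0]  # dict with 'rois', 'masks', 'class_ids', 'scores'
# display_instances(image, out_folder='outputs', out_filename='img_0001',
#                   boxes=r['rois'], masks=r['masks'], class_ids=r['class_ids'],
#                   class_names=CLASS_NAMES, scores=r['scores'])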
def main(
    training_data_path=None,
    validation_data_path=None,
    use_gpu=False,
    save_filepath=None,
    model="resnet50",
    epochs=_EPOCHS,
    batch_size=_BATCHSIZE,
    fp16_allreduce=False,
    base_lr=0.0125,
    warmup_epochs=5,
):
    logger = logging.getLogger(__name__)

    device = torch.device("cuda" if use_gpu else "cpu")
    logger.info(f"Running on {device}")
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.

        logger.info("Running Distributed")
        torch.manual_seed(_SEED)
        if use_gpu:
            # Horovod: pin GPU to local rank.
            torch.cuda.set_device(hvd.local_rank())
            torch.cuda.manual_seed(_SEED)

    logger.info("PyTorch version {}".format(torch.__version__))

    # Horovod: write TensorBoard logs on first worker.
    if (_DISTRIBUTED and hvd.rank() == 0) or not _DISTRIBUTED:
        run = Run.get_context()
        run.tag("model", value=model)

        logs_dir = os.path.join(os.curdir, "logs")
        if os.path.exists(logs_dir):
            logger.debug(f"Log directory {logs_dir} found | Deleting")
            shutil.rmtree(logs_dir)
        summary_writer = SummaryWriter(logdir=logs_dir)

    if training_data_path is None:
        logger.info("Setting up fake loaders")
        train_dataset = FakeData(n_classes=1000,
                                 data_transform=torch.FloatTensor)
        validation_dataset = None
    else:
        normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
        logger.info("Setting up loaders")
        logger.info(f"Loading training from {training_data_path}")
        train_dataset = datasets.ImageFolder(
            training_data_path,
            transforms.Compose([
                transforms.RandomResizedCrop(_WIDTH),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                normalize,
            ]),
        )

        if validation_data_path is not None:
            logger.info(f"Loading validation from {validation_data_path}")
            validation_dataset = datasets.ImageFolder(
                validation_data_path,
                transforms.Compose([
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    normalize,
                ]),
            )

    train_sampler = _get_sampler(train_dataset)
    kwargs = {"num_workers": 5, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    if validation_data_path is not None:
        val_sampler = _get_sampler(validation_dataset)
        val_loader = torch.utils.data.DataLoader(validation_dataset,
                                                 batch_size=batch_size,
                                                 sampler=val_sampler,
                                                 **kwargs)

    # Autotune
    cudnn.benchmark = True

    logger.info("Loading model")

    # Load symbol
    model = models.__dict__[model](pretrained=False)

    # model.to(device)
    if use_gpu:
        # Move model to GPU.
        model.cuda()

    # # Horovod: (optional) compression algorithm.
    # compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    num_gpus = hvd.size() if _DISTRIBUTED else 1
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus, momentum=0.9)
    if _DISTRIBUTED:

        compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none

        # Horovod: wrap optimizer with DistributedOptimizer.
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            compression=compression,
        )

        # Horovod: broadcast parameters & optimizer state.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    criterion = F.cross_entropy

    # Main training-loop
    logger.info("Training ...")
    for epoch in range(epochs):
        with Timer(output=logger.info, prefix=f"Training epoch {epoch} ") as t:
            model.train()
            if _DISTRIBUTED:
                train_sampler.set_epoch(epoch)
            metrics = train(train_loader, model, criterion, optimizer, base_lr,
                            warmup_epochs, epoch)

            if (_DISTRIBUTED and hvd.rank() == 0) or not _DISTRIBUTED:
                run.log_row("Training metrics", epoch=epoch, **metrics)
                summary_writer.add_scalar("Train/Loss", metrics["loss"], epoch)
                summary_writer.add_scalar("Train/Acc", metrics["acc"], epoch)
                summary_writer.add_scalar("Train/BatchTime",
                                          metrics["batch_time"], epoch)

        if validation_data_path is not None:
            model.eval()
            metrics = validate(val_loader, model, criterion, device)
            if (_DISTRIBUTED and hvd.rank() == 0) or not _DISTRIBUTED:
                run.log_row("Validation metrics", epoch=epoch, **metrics)
                summary_writer.add_scalar("Validation/Loss", metrics["loss"],
                                          epoch)
                summary_writer.add_scalar("Validation/Acc", metrics["acc"],
                                          epoch)

        if save_filepath is not None:
            save_checkpoint(model, optimizer, save_filepath)

    _log_summary(epochs * len(train_dataset), t.elapsed, batch_size)
def main():
    parser = argparse.ArgumentParser("RAPIDS_DBSCAN")
    parser.add_argument("--data_dir", type=str, help="Location of data")
    parser.add_argument('-f', type=str,
                        default='')  # added for notebook execution scenarios

    args = parser.parse_args()
    data_dir = args.data_dir

    run = Run.get_context()

    # specify the location of the data files
    DATA_PATH = data_dir

    # the sample PCAP file used for explanation
    DATA_PCAP = DATA_PATH + "/small_sample.pcap"

    # the flow connection log (conn.log) file
    DATA_SOURCE = DATA_PATH + "/conn.log"

    # the data label file (matches IP addresses with MAC addresses)
    DATA_LABELS = DATA_PATH + "/lab_mac_labels_cats.csv"

    print("Running NETWORK FLOW on GPU...")
    t1 = datetime.now()

    # ### Background

    ##### Types of Network Data
    # The most detailed type of data that is typically collected on a network is full Packet CAPture (PCAP) data. This information is detailed and contains everything about the communication, including: source address, destination address, protocols used, bytes transferred, and even the raw data (e.g., image, audio file, executable). PCAP data is fine-grained, meaning that there is a record for each frame being transmitted. A typical communication is composed of many individual packets/frames.
    #
    # If we aggregate PCAP data so that there is one row of data per communication session, we call that flow level data. A simplified example of this relationship is shown in the figure below.
    #
    # ![PCAP_flow_relationship](images/pcap_vs_flow.png "PCAP vs FLOW")
    #
    # For this tutorial, we use data from the University of New South Wales. In a lab environment, they [collected nearly three weeks of IoT data from 21 IoT devices](http://149.171.189.1). They also kept a detailed [list of devices by MAC address](http://149.171.189.1/resources/List_Of_Devices.txt), so we have ground-truth with respect to each IoT device's behavior on the network.
    #
    # **Our goal is to utilize the behavior exhibited in the network data to classify IoT devices.**

    ##### The Internet of Things and Data at a Massive Scale
    # Gartner estimates there are currently over 8.4 billion Internet of Things (IoT) devices. By 2020, that number is [estimated to surpass 20 billion](https://www.zdnet.com/article/iot-devices-will-outnumber-the-worlds-population-this-year-for-the-first-time/). These types of devices range from consumer devices (e.g., Amazon Echo, smart TVs, smart cameras, door bells) to commercial devices (e.g., building automation systems, keycard entry). All of these devices exhibit behavior on the Internet as they communicate back with their own clouds and user-specified integrations.

    ### Data Investigation

    # Let's first see some of the data. We'll load a PCAP file in using Scapy. If you don't want to or can't install Scapy, feel free to skip this section.
    cap = rdpcap(DATA_PCAP)

    # get the frames
    eth_frame = cap[3]
    ip_pkt = eth_frame.payload
    segment = ip_pkt.payload
    data = segment.payload

    print(eth_frame.show())

    # There's really a lot of features there. In addition to having multiple layers (which may differ between packets), there are a number of other issues with working directly with PCAP. Often the payload (the `Raw` section above) is encrypted, rendering it useless. The lack of aggregation also makes it difficult to differentiate between packets. What we really care about for this application is what a *session* looks like. In other words, how a Roku interacts with the network is likely quite different than how a Google Home interacts.
    #
    # To save time for the tutorial, all three weeks of PCAP data have already been transformed to flow data, and we can load that in to a typical Pandas dataframe. Due to how the data was created, we have a header row (with column names) as well as a footer row. We've already removed those rows, so nothing to do here.
    #
    # For this application, we used [Zeek](https://www.zeek.org) (formerly known as Bro) to construct the flow data. To include MAC addresses in the conn log, we used the [mac-logging.zeek script](https://github.com/bro/bro/blob/master/scripts/policy/protocols/conn/mac-logging.zeek).
    #
    #     # If you've skipped installing Scapy, you can pick up here.
    #     pdf = pd.read_csv(DATA_SOURCE, sep='\t')
    #     print("==> pdf shape: ", pdf.shape)

    #     # We can look at what this new aggregated data looks like, and get a better sense of the columns and their data types. Let's do this the way we're familiar with, using Pandas.
    #     print(pdf.head())
    #     pdf.dtypes

    # That's Pandas, and we could continue the analysis there if we wanted. But what about  [cuDF](https://github.com/rapidsai/cudf)? Let's pivot to that for the majority of this tutorial.
    #
    # One thing cuDF needs is for us to specify the data types. We'll write a function to make this easier. As of version 0.6, [strings are supported in cuDF](https://rapidsai.github.io/projects/cudf/en/latest/10min.html?highlight=string#String-Methods). We'll make use of that here.
    def get_dtypes(fn, delim, floats, strings):
        with open(fn, errors='replace') as fp:
            header = fp.readline().strip()

        types = []
        for col in header.split(delim):
            if 'date' in col:
                types.append((col, 'date'))
            elif col in floats:
                types.append((col, 'float64'))
            elif col in strings:
                types.append((col, 'str'))
            else:
                types.append((col, 'int64'))

        return OrderedDict(types)

    dtypes_data_processed = get_dtypes(DATA_SOURCE,
                                       '\t',
                                       floats=['ts', 'duration'],
                                       strings=[
                                           'uid', 'id.orig_h', 'id.resp_h',
                                           'proto', 'service', 'conn_state',
                                           'local_orig', 'local_resp',
                                           'history', 'tunnel_parents',
                                           'orig_l2_addr', 'resp_l2_addr'
                                       ])

    raw_cdf = cd.io.csv.read_csv(DATA_SOURCE,
                                 delimiter='\t',
                                 names=list(dtypes_data_processed),
                                 dtype=list(dtypes_data_processed.values()),
                                 skiprows=1)

    # Those data types seem right. Let's see what this data looks like now that it's in cuDF.
    # ### Adding ground truth labels back to the data

    # We'll need some labels for our classification task, so we've already prepared a file with those labels.
    dtypes_labels_processed = get_dtypes(
        DATA_LABELS,
        ',',
        floats=[],
        strings=['device', 'mac', 'connection', 'category'])

    labels_cdf = cd.io.csv.read_csv(DATA_LABELS,
                                    delimiter=',',
                                    names=list(dtypes_labels_processed),
                                    dtype=list(
                                        dtypes_labels_processed.values()),
                                    skiprows=1)

    print('Labels...')
    print(labels_cdf.head())

    # We now perform a series of merges to add the ground truth data (device name, connection, category, and categoryID) back to the dataset. Since each row of netflow has two participants, we'll have to do this twice - once for the originator (source) and once for the responder (destination).
    labels_cdf.columns = [
        'orig_device', 'orig_l2_addr', 'orig_connection', 'orig_category',
        'orig_category_id'
    ]
    merged_cdf = cd.merge(raw_cdf, labels_cdf, how='left', on='orig_l2_addr')
    labels_cdf.columns = [
        'resp_device', 'resp_l2_addr', 'resp_connection', 'resp_category',
        'resp_category_id'
    ]
    merged_cdf = cd.merge(merged_cdf, labels_cdf, how='left')
    labels_cdf.columns = [
        'device', 'mac', 'connection', 'category', 'category_id'
    ]

    # Let's just look at our new dataset to make sure everything's okay.
    print('Merged...')
    print(merged_cdf.head())

    # ### Exploding the Netflow Data into Originator and Responder Rows

    # We now have netflow that has one row per (sessionized) communication between an originator and responder. However, in order to classify an individual device, we need to explode data. Instead of one row that contains both originator and responder, we'll explode to one row for originator information (orig_bytes, orig_pkts, orig_ip_bytes) and one for responder information (resp_bytes, resp_pkts, resp_ip_bytes).
    #
    # The easiest way to do this is to create two new dataframes, rename all of the columns, then `concat` them back together. Just for sanity, we'll also check the new shape of our exploded data frame.
    orig_comms_cdf = merged_cdf[[
        'ts', 'id.orig_h', 'id.orig_p', 'proto', 'service', 'duration',
        'orig_bytes', 'orig_pkts', 'orig_ip_bytes', 'orig_device',
        'orig_l2_addr', 'orig_category', 'orig_category_id'
    ]]
    orig_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    resp_comms_cdf = merged_cdf[[
        'ts', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration',
        'resp_bytes', 'resp_pkts', 'resp_ip_bytes', 'resp_device',
        'resp_l2_addr', 'resp_category', 'resp_category_id'
    ]]
    resp_comms_cdf.columns = [
        'ts', 'ip', 'port', 'proto', 'service', 'duration', 'bytes', 'pkts',
        'ip_bytes', 'device', 'mac', 'category', 'category_id'
    ]

    exploded_cdf = cd.multi.concat([orig_comms_cdf, resp_comms_cdf])
    print("==> shape (original) =", merged_cdf.shape)
    print("==> shape =", exploded_cdf.shape)

    num_categories = labels_cdf['category_id'].unique().shape[0]
    print("==> number of IoT categories =", num_categories)

    # We currently need to remove null values before we proceed. Although `dropna` doesn't exist in cuDF yet, we can use a workaround to get us there. Also, due to what's available currently, we can't have any nulls in any place in the DF.
    print('Check if any missing...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    exploded_cdf['category_id'] = exploded_cdf['category_id'].fillna(-999)
    exploded_cdf['device'] = exploded_cdf['device'].str.fillna("none")
    exploded_cdf['category'] = exploded_cdf['category'].str.fillna("none")

    print('After missing observations imputation...')
    for col in exploded_cdf.columns:
        print(col, exploded_cdf[col].null_count)

    # Looks like all the null values are gone, so now we can proceed. If an IP doesn't have a category ID, we can't use it. So we'll filter those out.
    exploded_cdf = exploded_cdf[exploded_cdf['category_id'] != -999]

    # ### Binning the Data and Aggregating the Features
    #

    # But wait, there's still more data wrangling to be done! While we've exploded the flows into rows for orig/resp, we may want to bin the data further by time. The rationale is that any single communication may not be an accurate representation of how a device typically reacts in its environment. Imagine the simple case of how a streaming camera typically operates (most of its data will be uploaded from the device to a destination) versus how it operates during a firmware update (most of the data will be pushed down to the device, after which a brief disruption in connectivity will occur).
    #
    # There's a lot of different time binning we could do. It would also be useful to investigate the average connection duration relative to the number of connections per bin across various time granularities. With that said, we'll just choose a time bin of 1 hour to begin with. In order to bin, we'll use the following formula:
    #
    # $$\text{hour\_time\_bin}=\left\lfloor{\frac{ts}{60*60}}\right\rfloor$$
    exploded_cdf['hour_time_bin'] = exploded_cdf['ts'].applymap(
        lambda x: math.floor(x / (60 * 60))).astype(int)
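    # For example, a flow with ts = 7261.0 (seconds) lands in
    # hour_time_bin = floor(7261 / 3600) = 2.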

    # We also have to make a choice about how we'll aggregate the binned data. One of the simplest ways is to sum the bytes and packets. There are really two choices for bytes, `bytes` and `ip_bytes`. With Bro, `bytes` is taken from the TCP sequence numbers and is potentially inaccurate, so we select `ip_bytes` instead for both originator and responder. We'll also use the sum of the number of packets.
    one_hour_time_bin_cdf = (exploded_cdf[[
        'bytes', 'pkts', 'ip_bytes', 'mac', 'category_id', 'hour_time_bin'
    ]].groupby(['mac', 'category_id', 'hour_time_bin']).agg({
        'category_id': 'min',
        'bytes': 'sum',
        'pkts': 'sum',
        'ip_bytes': 'sum'
    })[['min_category_id', 'sum_bytes', 'sum_pkts', 'sum_ip_bytes']])

    one_hour_time_bin_cdf.columns = [
        'category_id', 'bytes', 'pkts', 'ip_bytes'
    ]

    # ### Creating the Training and Testing Datasets

    # We'll take a traditional 70/30 train/test split, and we'll randomly sample into a train and test data frame.
    cdf_msk = np.random.rand(len(one_hour_time_bin_cdf)) < 0.7
    train_cdf = one_hour_time_bin_cdf[cdf_msk]
    test_cdf = one_hour_time_bin_cdf[~cdf_msk]

    print("==> train length =", len(train_cdf))
    print("==> test length =", len(test_cdf))

    run.log('Train length', len(train_cdf))
    run.log('Test length', len(test_cdf))

    # Prepare the training input (`train_X`), training target (`train_Y`), test input (`test_X`) and test target (`test_Y`) datasets.
    train_X = train_cdf[['pkts', 'ip_bytes']]
    train_Y = train_cdf[['category_id']]

    test_X = test_cdf[['pkts', 'ip_bytes']]
    test_Y = test_cdf[['category_id']]

    # ### Configure XGBoost

    # We choose a classification algorithm that utilizes the GPU - [XGBoost](https://xgboost.readthedocs.io/en/latest/). The package provides support for gradient boosted trees and can leverage distributed GPU compute environments.

    # Getting data into a format for XGBoost is really easy. Just make a `DMatrix` for both training and testing.
    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_test = xgb.DMatrix(test_X, label=test_Y)

    # Like any good ML package, there's quite a few parameters to set. We're going to start with the softmax objective function. This will let us get a predicted category out of our model. We'll also set other parameters like the maximum depth and number of threads. You can read more about the parameters [here](https://xgboost.readthedocs.io/en/latest/parameter.html). Experiment with them!

    param = {}
    param['objective'] = 'multi:softmax'
    param['eta'] = 0.1
    param['max_depth'] = 8
    param['silent'] = 1
    param['nthread'] = 4
    param['num_class'] = num_categories
    param['max_features'] = 'auto'
    param['n_gpus'] = 1
    param['tree_method'] = 'gpu_hist'

    # XGBoost allows us to define a watchlist so that we can keep track of performance as the algorithm trains. We'll configure a simple watchlist that is watching `xg_train` and `xg_test` error rates.
    watchlist = [(xg_train, 'train'), (xg_test, 'test')]
    num_round = 20

    # ### Training our First XGBoost Model

    # Now it's time to train
    bst = xgb.train(param, xg_train, num_round, watchlist)

    # Prediction is also easy (and fast).
    pred = bst.predict(xg_test)

    # We might want to get a sense of how our model is by calculating the error rate.
    pred_cdf = cd.from_pandas(pd.DataFrame(pred, columns=['pred']))
    pred_cdf.add_column('category_id', test_Y['category_id'])
    error_rate = (pred_cdf[pred_cdf['pred'] != pred_cdf['category_id']]
                  ['pred'].count()) / test_Y.shape[0]
    run.log('Error rate', error_rate)
    t2 = datetime.now()

    run.log('Runtime', t2 - t1)
Example #21
0
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = {}) -> None:
    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task, args.run_name)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)

    data_path = RichPath.create(args.data_path, args.azure_info)
    dataset, model = get_model_and_dataset(
        msg_passing_implementation=args.model,
        task_name=args.task,
        data_path=data_path,
        trained_model_file=args.load_saved_model,
        cli_data_hyperparameter_overrides=args.data_param_override,
        cli_model_hyperparameter_overrides=args.model_param_override,
        hyperdrive_hyperparameter_overrides=hyperdrive_hyperparameter_overrides,
        folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
        load_weights_only=args.load_weights_only,
    )

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None

    trained_model_path = train(
        model,
        dataset,
        log_fun=log,
        run_id=run_id,
        max_epochs=args.max_epochs,
        patience=args.patience,
        save_dir=args.save_dir,
        quiet=args.quiet,
        aml_run=aml_run,
    )

    if args.run_test:
        data_path = RichPath.create(args.data_path, args.azure_info)
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)

        # Test 1: Simply compute same metrics used during training/validation:
        test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch(test_data,
                                                 training=False,
                                                 quiet=args.quiet)
        test_metric, test_metric_string = model.compute_epoch_metrics(
            test_results)
        log(test_metric_string)
        if aml_run is not None:
            aml_run.log("task_test_metric", float(test_metric))

        # Test 2: Try to compute fancier metrics, if implemented:
        try:
            eval_metrics = model.evaluate_model(test_data)
            for metric_name, metric_value in eval_metrics.items():
                log(f"{metric_name:<30}: {metric_value:8.4f}")
                if aml_run is not None:
                    aml_run.log(f"task_test_{metric_name}", metric_value)
        except NotImplementedError:
            pass  # ignore if there are no fancier metrics
    def __init__(self, args):
        self.args = args
        self.run = Run.get_context()
        super(AMLChannel, self).__init__(args)
        self.current_message_index = -1
Example #23
0
def run(arguments):
    if arguments["--aml"]:
        from azureml.core.run import Run

        aml_ctx = Run.get_context()
        assert torch.cuda.is_available(), "No CUDA available. Aborting training."
    else:
        aml_ctx = None

    log_path = configure_logging(aml_ctx)
    azure_info_path = arguments.get("--azure-info", None)
    training_data_path = RichPath.create(arguments["TRAIN_DATA_PATH"], azure_info_path)
    training_data = LazyDataIterable(lambda: load_from_folder(training_data_path, shuffle=True))

    validation_data_path = RichPath.create(arguments["VALID_DATA_PATH"], azure_info_path)
    validation_data = LazyDataIterable(
        lambda: load_from_folder(validation_data_path, shuffle=False)
    )

    model_path = Path(arguments["MODEL_FILENAME"])
    assert model_path.name.endswith(".pkl.gz"), "MODEL_FILENAME must have a `.pkl.gz` suffix."

    initialize_metadata = True
    restore_path = arguments.get("--restore-path", None)
    if restore_path:
        initialize_metadata = False
        model, nn = Graph2Class.restore_model(Path(restore_path))
    elif arguments["--aml"] and model_path.exists():
        initialize_metadata = False
        model, nn = Graph2Class.restore_model(model_path)
    else:
        nn = None
        model = create_graph2class_gnn_model()

    def create_optimizer(parameters):
        return torch.optim.Adam(parameters, lr=0.00025)

    trainer = ModelTrainer(
        model,
        model_path,
        max_num_epochs=int(arguments["--max-num-epochs"]),
        minibatch_size=int(arguments["--minibatch-size"]),
        optimizer_creator=create_optimizer,
        clip_gradient_norm=1,
        target_validation_metric="Accuracy",
        target_validation_metric_higher_is_better=True,
        enable_amp=arguments["--amp"],
    )
    if nn is not None:
        trainer.neural_module = nn

    trainer.register_train_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "train", model, epoch, metrics)
    )
    trainer.register_validation_epoch_end_hook(
        lambda model, nn, epoch, metrics: log_run(aml_ctx, "valid", model, epoch, metrics)
    )

    trainer.train(
        training_data,
        validation_data,
        show_progress_bar=not arguments["--quiet"],
        initialize_metadata=initialize_metadata,
        parallelize=not arguments["--sequential-run"],
        patience=10,
        store_tensorized_data_in_memory=True,
    )

    test_data_path = RichPath.create(arguments["TEST_DATA_PATH"], azure_info_path)
    test_data = LazyDataIterable(lambda: load_from_folder(test_data_path, shuffle=False))
    acc = model.report_accuracy(
        test_data,
        trainer.neural_module,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    )
    print(f"Test accuracy: {acc:%}")

    if aml_ctx is not None:
        aml_ctx.log("Test Accuracy", acc)
        aml_ctx.upload_file(name="model.pkl.gz", path_or_stream=str(model_path))
        aml_ctx.upload_file(name="full.log", path_or_stream=log_path)
Example #24
0
def run_train_from_args(
        args,
        hyperdrive_hyperparameter_overrides: Dict[str, str] = {}) -> None:
    # Get the housekeeping going and start logging:
    os.makedirs(args.save_dir, exist_ok=True)
    run_id = make_run_id(args.model, args.task)
    log_file = os.path.join(args.save_dir, f"{run_id}.log")

    def log(msg):
        log_line(log_file, msg)

    log(f"Setting random seed {args.random_seed}.")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    tf.random.set_seed(args.random_seed)
    # data split
    DataSplit.Preprocess(args.data_path)

    data_path = RichPath.create(
        os.path.split(args.data_path)[0] + '/tem/ast', args.azure_info)
    #second path
    data_path_2 = RichPath.create(
        os.path.split(args.data_path)[0] + '/tem/cdfg', args.azure_info)

    try:
        dataset, model = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=
            hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
        #second
        dataset2, model_2 = get_model_and_dataset(
            msg_passing_implementation=args.model,
            task_name=args.task,
            data_path=data_path_2,
            trained_model_file=args.load_saved_model,
            cli_data_hyperparameter_overrides=args.data_param_override,
            cli_model_hyperparameter_overrides=args.model_param_override,
            hyperdrive_hyperparameter_overrides=
            hyperdrive_hyperparameter_overrides,
            folds_to_load={DataFold.TRAIN, DataFold.VALIDATION},
            load_weights_only=args.load_weights_only,
            case_name=args.case,  # add by zjq
        )
    except ValueError as err:
        print(err.args)

    log(f"Dataset parameters: {json.dumps(unwrap_tf_tracked_data(dataset._params))}"
        )
    log(f"Model parameters: {json.dumps(unwrap_tf_tracked_data(model._params))}"
        )

    if args.azureml_logging:
        from azureml.core.run import Run

        aml_run = Run.get_context()
    else:
        aml_run = None
    # add by zjq
    if not args.load_trained_model:
        trained_model_path = train(
            model,
            dataset,
            dataset2,
            log_fun=log,
            run_id=run_id,
            max_epochs=args.max_epochs,
            patience=args.patience,
            save_dir=args.save_dir,
            quiet=args.quiet,
            aml_run=aml_run,
        )
    else:
        trained_model_path = args.load_trained_model

    if args.run_test:
        data_path = RichPath.create(
            os.path.split(args.data_path)[0] + '/tem/ast', args.azure_info)
        data_path_2 = RichPath.create(
            os.path.split(args.data_path)[0] + '/tem/cdfg', args.azure_info)
        log("== Running on test dataset")
        log(f"Loading data from {data_path}.")
        dataset.load_data(data_path, {DataFold.TEST})
        dataset2.load_data(data_path_2, {DataFold.TEST})
        log(f"Restoring best model state from {trained_model_path}.")
        load_weights_verbosely(trained_model_path, model)
        test_data_1 = dataset.get_tensorflow_dataset(DataFold.TEST)
        test_data_2 = dataset2.get_tensorflow_dataset(DataFold.TEST)
        _, _, test_results = model.run_one_epoch_new(test_data_1,
                                                     test_data_2,
                                                     training=False,
                                                     quiet=args.quiet)

        valid_ACC, val_stracc, \
        best_valid_Pre, best_val_strpre, \
        best_valid_metric_RE, best_val_strre, \
        best_valid_metric_f1, best_val_strf1, \
        best_valid_metric_TPR, best_val_strtpr, \
        best_valid_metric_FPR, best_val_strfpr, \
        best_valid_metric_TNR, best_val_strtnr, \
        best_valid_metric_FNR, best_val_strfnr, = model.compute_epoch_metrics(test_results)
        # valid_metric, valid_metric_string = model.compute_epoch_metrics(valid_results)
        log(
            f"  {val_stracc}|{best_val_strpre} | {best_val_strre} | {best_val_strf1} |"
            f"{best_val_strtpr} | {best_val_strfpr} | {best_val_strtnr} | {best_val_strfnr} |",
        )
        # test_metric, test_metric_string = model.compute_epoch_metrics(test_results)
        # log(val_stracc)
        if aml_run is not None:
            aml_run.log("task_test_metric", float(valid_ACC))
Example #25
0
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    # See lightgbm library for python for a list of parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="number of boosting iterations")
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help="shrinkage rate")
    parser.add_argument('--max_depth',
                        type=int,
                        default=-1,
                        help="max depth for tree model")
    parser.add_argument(
        '--subsample',
        type=float,
        default=1.0,
        help=
        "randomly select part of data without resampling. useful to speed up training and prevent over-fitting"
    )

    args = parser.parse_args()

    run = Run.get_context()

    run.log("n_estimators:", np.int(args.n_estimators))
    run.log(
        "learning_rate:", np.float(args.learning_rate)
    )  # see here for more ideas = https://bit.ly/3c2zJOm & https://bit.ly/3o6OAth
    run.log("max_depth:", np.int(args.max_depth))
    run.log("subsample:", np.float(args.subsample))

    # training set
    train_split_data = run.input_datasets["output_split_train"]
    # train_split_data = train_split_data.parse_parquet_files()
    train_split_df = train_split_data.to_pandas_dataframe()
    print(train_split_df.head(10))

    x_train = train_split_df.loc[:, train_split_df.columns != 'Exited']
    y_train = train_split_df.loc[:, train_split_df.columns == 'Exited']

    #evaluation set
    test_split_data = run.input_datasets["output_split_test"]
    test_split_df = test_split_data.to_pandas_dataframe()

    x_test = test_split_df.loc[:, test_split_df.columns != 'Exited']
    y_test = test_split_df.loc[:, test_split_df.columns == 'Exited']

    print(x_train.head(10))
    print(x_test.head(10))

    # declaring our model with parameters - default and those declared in our hyperparameter space
    model = LGBMClassifier(n_estimators=args.n_estimators,
                           learning_rate=args.learning_rate,
                           max_depth=args.max_depth,
                           subsample=args.subsample).fit(x_train, y_train)

    # save model
    os.makedirs('./outputs/model', exist_ok=True)

    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(model, './outputs/model/saved_model.joblib')

    accuracy = model.score(x_test, y_test)
    print(model)
    print(x_test.head(10))

    run.log("Accuracy", np.float(
        accuracy))  #source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx

    y_pred = model.predict(x_test)
    auc_weighted = roc_auc_score(y_test, y_pred, average='weighted')
    run.log("AUC_weighted", np.float(auc_weighted)
            )  #source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx

    # creating a confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
Example #26
0
        (tgt / '__init__.py').touch(exist_ok=False)

    paths_to_copy = list(src.glob(glob_pattern))
    logger.info(f"Copying to {tgt} the following files: {str(paths_to_copy)}")
    for p in paths_to_copy:
        destpath = tgt / p.relative_to(src)
        destpath.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(p, destpath)


def is_offline_run(run: Run) -> bool:
    return run.id.startswith("OfflineRun")


# Get the current run.
RUN = Run.get_context()

from cgmml.common.evaluation.eval_utilities import (  # noqa: E402, F401
    is_ensemble_evaluation, is_multiartifact_evaluation)
from cgmml.common.evaluation.evaluation_classes import (  # noqa: E402, F401
    Evaluation, EnsembleEvaluation, MultiartifactEvaluation)
from cgmml.common.model_utils.run_initialization import OfflineRunInitializer, OnlineRunInitializer  # noqa: E402

logging.basicConfig(
    level=logging.INFO,
    format=
    '%(asctime)s - %(levelname)s - %(message)s - %(pathname)s: line %(lineno)d'
)

QA_CONFIG_MODULES = [
    'qa_config_weight',  # takes 14min in CI
    def on_init_end(self, args, state, control, **kwargs):
        if self.azureml_run is None and state.is_world_process_zero:
            self.azureml_run = Run.get_context()
import glob
import os

import pandas as pd
from azureml.core.run import Run

from lightning_base import set_seed
from lightning_glue import GLUETransformer, parse_args
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.utilities import rank_zero_only

# get the Azure ML run object
child_run = Run.get_context()
run = child_run.parent


class AzureMLLogger(LightningLoggerBase):
    def __init__(self):
        super().__init__()

    @rank_zero_only
    def log_hyperparams(self, params):
        pass

    @rank_zero_only
    def log_metrics(self, metrics, step):
        for k, v in {**{"step": step}, **metrics}.items():
            run.log(k, v)

    @property
Example #29
0
def main():
    print("Running train_aml.py")

    parser = argparse.ArgumentParser("train")
    parser.add_argument(
        "--model_name",
        type=str,
        help="Name of the Model",
        default="sales_model.h5",
    )

    parser.add_argument(
        "--step_output",
        type=str,
        help=("output for passing data to next step")
    )

    parser.add_argument(
        "--dataset_version",
        type=str,
        help=("dataset version")
    )

    parser.add_argument(
        "--data_file_path",
        type=str,
        help=("data file path, if specified,\
               a new version of the dataset will be registered")
    )

    parser.add_argument(
        "--caller_run_id",
        type=str,
        help=("caller run id, for example ADF pipeline run id")
    )

    parser.add_argument(
        "--dataset_name",
        type=str,
        help=("Dataset name. Dataset must be passed by name\
              to always get the desired dataset version\
              rather than the one used when the pipeline was created")
    )

    args = parser.parse_args()

    print("Argument [model_name]: %s" % args.model_name)
    print("Argument [step_output]: %s" % args.step_output)
    print("Argument [dataset_version]: %s" % args.dataset_version)
    print("Argument [data_file_path]: %s" % args.data_file_path)
    print("Argument [caller_run_id]: %s" % args.caller_run_id)
    print("Argument [dataset_name]: %s" % args.dataset_name)

    model_name = args.model_name
    step_output_path = args.step_output
    dataset_version = args.dataset_version
    data_file_path = args.data_file_path
    dataset_name = args.dataset_name

    run = Run.get_context()

    print("Getting training parameters")

    # Load the training parameters from the parameters file
    with open("parameters.json") as f:
        pars = json.load(f)
    try:
        train_args = pars["training"]
    except KeyError:
        print("Could not load training values from file")
        train_args = {}

    # Log the training parameters
    print(f"Parameters: {train_args}")
    for (k, v) in train_args.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Get the dataset
    if (dataset_name):
        if (data_file_path == 'none'):
            dataset = Dataset.get_by_name(run.experiment.workspace, dataset_name, dataset_version)  # NOQA: E402, E501
        else:
            dataset = register_dataset(run.experiment.workspace,
                                       dataset_name,
                                       os.environ.get("DATASTORE_NAME"),
                                       data_file_path)
    else:
        e = ("No dataset provided")
        print(e)
        raise Exception(e)

    # Link dataset to the step run so it is trackable in the UI
    run.input_datasets['training_data'] = dataset
    run.parent.tag("dataset_id", value=dataset.id)

    # Split the data into test/train
    df = dataset.to_pandas_dataframe()
    (train, test) = tts(df)

    # Train the model
    model = lstm_model(train, test)

    # Evaluate and log the metrics returned from the train function
    metrics = get_model_metrics(model, train, test)
    for (k, v) in metrics.items():
        run.log(k, v)
        run.parent.log(k, v)

    # Pass model file to next step
    os.makedirs(step_output_path, exist_ok=True)
    model_output_path = os.path.join(step_output_path, model_name)

    keras.models.save_model(model, model_output_path)

    print("Saved model in model_output_path")
    # Also upload model file to run outputs for history
    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)

    keras.models.save_model(model, output_path)
    print(f"Saved model to {output_path}")
    #print("printing output path:  ")
    #print(output_path)
    #checkpoints = ModelCheckpoint(output_path, verbose=1, 
     #                         save_best_only=False,
      #                        save_weights_only=True, mode='auto', period=0)


    # serialize model to JSON
    #model_json = model.to_json()
    #with open("model.json", "w") as json_file:
     #   json_file.write(model_json)
# serialize weights to HDF5
    #model.save_weights("model.h5")
    #print("Saved model to disk")
    #callbacks_list = [checkpoints]
    #model.save('output_path')
    #model.save('sales_model.pb')
    #model.save(model)   
 #   joblib.dump(value=model, filename=output_path)

    run.tag("run_type", value="train")
    print(f"tags now present for run: {run.tags}")

    run.complete()
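
# Editor's note: the snippet above calls a register_dataset() helper that is not
# defined in this excerpt. Below is a minimal sketch of what such a helper might
# look like with azureml-core; the names and signature are assumptions, not the
# original implementation.
from azureml.core import Datastore, Dataset


def register_dataset(workspace, dataset_name, datastore_name, file_path):
    # Resolve the datastore by name and register a new tabular dataset version
    # from the given file path on that datastore.
    datastore = Datastore.get(workspace, datastore_name)
    dataset = Dataset.Tabular.from_delimited_files(path=(datastore, file_path))
    return dataset.register(workspace=workspace,
                            name=dataset_name,
                            create_new_version=True)
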
Beispiel #30
0
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--model-name',
        type=str,
        default='diabetes_model.pkl',
        help='the name of the model')
    arg('--dataset-name',
        type=str,
        help='dataset name; always pass the dataset by name to get the exact \
         version wanted instead of the one captured at pipeline creation')
    arg('--step-output', type=str, help='output of data passing to next step')
    arg('--dataset-version', type=str, help='wanted dataset version')
    arg('--dataset-file-path', type=str, help='new dataset to register')
    args = parser.parse_args()
    arg_dict = {
        'model_name': args.model_name,
        'dataset_name': args.dataset_name,
        'step_output': args.step_output,
        'dataset_version': args.dataset_version
    }
    pprint.pprint(arg_dict)

    model_name = args.model_name
    dataset_name = args.dataset_name
    step_output = args.step_output
    dataset_file_path = args.dataset_file_path
    dataset_version = args.dataset_version
    run = Run.get_context()

    with open('parameters.json') as f:
        pars = json.load(f)
    try:
        train_args = pars['training']
    except KeyError:
        print("'training' key not found in parameters.json")
        train_args = {}
    print(f'training params:{train_args}')
    for key, value in train_args.items():
        run.log(key, value)
        run.parent.log(key, value)

    if dataset_name:
        if dataset_file_path == 'none':
            dataset = Dataset.get_by_name(workspace=run.experiment.workspace,
                                          name=dataset_name,
                                          version=dataset_version)
        else:
            dataset = register_dataset(
                workspace=run.experiment.workspace,
                datastore_name=os.environ.get('DATASTORE_NAME'),
                dataset_name=dataset_name,
                file_path=dataset_file_path)
    else:
        raise Exception('No dataset provided')

    run.input_datasets['training_data'] = dataset
    run.parent.tag('dataset_id', dataset.id)
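    # Tagging the parent (pipeline) run as well makes the dataset id visible at
    # the pipeline level, not just on this step run.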

    df = dataset.to_pandas_dataframe()
    data = split_data(df)
    model = train_model(data, train_args)
    metrics = get_model_metrics(model, data)
    for key, value in metrics.items():
        run.log(key, value)
        run.parent.log(key, value)

    os.makedirs(step_output, exist_ok=True)
    model_path = os.path.join(step_output, model_name)
    joblib.dump(value=model, filename=model_path)

    os.makedirs('outputs', exist_ok=True)
    output_path = os.path.join('outputs', model_name)
    joblib.dump(value=model, filename=output_path)

    run.tag('run_type', value='train')
    run.complete()
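
# Editor's note: split_data, train_model and get_model_metrics above are project
# helpers that are not part of this excerpt. For a Ridge regression on the
# diabetes data (model_name defaults to diabetes_model.pkl) they might look
# roughly like the sketch below; the column name and hyperparameters are
# assumptions, not the original code.
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def split_data(df):
    # Assumes the label column is named "Y"; adjust to the actual dataset schema.
    X = df.drop(columns=["Y"]).values
    y = df["Y"].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return {"train": {"X": X_train, "y": y_train},
            "test": {"X": X_test, "y": y_test}}


def train_model(data, train_args):
    # train_args comes from parameters.json, e.g. {"alpha": 0.5}.
    reg = Ridge(**train_args)
    reg.fit(data["train"]["X"], data["train"]["y"])
    return reg


def get_model_metrics(model, data):
    preds = model.predict(data["test"]["X"])
    return {"mse": mean_squared_error(data["test"]["y"], preds)}
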
Beispiel #31
0
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from azureml.core.run import Run
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23+
import os
import numpy as np
import mylib

os.makedirs('./outputs', exist_ok=True)

X, y = load_diabetes(return_X_y=True)

run = Run.get_context()

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)
data = {"train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test}}

# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = mylib.get_alphas()

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])
Beispiel #32
0
def train(config,
          evaluate_only=False,
          outdir=".",
          detail=False,
          azureml=False):

    filename = config.model.filename
    categories_file = config.dataset.categories
    wav_directory = config.dataset.path
    batch_size = config.training.batch_size
    hidden_units = config.model.hidden_units
    architecture = config.model.architecture
    num_layers = config.model.num_layers
    use_gpu = config.training.use_gpu

    run = None

    if azureml:
        from azureml.core.run import Run
        run = Run.get_context()
        if run is None:
            print("### Run.get_context() returned None")
        else:
            print("### Running in Azure Context")

    valid_layers = [1, 2, 3]
    if num_layers not in valid_layers:
        raise Exception(
            "--num_layers can only be one of these values {}".format(
                valid_layers))

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    if not filename:
        filename = "{}{}KeywordSpotter.pt".format(architecture, hidden_units)
        config.model.filename = filename

    # load the featurized data
    if not os.path.isdir(wav_directory):
        print("### Error: please specify valid --dataset folder location: {}".
              format(wav_directory))
        sys.exit(1)

    if not categories_file:
        categories_file = os.path.join(wav_directory, "categories.txt")

    with open(categories_file, "r") as f:
        keywords = [x.strip() for x in f.readlines()]
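    # categories.txt is expected to list one keyword label per line.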

    training_file = os.path.join(wav_directory, "training_list.npz")
    testing_file = os.path.join(wav_directory, "testing_list.npz")
    validation_file = os.path.join(wav_directory, "validation_list.npz")

    if not os.path.isfile(training_file):
        print("Missing file {}".format(training_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(validation_file):
        print("Missing file {}".format(validation_file))
        print("Please run make_datasets.py")
        sys.exit(1)
    if not os.path.isfile(testing_file):
        print("Missing file {}".format(testing_file))
        print("Please run make_datasets.py")
        sys.exit(1)

    model = None

    device = torch.device("cpu")
    if use_gpu:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            print("### CUDA not available!!")

    print("Loading {}...".format(testing_file))
    test_data = AudioDataset(testing_file, config.dataset, keywords)

    log = None
    if not evaluate_only:
        print("Loading {}...".format(training_file))
        training_data = AudioDataset(training_file,
                                     config.dataset,
                                     keywords,
                                     training=True)

        print("Loading {}...".format(validation_file))
        validation_data = AudioDataset(validation_file, config.dataset,
                                       keywords)

        if training_data.mean is not None:
            fname = os.path.join(outdir, "mean.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.mean)
            fname = os.path.join(outdir, "std.npy")
            print("Saving {}".format(fname))
            np.save(fname, training_data.std)
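            # Persisting the normalization statistics allows the same mean/std
            # to be reapplied later (e.g. at evaluation or export time).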

            # use the training_data mean and std variation
            test_data.mean = training_data.mean
            test_data.std = training_data.std
            validation_data.mean = training_data.mean
            validation_data.std = training_data.std

        print("Training model {}".format(filename))
        model = create_model(config.model, training_data.input_size,
                             training_data.num_keywords)
        if device.type == 'cuda':
            model.cuda()  # move the processing to GPU

        start = time.time()
        log = model.fit(training_data, validation_data, config.training,
                        config.model.sparsify, device, detail, run)
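        # The Azure ML run object (or None) is passed into fit() above,
        # presumably so per-epoch metrics can be streamed to the run when
        # training in Azure.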
        end = time.time()

        passed, total, rate = model.evaluate(training_data, batch_size, device)
        print("Training accuracy = {:.3f} %".format(rate * 100))

        torch.save(model.state_dict(), os.path.join(outdir, filename))

    print(
        "Evaluating {} keyword spotter using {} rows of featurized test audio..."
        .format(architecture, test_data.num_rows))
    if model is None:
        msg = "Loading trained model with input size {}, hidden units {} and num keywords {}"
        print(
            msg.format(test_data.input_size, hidden_units,
                       test_data.num_keywords))
        model = create_model(config.model, test_data.input_size,
                             test_data.num_keywords)
        model.load_dict(torch.load(filename))
        if model and device.type == 'cuda':
            model.cuda()  # move the processing to GPU

    results_file = os.path.join(outdir, "results.txt")
    passed, total, rate = model.evaluate(test_data, batch_size, device,
                                         results_file)
    print("Testing accuracy = {:.3f} %".format(rate * 100))

    if not evaluate_only:
        name = os.path.splitext(filename)[0] + ".onnx"
        print("saving onnx file: {}".format(name))
        model.export(os.path.join(outdir, name), device)

        config.dataset.sample_rate = test_data.sample_rate
        config.dataset.input_size = test_data.audio_size
        config.dataset.num_filters = test_data.input_size
        config.dataset.window_size = test_data.window_size
        config.dataset.shift = test_data.shift

        logdata = {
            "accuracy_val": rate,
            "training_time": end - start,
            "log": log
        }
        d = TrainingConfig.to_dict(config)
        logdata.update(d)

        logname = os.path.join(outdir, "train_results.json")
        save_json(logdata, logname)

    return rate, log