from sagemaker import LinearLearner #import the LinearLearner model

#specify an output path for your model artifacts

prefix = 'fraud'
output_path = 's3://{}/{}'.format(bucket, prefix)

"""Instantiate LinearLearner model(set predictor type to binary classifier),
set positive example weight to balanced, AWS version of smote and set training target
recall to 90%(0.9)"""

linear_balanced = LinearLearner(role=role,
                                train_instance_count=1,
                                train_instance_type='ml.c4.xlarge',
                                predictor_type='binary_classifier',
                                output_path=output_path,
                                sagemaker_session=sagemaker_session,
                                epochs=15,
                                binary_classifier_model_selection_criteria='precision_at_target_recall', # target recall
                                target_recall=0.9, #tune the model for best recall value
                                positive_example_weight_mult='balanced')

"Convert train data to AWS algorithm data format -- record_set"
# convert features/labels to numpy
train_x_np = train_features.astype('float32')
train_y_np = train_labels.astype('float32')

# create RecordSet
formatted_train_data = linear.record_set(train_x_np, labels=train_y_np)


"train the estimator on formatted training data"
def test_linearlearner_airflow_config_uploads_data_source_to_s3(
    sagemaker_session, cpu_instance_type
):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        train_set[1][:100] = 1
        train_set[1][100:200] = 0
        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))

        ll = LinearLearner(
            ROLE,
            1,
            cpu_instance_type,
            predictor_type="binary_classifier",
            sagemaker_session=sagemaker_session,
        )
        ll.binary_classifier_model_selection_criteria = "accuracy"
        ll.target_recall = 0.5
        ll.target_precision = 0.5
        ll.positive_example_weight_mult = 0.1
        ll.epochs = 1
        ll.use_bias = True
        ll.num_models = 1
        ll.num_calibration_samples = 1
        ll.init_method = "uniform"
        ll.init_scale = 0.5
        ll.init_sigma = 0.2
        ll.init_bias = 5
        ll.optimizer = "adam"
        ll.loss = "logistic"
        ll.wd = 0.5
        ll.l1 = 0.5
        ll.momentum = 0.5
        ll.learning_rate = 0.1
        ll.beta_1 = 0.1
        ll.beta_2 = 0.1
        ll.use_lr_scheduler = True
        ll.lr_scheduler_step = 2
        ll.lr_scheduler_factor = 0.5
        ll.lr_scheduler_minimum_lr = 0.1
        ll.normalize_data = False
        ll.normalize_label = False
        ll.unbias_data = True
        ll.unbias_label = False
        ll.num_point_for_scaler = 10000
        ll.margin = 1.0
        ll.quantile = 0.5
        ll.loss_insensitivity = 0.1
        ll.huber_delta = 0.1
        ll.early_stopping_tolerance = 0.0001
        ll.early_stopping_patience = 3

        records = ll.record_set(train_set[0][:200], train_set[1][:200])

        training_config = _build_airflow_workflow(
            estimator=ll, instance_type=cpu_instance_type, inputs=records
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
Beispiel #3
0
# import LinearLearner
from sagemaker import LinearLearner

# instantiate LinearLearner
#Solution

# specify an output path
prefix = 'creditcard'
output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate LinearLearner
linear = LinearLearner(role=role,
                       train_instance_count=1,
                       train_instance_type='ml.c4.xlarge',
                       predictor_type='binary_classifier',
                       output_path=output_path,
                       sagemaker_session=sagemaker_session,
                       epochs=15)

# ### EXERCISE: Convert data into a RecordSet format
#
# Next, prepare the data for a built-in model by converting the train features and labels into numpy array's of float values. Then you can use the [record_set function](https://sagemaker.readthedocs.io/en/stable/linear_learner.html#sagemaker.LinearLearner.record_set) to format the data as a RecordSet and prepare it for training!

# In[13]:

# create RecordSet of training data
#formatted_train_data = None
# convert features/labels to numpy
train_x_np = train_features.astype('float32')
train_y_np = train_labels.astype('float32')