def test_invalid_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
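    # four labels for a two-row array: the shape mismatch should be rejected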
    label_data = np.array([99, 98, 97, 1000]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(ValueError):
            write_numpy_to_dense_tensor(f, array, label_data)
def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=None):
    """Upload the training ``array`` and ``labels`` arrays to ``num_shards`` s3 objects,
    stored in "s3://``bucket``/``key_prefix``/"."""
    shards = _build_shards(num_shards, array)
    if labels is not None:
        label_shards = _build_shards(num_shards, labels)
    uploaded_files = []
    if key_prefix[-1] != '/':
        key_prefix = key_prefix + '/'
    try:
        for shard_index, shard in enumerate(shards):
            with tempfile.TemporaryFile() as file:
                if labels is not None:
                    write_numpy_to_dense_tensor(file, shard, label_shards[shard_index])
                else:
                    write_numpy_to_dense_tensor(file, shard)
                file.seek(0)
                shard_index_string = str(shard_index).zfill(len(str(len(shards))))
                file_name = "matrix_{}.pbr".format(shard_index_string)
                key = key_prefix + file_name
                logger.debug("Creating object {} in bucket {}".format(key, bucket))
                s3.Object(bucket, key).put(Body=file)
                uploaded_files.append(file_name)
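        # The manifest lists the common S3 prefix followed by each shard's file
        # name, which is the layout SageMaker expects for a manifest data source.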
        manifest_key = key_prefix + ".amazon.manifest"
        manifest_str = json.dumps(
            [{'prefix': 's3://{}/{}'.format(bucket, key_prefix)}] + uploaded_files)
        s3.Object(bucket, manifest_key).put(Body=manifest_str.encode('utf-8'))
        return "s3://{}/{}".format(bucket, manifest_key)
    except Exception as ex:
        try:
            for file in uploaded_files:
                s3.Object(bucket, key_prefix + file).delete()
        finally:
            raise ex
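
# A minimal usage sketch for upload_numpy_to_s3_shards: the bucket name, key
# prefix, and shard count below are illustrative assumptions, not values taken
# from the snippets in this file.
def example_upload_shards():
    import boto3
    import numpy as np

    s3 = boto3.resource('s3')
    features = np.random.rand(1000, 10).astype('float32')
    labels = np.random.randint(0, 2, size=1000).astype('float32')
    # Splits both arrays into 4 shards, writes each shard as recordIO-wrapped
    # protobuf, uploads the shards under the prefix, and returns the S3 URI of
    # the generated .amazon.manifest object.
    manifest_uri = upload_numpy_to_s3_shards(
        4, s3, 'example-bucket', 'training/shards', features, labels)
    print(manifest_uri)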
def test_int_write_numpy_to_dense_tensor():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array)
        f.seek(0)
        for record_data, expected in zip(_read_recordio(f), array_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
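
# The tests above iterate over records with _read_recordio, which is not shown in
# these snippets. A rough sketch of such a reader (the name _read_recordio_sketch
# is hypothetical), assuming the recordIO framing used by
# write_numpy_to_dense_tensor: a 4-byte magic number (0xced7230a), a 4-byte
# payload length, the payload, then zero padding to a 4-byte boundary.
import struct


def _read_recordio_sketch(f):
    """Yield the raw protobuf payload of each record in a recordIO stream."""
    while True:
        header = f.read(4)
        if len(header) < 4:
            return  # end of stream
        kmagic, = struct.unpack('I', header)
        assert kmagic == 0xced7230a
        length, = struct.unpack('I', f.read(4))
        payload = f.read(length)
        f.read((4 - length % 4) % 4)  # skip padding to the next 4-byte boundary
        yield payload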
def test_float32_write_numpy_to_dense_tensor():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    array = np.array(array_data).astype(np.dtype('float32'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array)
        f.seek(0)
        for record_data, expected in zip(_read_recordio(f), array_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].float32_tensor.values == expected
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_float_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected, label in zip(_read_recordio(f), array_data, label_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
            assert record.label["values"].float64_tensor.values == [label]
def test_float32_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype('float32'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected, label in zip(_read_recordio(f), array_data, label_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
            assert record.label["values"].float32_tensor.values == [label]
def pre_process(filename):
    data_dir = ".\\tmp\\userData\\"
    raw_file = os.path.join(data_dir, "raw", filename)

    print("raw data from {}".format(raw_file))
    raw = np.loadtxt(raw_file, delimiter=',')

    np.random.seed(1)
    np.random.shuffle(raw)
    session['feature_size'] = int(raw.shape[1] - 1)

    train_size = int(0.9 * raw.shape[0])
    train_feat = raw[:train_size, :-1]
    train_label = raw[:train_size, -1]
    test_feat = raw[train_size:, :-1]
    test_label = raw[train_size:, -1]

    # save to an s3 bucket
    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, train_feat, train_label)
    buf.seek(0)

    bucket = "cs218project2"
    prefix = "proofOfConcept-2020-04-27"
    key = 'sessionData'

    print("uploading")
    train_path = os.path.join(prefix, 'train', key)
    boto3.resource('s3').Bucket(bucket).Object(train_path).upload_fileobj(buf)
    session['train'] = 's3://{}/{}'.format(bucket, train_path)
    print('uploaded training data location: {}'.format(session['train']))

    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, test_feat, test_label)
    buf.seek(0)

    print("uploading")
    test_path = os.path.join(prefix, 'test', key)
    boto3.resource('s3').Bucket(bucket).Object(test_path).upload_fileobj(buf)
    session['test'] = 's3://{}/{}'.format(bucket, test_path)
    print('uploaded test data location: {}'.format(session['test']))
def save_train_val_to_s3(sagemaker_bucket, sm_prefix, train_X, train_y, val_X,
                         val_y):

    print('Saving training and validation data to S3')
    '''
    Now, we'll convert the datasets to the recordIO-wrapped protobuf format used by the Amazon SageMaker
    algorithms, and then upload this data to S3
    '''

    # First we'll convert the training data set
    train_file = 'linear_train.data'

    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'),
                                     train_y.astype('float32'))
    f.seek(0)

    #    s3_train_data_loc = os.path.join(sm_prefix, 'train', train_file)
    s3_train_data_loc = '/'.join([sm_prefix, 'train', train_file])
    print('Saving training data in RecordIO format to {}'.format(
        s3_train_data_loc))
    boto3.Session().resource('s3').Bucket(sagemaker_bucket).Object(
        s3_train_data_loc).upload_fileobj(f)

    # Now we'll convert the validation data set
    validation_file = 'linear_validation.data'

    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'),
                                     val_y.astype('float32'))
    f.seek(0)

    #    s3_validation_data_loc = os.path.join(sm_prefix, 'validation', validation_file)
    s3_validation_data_loc = '/'.join(
        [sm_prefix, 'validation', validation_file])
    print('Saving validation data in RecordIO format to {}'.format(
        s3_validation_data_loc))
    boto3.Session().resource('s3').Bucket(sagemaker_bucket).Object(
        s3_validation_data_loc).upload_fileobj(f)
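
# A minimal usage sketch for save_train_val_to_s3, assuming NumPy feature/label
# arrays already split into train and validation sets; the bucket and prefix
# names are hypothetical.
def example_save_train_val():
    import numpy as np

    X = np.random.rand(500, 8)
    y = (X.sum(axis=1) > 4.0).astype('float32')
    save_train_val_to_s3('example-sagemaker-bucket', 'linear-demo',
                         X[:400], y[:400], X[400:], y[400:])
    # The objects land at
    #   s3://example-sagemaker-bucket/linear-demo/train/linear_train.data
    #   s3://example-sagemaker-bucket/linear-demo/validation/linear_validation.data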
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take standard data set, convert it to the
    format that the algorithm can process and upload it to S3.
    Then we create the Estimator and set hyperparamets as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
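
# Both BYO tests assign fm_serializer to the predictor, but its definition is not
# shown in these snippets. A sketch of what such a serializer could look like,
# assuming the JSON inference request format of the Factorization Machines
# algorithm ({"instances": [{"features": [...]}, ...]}); the name
# fm_serializer_sketch is hypothetical.
import json


def fm_serializer_sketch(data):
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    return json.dumps(js)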
def clean_fn(dataset, flag, fit=True, ct=None):
    """
    Drop rows that contain a fraction of NaN above threshold thresh
    
    Parameters:
    -----------
    dataset (pandas.DataFrame) : dataframe for which the rows will be removed
    fit (Boolean) : wheter the scikit-learn transformers shall be fitted on the data
    
    Returns:
    --------
    the dataset cleaned, scaled, transformed as well as well as the fitted scikit-learn transformers
       
    """

    levels_description = pd.read_excel(
        'DIAS Attributes - Values 2017_custom.xlsx',  # Added Data Type
        header=1,
        usecols=[1, 2, 3, 4, 5, 6]).fillna(method='ffill')
    features_description = pd.read_excel(
        'DIAS Information Levels - Attributes 2017.xlsx',
        header=1,
        usecols=[1, 2, 3, 4, 5,
                 6]).fillna(method='ffill').fillna(method='bfill')

    global_info = (pd.merge(levels_description,
                            features_description,
                            how='inner',
                            on='Attribute').drop(
                                ['Additional notes', 'Description_y'], axis=1))

    # columns to drop based on information gathered from the Excel files
    to_drop = [
        'CAMEO_DEUG_2015', 'CAMEO_DEU_2015', 'LP_LEBENSPHASE_FEIN',
        'LP_LEBENSPHASE_GROB', 'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB',
        'LP_STATUS_FEIN'
    ]

    # columns to drop based on detailed analysis of the remaining features not in the Excel files
    to_drop.extend([
        "ALTER_KIND2", "ALTER_KIND3", "ALTER_KIND4", "ALTER_KIND1",
        "D19_DIGIT_SERV", "D19_BANKEN_LOKAL", "D19_VERSI_OFFLINE_DATUM",
        "D19_BANKEN_REST", "D19_VERSI_ONLINE_DATUM", "D19_GARTEN",
        "D19_TELKO_ANZ_12", "D19_BANKEN_ANZ_24", "D19_ENERGIE",
        "D19_VERSI_ANZ_12", "D19_BANKEN_ANZ_12", "D19_BANKEN_GROSS",
        "D19_BIO_OEKO", "D19_NAHRUNGSERGAENZUNG", "D19_TELKO_ANZ_24",
        "D19_TELKO_ONLINE_QUOTE_12", "D19_SAMMELARTIKEL", "D19_KOSMETIK",
        "D19_DROGERIEARTIKEL", "D19_WEIN_FEINKOST", "D19_VERSAND_REST",
        "D19_TELKO_MOBILE", "D19_TELKO_REST", "D19_VERSI_ANZ_24",
        "D19_VERSICHERUNGEN", "D19_VERSICHERUNGEN", "D19_VERSI_DATUM",
        "D19_LEBENSMITTEL", "D19_SCHUHE", "D19_VERSI_ONLINE_QUOTE_12",
        "D19_KINDERARTIKEL", "D19_HAUS_DEKO", "D19_BANKEN_DIREKT",
        "D19_BILDUNG", "D19_RATGEBER", "D19_HANDWERK", "D19_FREIZEIT",
        "ANZ_KINDER", "D19_LOTTO", "ALTERSKATEGORIE_FEIN",
        "EINGEZOGENAM_HH_JAHR", "EINGEFUEGT_AM"
    ])

    print(f'number of columns before manual dropping : {dataset.shape[1]}')
    dataset = dataset.drop(to_drop, axis=1)
    print(f'number of columns after manual dropping : {dataset.shape[1]}')

    # first, clean 'X' and 'XX' values that appear and replace them by NaN
    dataset = dataset.replace('X', np.nan)
    dataset = dataset.replace('XX', np.nan)

    # then process effectively
    avant_list, main_list = identify_mainstream(global_info)
    print(f'shape before processing : {dataset.shape}')
    dataset = process_specific_columns(dataset, avant_list, main_list)
    print(f'shape after processing : {dataset.shape}')

    # make non-natural nan values consistent
    nan_info, replacements = construct_fill_na_new(global_info, dataset)
    dataset = make_replacement(dataset, replacements)

    # replace non-natural nan by np.nan
    dataset = fill_na_presc(dataset, nan_info)

    # print which columns will get removed due to too much NaN
    thresh = .65
    identify_na_columns(dataset, thresh)
    dataset = drop_na_columns(dataset, thresh)

    # drop rows due to too much NaN
    # dataset = drop_na_rows(dataset, .05)
    # not performed based on exploratory analysis of mailout training dataset

    # save cleaned data to S3
    dataset = dataset.reindex(sorted(dataset.columns), axis=1)
    dataset.to_pickle(f'{CLEANED_DATA_SAVEPTH_S3}/{flag}_cleaned_df.pkl')

    # Save index & columns since LNR will be removed for future operations
    # and scikit does not preserve indices
    with open(f"columns_{flag}_cleaned.csv", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(dataset.columns.values)

    with open(f"index_{flag}_cleaned.csv", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(dataset['LNR'].values)  # index is contained in LNR columns

    # and upload those to S3 as well
    sagemaker.s3.S3Uploader.upload(f'columns_{flag}_cleaned.csv',
                                   f'{CLEANED_DATA_SAVEPTH_S3}')

    sagemaker.s3.S3Uploader.upload(f'index_{flag}_cleaned.csv',
                                   f'{CLEANED_DATA_SAVEPTH_S3}')

    # Transform, scale, Input

    # First, pop identification column (LNR)
    dataset.drop('LNR', axis=1, inplace=True)

    # identify categorical vs numerical
    cat_columns = identify_categorical_from_analysis(global_info, dataset)
    cat_columns = list(
        set(cat_columns).union(
            list(dataset.columns[dataset.dtypes == 'object'])))

    num_columns = list(set(dataset.columns).difference(set(cat_columns)))

    print(f'total number of columns:'
          f'{dataset.shape[1]},\nnumber of categorical:{len(cat_columns)},\n'
          f'number of numerical:{len(num_columns)}')

    # define the transformation pipelines
    numeric_pipeline = make_pipeline(
        SimpleImputer(strategy='mean', missing_values=np.nan), MinMaxScaler())
    categorical_pipeline = make_pipeline(
        SimpleImputer(strategy='most_frequent', missing_values=np.nan),
        OneHotEncoder(handle_unknown='ignore'))

    if fit or ct is None:
        # build a fresh transformer and fit it on this dataset
        ct = make_column_transformer((numeric_pipeline, num_columns),
                                     (categorical_pipeline, cat_columns))
        dataset_X = ct.fit_transform(dataset)
    else:
        # reuse the transformer that was fitted on the training data
        dataset_X = ct.transform(dataset)

    # reconstructing a dataframe
    dataset = pd.DataFrame(dataset_X, columns=get_ct_feature_names(ct))
    dataset['LNR'] = pd.read_csv(
        f'{CLEANED_DATA_SAVEPTH_S3}/index_{flag}_cleaned.csv',
        header=None).values

    print(
        f'following imputing, scaling, transforming, dataset has {dataset.shape[1]} features'
    )

    # Send transformed data to S3
    dataset = dataset.reindex(sorted(dataset.columns), axis=1)
    dataset.to_pickle(
        f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_complete_transformed_df.pkl')

    # Send transformed data to S3 as recordIO format
    dataset_X = dataset_X.astype('float32', copy=False)

    buf = io.BytesIO()
    # write_spmatrix_to_sparse_tensor(buf, transformed_data) would also produce recordIO, for sparse input
    write_numpy_to_dense_tensor(buf, dataset_X)
    buf.seek(0)

    boto3.resource('s3').Bucket(bucket).Object(
        f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_array').upload_fileobj(buf)

    print(
        f'recordIO data has been saved to s3://{bucket}/{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_array'
    )

    return dataset, ct
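
# clean_fn calls get_ct_feature_names to name the columns of the transformed
# array, but that helper is not shown here. A rough sketch of such a helper
# (the name get_ct_feature_names_sketch is hypothetical), assuming each pipeline
# in the ColumnTransformer ends either in a OneHotEncoder or in a transformer
# that keeps the input columns unchanged:
def get_ct_feature_names_sketch(column_transformer):
    feature_names = []
    for name, transformer, columns in column_transformer.transformers_:
        if transformer in ('drop', 'passthrough'):
            continue  # skip the 'remainder' entry added by make_column_transformer
        last_step = transformer.steps[-1][1]
        if hasattr(last_step, 'get_feature_names'):
            # OneHotEncoder: one output column per (input column, category) pair
            feature_names.extend(last_step.get_feature_names(columns))
        else:
            # numeric pipeline: the input column names pass through unchanged
            feature_names.extend(columns)
    return feature_names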
# R Squared Score
regressor.score(X, y) * 100

# In[61]:

X_train.shape

# ## Prepare for Sagemaker Setup

# In[55]:

train_file = "class_train.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, X_train.astype('float32'),
                                 y_train.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'train', train_file)).upload_fileobj(f)

# In[56]:

val_file = "class_val.data"

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, X_val.astype('float32'),
                                 y_val.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(
    # completing the truncated call; the 'validation' key segment is an assumption
    # mirroring the training upload above
    os.path.join(prefix, 'validation', val_file)).upload_fileobj(f)

file1['Cuisine'] = file1_cuisine
features, labels = file2.iloc[:, [1, 2, 3]].astype(
    'float32'), file2.iloc[:, 4].astype('float32')
# features, labels = file2.iloc[:, [0, 1, 2, 3]], file2.iloc[:, 4].astype(int)
features = np.array(features)
labels = np.array(labels)

container = get_image_uri(boto3.Session().region_name, 'linear-learner')
role = get_execution_role()
bucket = 'sagemakercchw2'
# prefix = 'sagemaker/DEMO'
output_location = 's3://{}'.format(bucket)
print('training artifacts will be uploaded to: {}'.format(output_location))

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, features, labels)
buf.seek(0)
key = 'file2'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(
    'train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/train/{}'.format(bucket, key)
print('uploaded training data location: {}'.format(s3_train_data))

sess = sagemaker.Session()
linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)
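
# A hedged continuation of the snippet above: the hyperparameter values are
# illustrative assumptions, except that feature_dim must match the three feature
# columns uploaded to s3_train_data; the label column is float32, so
# predictor_type is assumed to be 'regressor'.
linear.set_hyperparameters(feature_dim=3,
                           predictor_type='regressor',
                           mini_batch_size=32)
linear.fit({'train': s3_train_data})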