def test_invalid_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97, 1000]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        with pytest.raises(ValueError):
            write_numpy_to_dense_tensor(f, array, label_data)
def upload_numpy_to_s3_shards(num_shards, s3, bucket, key_prefix, array, labels=None):
    """Upload the training ``array`` and ``labels`` arrays to ``num_shards`` S3 objects,
    stored in "s3://``bucket``/``key_prefix``/"."""
    shards = _build_shards(num_shards, array)
    if labels is not None:
        label_shards = _build_shards(num_shards, labels)
    uploaded_files = []
    if key_prefix[-1] != '/':
        key_prefix = key_prefix + '/'
    try:
        for shard_index, shard in enumerate(shards):
            with tempfile.TemporaryFile() as file:
                if labels is not None:
                    write_numpy_to_dense_tensor(file, shard, label_shards[shard_index])
                else:
                    write_numpy_to_dense_tensor(file, shard)
                file.seek(0)
                shard_index_string = str(shard_index).zfill(len(str(len(shards))))
                file_name = "matrix_{}.pbr".format(shard_index_string)
                key = key_prefix + file_name
                logger.debug("Creating object {} in bucket {}".format(key, bucket))
                s3.Object(bucket, key).put(Body=file)
                uploaded_files.append(file_name)
        manifest_key = key_prefix + ".amazon.manifest"
        manifest_str = json.dumps(
            [{'prefix': 's3://{}/{}'.format(bucket, key_prefix)}] + uploaded_files)
        s3.Object(bucket, manifest_key).put(Body=manifest_str.encode('utf-8'))
        return "s3://{}/{}".format(bucket, manifest_key)
    except Exception as ex:
        try:
            for file in uploaded_files:
                s3.Object(bucket, key_prefix + file).delete()
        finally:
            raise ex
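A minimal usage sketch of the function above. It assumes the function and its helpers (such as _build_shards) are importable, that boto3 credentials are configured, and that the bucket name and key prefix below are placeholders rather than values taken from the code:

# Usage sketch; bucket and prefix are placeholders, not real resources.
import boto3
import numpy as np

s3 = boto3.resource('s3')
features = np.random.rand(1000, 10).astype('float32')
labels = np.random.randint(0, 2, size=1000).astype('float32')

# Split the data into 3 RecordIO-protobuf shards plus a manifest and upload them;
# the returned manifest URI can then be used as a training channel input.
manifest_uri = upload_numpy_to_s3_shards(3, s3, 'my-example-bucket', 'example/prefix',
                                         features, labels)
print(manifest_uri)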
def test_int_write_numpy_to_dense_tensor():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array)
        f.seek(0)
        for record_data, expected in zip(_read_recordio(f), array_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
def test_float32_write_numpy_to_dense_tensor():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    array = np.array(array_data).astype(np.dtype('float32'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array)
        f.seek(0)
        for record_data, expected in zip(_read_recordio(f), array_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].float32_tensor.values == expected
def test_async_byo_estimator(sagemaker_session, region):
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_job_name = ""

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name,
                                     sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_float_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype('float64'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected, label in zip(_read_recordio(f), array_data, label_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
            assert record.label["values"].float64_tensor.values == [label]
def test_float32_label():
    array_data = [[1, 2, 3], [10, 20, 3]]
    array = np.array(array_data)
    label_data = np.array([99, 98, 97]).astype(np.dtype('float32'))
    with tempfile.TemporaryFile() as f:
        write_numpy_to_dense_tensor(f, array, label_data)
        f.seek(0)
        for record_data, expected, label in zip(_read_recordio(f), array_data, label_data):
            record = Record()
            record.ParseFromString(record_data)
            assert record.features["values"].int32_tensor.values == expected
            assert record.label["values"].float32_tensor.values == [label]
def pre_process(filename):
    data_dir = ".\\tmp\\userData\\"
    raw_file = os.path.join(data_dir, "raw", filename)
    print("raw data from {}".format(raw_file))

    raw = np.loadtxt(raw_file, delimiter=',')
    np.random.seed(1)
    np.random.shuffle(raw)

    session['feature_size'] = int(raw.shape[1] - 1)
    train_size = int(0.9 * raw.shape[0])
    train_feat = raw[:train_size, :-1]
    train_label = raw[:train_size, -1]
    test_feat = raw[train_size:, :-1]
    test_label = raw[train_size:, -1]

    # save to an s3 bucket
    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, train_feat, train_label)
    buf.seek(0)

    bucket = "cs218project2"
    prefix = "proofOfConcept-2020-04-27"
    key = 'sessionData'

    print("uploading")
    train_path = os.path.join(prefix, 'train', key)
    boto3.resource('s3').Bucket(bucket).Object(train_path).upload_fileobj(buf)
    session['train'] = 's3://{}/{}'.format(bucket, train_path)
    print('uploaded training data location: {}'.format(session['train']))

    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(buf, test_feat, test_label)
    buf.seek(0)

    print("uploading")
    test_path = os.path.join(prefix, 'test', key)
    boto3.resource('s3').Bucket(bucket).Object(test_path).upload_fileobj(buf)
    session['test'] = 's3://{}/{}'.format(bucket, test_path)
    print('uploaded test data location: {}'.format(session['test']))
def save_train_val_to_s3(sagemaker_bucket, sm_prefix, train_X, train_y, val_X, val_y):
    print('Saving training and validation data to S3')
    '''
    Now, we'll convert the datasets to the recordIO-wrapped protobuf format used by
    the Amazon SageMaker algorithms, and then upload this data to S3
    '''
    # First we'll convert the training data set
    train_file = 'linear_train.data'
    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
    f.seek(0)

    # s3_train_data_loc = os.path.join(sm_prefix, 'train', train_file)
    s3_train_data_loc = '/'.join([sm_prefix, 'train', train_file])
    print('Saving training data in RecordIO format to {}'.format(s3_train_data_loc))
    boto3.Session().resource('s3').Bucket(sagemaker_bucket).Object(
        s3_train_data_loc).upload_fileobj(f)

    # Now we'll convert the validation data set
    validation_file = 'linear_validation.data'
    f = io.BytesIO()
    smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
    f.seek(0)

    # s3_validation_data_loc = os.path.join(sm_prefix, 'validation', validation_file)
    s3_validation_data_loc = '/'.join([sm_prefix, 'validation', validation_file])
    print('Saving validation data in RecordIO format to {}'.format(s3_validation_data_loc))
    boto3.Session().resource('s3').Bucket(sagemaker_bucket).Object(
        s3_validation_data_loc).upload_fileobj(f)
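A minimal usage sketch for the function above. It assumes the function's imports (io, boto3, and sagemaker.amazon.common as smac) are available; the bucket name, prefix, and the randomly generated arrays are placeholders:

# Usage sketch with placeholder data and a placeholder bucket.
import numpy as np

train_X = np.random.rand(900, 20)
train_y = np.random.randint(0, 2, size=900)
val_X = np.random.rand(100, 20)
val_y = np.random.randint(0, 2, size=100)

save_train_val_to_s3('my-sagemaker-bucket', 'linear-learner-demo',
                     train_X, train_y, val_X, val_y)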
def test_byo_estimator(sagemaker_session, region):
    """Use the Factorization Machines algorithm as an example here.

    First we need to prepare data for training. We take a standard data set, convert it to the
    format that the algorithm can process, and upload it to S3.
    Then we create the Estimator and set the hyperparameters required by the algorithm.
    Next, we call fit() with the S3 path to the data.
    Later the trained model is deployed and prediction is called against the endpoint.
    The default predictor is updated with a JSON serializer and deserializer.
    """
    image_name = registry(region) + "/factorization-machines:1"

    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole',
                              train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session,
                              base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # training labels must be 'float32'
        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
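The tests above assign an fm_serializer that is defined elsewhere in the test module. As a point of reference, a minimal sketch of such a serializer (an assumption about its shape, not the test suite's exact implementation) builds the instances/features JSON payload that the Factorization Machines container accepts for application/json inference:

# Sketch of a JSON serializer for Factorization Machines inference
# (assumption: the real fm_serializer is defined elsewhere in the test module).
import json

def fm_serializer(data):
    # Wrap each input row as {"features": [...]} under a top-level "instances" list.
    js = {'instances': []}
    for row in data:
        js['instances'].append({'features': row.tolist()})
    return json.dumps(js)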
def clean_fn(dataset, flag, fit=True, ct=None):
    """
    Drop rows that contain a fraction of NaN above threshold thresh

    Parameters:
    -----------
    dataset (pandas.DataFrame) : dataframe for which the rows will be removed
    fit (Boolean) : whether the scikit-learn transformers shall be fitted on the data

    Returns:
    --------
    the dataset cleaned, scaled, and transformed, as well as the fitted scikit-learn transformers
    """
    levels_description = pd.read_excel(
        'DIAS Attributes - Values 2017_custom.xlsx',  # Added Data Type
        header=1,
        usecols=[1, 2, 3, 4, 5, 6]).fillna(method='ffill')
    features_description = pd.read_excel(
        'DIAS Information Levels - Attributes 2017.xlsx',
        header=1,
        usecols=[1, 2, 3, 4, 5, 6]).fillna(method='ffill').fillna(method='bfill')
    global_info = (pd.merge(levels_description,
                            features_description,
                            how='inner',
                            on='Attribute').drop(
                                ['Additional notes', 'Description_y'], axis=1))

    # columns to drop based on information gathered from the Excel files
    to_drop = [
        'CAMEO_DEUG_2015', 'CAMEO_DEU_2015', 'LP_LEBENSPHASE_FEIN',
        'LP_LEBENSPHASE_GROB', 'LP_FAMILIE_FEIN', 'LP_FAMILIE_GROB',
        'LP_STATUS_FEIN'
    ]
    # columns to drop based on detailed analysis of the remaining features not in the Excel files
    to_drop.extend([
        "ALTER_KIND2", "ALTER_KIND3", "ALTER_KIND4", "ALTER_KIND1",
        "D19_DIGIT_SERV", "D19_BANKEN_LOKAL", "D19_VERSI_OFFLINE_DATUM",
        "D19_BANKEN_REST", "D19_VERSI_ONLINE_DATUM", "D19_GARTEN",
        "D19_TELKO_ANZ_12", "D19_BANKEN_ANZ_24", "D19_ENERGIE",
        "D19_VERSI_ANZ_12", "D19_BANKEN_ANZ_12", "D19_BANKEN_GROSS",
        "D19_BIO_OEKO", "D19_NAHRUNGSERGAENZUNG", "D19_TELKO_ANZ_24",
        "D19_TELKO_ONLINE_QUOTE_12", "D19_SAMMELARTIKEL", "D19_KOSMETIK",
        "D19_DROGERIEARTIKEL", "D19_WEIN_FEINKOST", "D19_VERSAND_REST",
        "D19_TELKO_MOBILE", "D19_TELKO_REST", "D19_VERSI_ANZ_24",
        "D19_VERSICHERUNGEN", "D19_VERSI_DATUM", "D19_LEBENSMITTEL",
        "D19_SCHUHE", "D19_VERSI_ONLINE_QUOTE_12", "D19_KINDERARTIKEL",
        "D19_HAUS_DEKO", "D19_BANKEN_DIREKT", "D19_BILDUNG", "D19_RATGEBER",
        "D19_HANDWERK", "D19_FREIZEIT", "ANZ_KINDER", "D19_LOTTO",
        "ALTERSKATEGORIE_FEIN", "EINGEZOGENAM_HH_JAHR", "EINGEFUEGT_AM"
    ])

    print(f'number of columns before manual dropping : {dataset.shape[1]}')
    dataset = dataset.drop(to_drop, axis=1)
    print(f'number of columns after manual dropping : {dataset.shape[1]}')

    # first, clean 'X' and 'XX' values that appear and replace them by NaN
    dataset = dataset.replace('X', np.nan)
    dataset = dataset.replace('XX', np.nan)

    # then process effectively
    avant_list, main_list = identify_mainstream(global_info)
    print(f'shape before processing : {dataset.shape}')
    dataset = process_specific_columns(dataset, avant_list, main_list)
    print(f'shape after processing : {dataset.shape}')

    # make non-natural nan values consistent
    nan_info, replacements = construct_fill_na_new(global_info, dataset)
    dataset = make_replacement(dataset, replacements)

    # replace non-natural nan by np.nan
    dataset = fill_na_presc(dataset, nan_info)

    # print which columns will get removed due to too much NaN
    thresh = .65
    identify_na_columns(dataset, thresh)
    dataset = drop_na_columns(dataset, thresh)

    # drop rows due to too much NaN
    # dataset = drop_na_rows(dataset, .05)  # not performed based on exploratory analysis of mailout training dataset

    # save cleaned data to S3
    dataset = dataset.reindex(sorted(dataset.columns), axis=1)
    dataset.to_pickle(f'{CLEANED_DATA_SAVEPTH_S3}/{flag}_cleaned_df.pkl')

    # Save index & columns since LNR will be removed for future operations
    # and scikit does not preserve indices
    with open(f"columns_{flag}_cleaned.csv", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(dataset.columns.values)
    with open(f"index_{flag}_cleaned.csv", "w") as f:
        wr = csv.writer(f, delimiter="\n")
        wr.writerow(dataset['LNR'].values)  # index is contained in the LNR column

    # and upload those to S3 as well
    sagemaker.s3.S3Uploader.upload(f'columns_{flag}_cleaned.csv', f'{CLEANED_DATA_SAVEPTH_S3}')
    sagemaker.s3.S3Uploader.upload(f'index_{flag}_cleaned.csv', f'{CLEANED_DATA_SAVEPTH_S3}')

    # Transform, scale, impute
    # First, pop identification column (LNR)
    dataset.drop('LNR', axis=1, inplace=True)

    # identify categorical vs numerical
    cat_columns = identify_categorical_from_analysis(global_info, dataset)
    cat_columns = list(
        set(cat_columns).union(
            list(dataset.columns[dataset.dtypes == 'object'])))
    num_columns = list(set(dataset.columns).difference(set(cat_columns)))
    print(f'total number of columns: {dataset.shape[1]},\n'
          f'number of categorical: {len(cat_columns)},\n'
          f'number of numerical: {len(num_columns)}')

    # define the transformation pipelines
    numeric_pipeline = make_pipeline(
        SimpleImputer(strategy='mean', missing_values=np.nan), MinMaxScaler())
    categorical_pipeline = make_pipeline(
        SimpleImputer(strategy='most_frequent', missing_values=np.nan),
        OneHotEncoder(handle_unknown='ignore'))
    ct = make_column_transformer((numeric_pipeline, num_columns),
                                 (categorical_pipeline, cat_columns))

    if fit:
        # fit_transform
        dataset_X = ct.fit_transform(dataset)
    else:
        dataset_X = ct.transform(dataset)

    # reconstructing a dataframe
    dataset = pd.DataFrame(dataset_X, columns=get_ct_feature_names(ct))
    dataset['LNR'] = pd.read_csv(
        f'{CLEANED_DATA_SAVEPTH_S3}/index_{flag}_cleaned.csv',
        header=None).values
    print(f'following imputing, scaling, transforming, dataset has {dataset.shape[1]} features')

    # Send transformed data to S3
    dataset = dataset.reindex(sorted(dataset.columns), axis=1)
    dataset.to_pickle(
        f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_complete_transformed_df.pkl')

    # Send transformed data to S3 in RecordIO format
    dataset_X = dataset_X.astype('float32', copy=False)
    buf = io.BytesIO()
    # write_spmatrix_to_sparse_tensor(buf, transformed_data)  # also produces RecordIO, for sparse matrices
    write_numpy_to_dense_tensor(buf, dataset_X)
    buf.seek(0)
    boto3.resource('s3').Bucket(bucket).Object(
        f'{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_array').upload_fileobj(buf)
    print(f'recordIO data has been saved to s3://{bucket}/{TRANSFORMED_DATA_SAVEPTH_S3}/{flag}_array')

    return dataset, ct
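A minimal usage sketch for clean_fn. The CSV path and DataFrame name are placeholders, and it assumes the module-level constants (CLEANED_DATA_SAVEPTH_S3, TRANSFORMED_DATA_SAVEPTH_S3, bucket) and helper functions used inside clean_fn are configured:

# Usage sketch with a placeholder input file.
import pandas as pd

azdias_df = pd.read_csv('demographics.csv', sep=';')

# Fit the imputing/scaling/encoding transformers on this dataset and keep the
# fitted ColumnTransformer for reuse on a second dataset with fit=False.
azdias_clean, ct = clean_fn(azdias_df, flag='azdias', fit=True)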
# R Squared Score
regressor.score(X, y) * 100

# In[61]:

X_train.shape

# ## Prepare for Sagemaker Setup

# In[55]:

train_file = "class_train.data"
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, X_train.astype('float32'), y_train.astype('float32'))
f.seek(0)
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'train', train_file)).upload_fileobj(f)

# In[56]:

val_file = "class_val.data"
f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, X_val.astype('float32'), y_val.astype('float32'))
f.seek(0)
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'validation', val_file)).upload_fileobj(f)  # assumed completion, mirroring the training upload above
file1['Cuisine'] = file1_cuisine

features, labels = file2.iloc[:, [1, 2, 3]].astype('float32'), file2.iloc[:, 4].astype('float32')
# features, labels = file2.iloc[:, [0, 1, 2, 3]], file2.iloc[:, 4].astype(int)
features = np.array(features)
labels = np.array(labels)

container = get_image_uri(boto3.Session().region_name, 'linear-learner')
role = get_execution_role()
bucket = 'sagemakercchw2'
# prefix = 'sagemaker/DEMO'
output_location = 's3://{}'.format(bucket)
print('training artifacts will be uploaded to: {}'.format(output_location))

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, features, labels)
buf.seek(0)

key = 'file2'
boto3.resource('s3').Bucket(bucket).Object(os.path.join('train', key)).upload_fileobj(buf)
s3_train_data = 's3://{}/train/{}'.format(bucket, key)
print('uploaded training data location: {}'.format(s3_train_data))

sess = sagemaker.Session()
linear = sagemaker.estimator.Estimator(container,
                                       role,
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)
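A typical next step after the snippet above (a sketch, not part of the original code) is to set the linear-learner hyperparameters and launch training against the uploaded RecordIO data; the hyperparameter values here are illustrative assumptions:

# Sketch of launching training; uses the `linear` estimator and `s3_train_data`
# from the snippet above. Hyperparameter values are illustrative, not from the source.
linear.set_hyperparameters(feature_dim=3,            # three feature columns were selected above
                           predictor_type='binary_classifier',
                           mini_batch_size=100)
linear.fit({'train': s3_train_data})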