def test_random_search(self):
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("Python Spark SQL basic example") \
        .getOrCreate()

    # Load training data
    df = spark.read.format("libsvm").load("./tests/sample_libsvm_data.txt").repartition(8)
    df.printSchema()

    backend = SparkBackend(spark_context=spark.sparkContext, num_workers=3)
    store = LocalStore('/tmp')

    ######## Random Search ###########
    search_space = {'lr': hp_choice([0.01, 0.001, 0.0001])}
    random_search = RandomSearch(backend, store, estimator_gen_fn, search_space, 3, 1,
                                 validation=0.25, evaluation_metric='loss',
                                 feature_columns=['features'], label_columns=['label'])
    model = random_search.fit(df)

    output_df = model.transform(df)
    output_df.select('label', 'label__output').show(n=10)
    assert True
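# Note: the test above assumes an estimator_gen_fn defined at module scope. A minimal
# sketch, mirroring the Keras + SparkEstimator generator defined inside the TPE test
# below (layer sizes and batch size are illustrative only; SparkEstimator is assumed to
# be imported as in the surrounding examples, e.g. from cerebro.keras):
def estimator_gen_fn(params):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Input(shape=(692,), name='features'))
    model.add(tf.keras.layers.Dense(100))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    optimizer = tf.keras.optimizers.Adam(lr=params['lr'])
    return SparkEstimator(model=model, optimizer=optimizer, loss='binary_crossentropy',
                          metrics=['acc'], batch_size=10)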
def test_tpe(self):
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("Python Spark SQL basic example") \
        .getOrCreate()

    # Load training data
    df = spark.read.format("libsvm").load(
        "./tests/sample_libsvm_data.txt").repartition(8)
    df.printSchema()

    backend = SparkBackend(spark_context=spark.sparkContext, num_workers=3)
    store = LocalStore('/tmp')

    def estimator_gen_fn(params):
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Input(shape=(692,), name='features'))
        model.add(tf.keras.layers.Dense(100, input_dim=692))
        model.add(tf.keras.layers.Dense(1, input_dim=100))
        model.add(tf.keras.layers.Activation('sigmoid'))

        optimizer = tf.keras.optimizers.Adam(lr=params['lr'])
        loss = 'binary_crossentropy'

        keras_estimator = SparkEstimator(model=model,
                                         optimizer=optimizer,
                                         loss=loss,
                                         metrics=['acc'],
                                         batch_size=10)
        return keras_estimator

    search_space = {
        'lr': hp_choice([0.01, 0.001, 0.0001]),
        'dummy1': hp_uniform(0, 100),
        'dummy2': hp_quniform(0, 100, 1),
        'dummy3': hp_qloguniform(0, 100, 1),
    }

    hyperopt = TPESearch(backend=backend, store=store, estimator_gen_fn=estimator_gen_fn,
                         search_space=search_space, num_models=3, num_epochs=1,
                         validation=0.25, evaluation_metric='loss',
                         feature_columns=['features'], label_columns=['label'], verbose=2)

    model = hyperopt.fit(df)

    output_df = model.transform(df)
    output_df.select('label', 'label__output').show(n=10)
    assert True
def main():
    SPARK_MASTER_URL = 'spark://...'  # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...'  # Change pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...'  # Change Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7  # Fraction of train data. The remainder is validation data.
    ROW_GROUP_SIZE_MB = 512  # Parquet row group size.
    NUM_PARTITIONS = 100  # Number of Parquet partitions for the train and validation data each.

    spark = SparkSession \
        .builder \
        .master(SPARK_MASTER_URL) \
        .appName("Deep Postures Example - Petastorm Data Generation") \
        .getOrCreate()

    # Collect the list of pre-processed HDF5 input files.
    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        args = "hdfs dfs -ls " + H5_PRE_PROCESSED_DATA_DIR + " | awk '{print $8}'"
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
        s_output, s_err = proc.communicate()
        input_data = ['hdfs://' + path.decode() for path in s_output.split()]
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        # Strip the 'file://' scheme before listing the local directory.
        local_dir = H5_PRE_PROCESSED_DATA_DIR[len('file://'):]
        for dirname in os.listdir(local_dir):
            if not dirname.startswith('.'):
                input_data.append(str(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname)))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    # Shuffle and split the input files into train and validation sets.
    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH,
                       train_path=os.path.join(OUTPUT_PATH, 'train_data'),
                       val_path=os.path.join(OUTPUT_PATH, 'val_data'))

    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    # Materialize the training split as a Petastorm dataset.
    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'train_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(train_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {
            'id': item[0], 'time': item[1], 'data': item[2],
            'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS) \
            .write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'train_data'))

    # Materialize the validation split as a Petastorm dataset.
    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'val_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(val_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {
            'id': item[0], 'time': item[1], 'data': item[2],
            'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS) \
            .write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'val_data'))


if __name__ == "__main__":
    main()
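# The pipeline above calls a load_h5(path) helper that is defined elsewhere in the
# repository and must yield (id, time, data, non_wear, sleeping, label) tuples matching
# the Unischema. A minimal sketch, assuming each HDF5 file stores parallel datasets
# named 'id', 'time', 'data', 'non_wear', 'sleeping', and 'label' (the dataset names
# and file layout are assumptions, not the project's actual format):
import h5py
import numpy as np

def load_h5(path):
    """Yield one tuple per window, matching the Unischema field order."""
    with h5py.File(path.replace('file://', ''), 'r') as f:
        ids = f['id'][:]
        times = f['time'][:]
        data = f['data'][:]          # expected shape: (N, 100, 3), float32
        non_wear = f['non_wear'][:]
        sleeping = f['sleeping'][:]
        labels = f['label'][:]
    for i in range(len(times)):
        yield (str(ids[i]), int(times[i]), np.asarray(data[i], dtype=np.float32),
               int(non_wear[i]), int(sleeping[i]), int(labels[i]))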
test_rows = test_df.count()
print('Training: %d' % train_rows)
print('Validation: %d' % val_rows)
print('Test: %d' % test_rows)

# ================== #
# 2. MODEL Selection #
# ================== #

print('==============')
print('Model selection')
print('==============')

backend = SparkBackend(spark_context=spark.sparkContext, num_workers=args.num_workers)
store = LocalStore(args.work_dir)

# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric, expects logarithmic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))

        # Compute mean excluding stores with zero denominator.
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)
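    # [Illustrative continuation; not the example's actual architecture.]
    # A minimal sketch of how exp_rmspe could be attached to the returned SparkEstimator,
    # assuming a single dense 'features' input and an 'lr' entry in the search space; the
    # full example builds a larger embedding-based network from the feature columns.
    NUM_FEATURES = 20  # placeholder; the real example derives this from the feature columns
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(100, activation='relu', input_shape=(NUM_FEATURES,), name='features'),
        tf.keras.layers.Dense(1)
    ])
    optimizer = tf.keras.optimizers.Adam(lr=params['lr'])
    return SparkEstimator(model=model,
                          optimizer=optimizer,
                          loss='mae',
                          metrics=[exp_rmspe],
                          batch_size=100)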
    return keras_estimator


def main():
    SPARK_MASTER_URL = 'spark://...'  # Change the Spark master URL.
    DATA_STORE_PATH = 'file:///...'  # Change data store path. Should be accessible from all Spark workers.

    spark = SparkSession \
        .builder \
        .master(SPARK_MASTER_URL) \
        .appName("Deep Postures Example") \
        .getOrCreate()

    backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1)
    store = LocalStore(DATA_STORE_PATH,
                       train_path=os.path.join(DATA_STORE_PATH, 'train'),
                       val_path=os.path.join(DATA_STORE_PATH, 'valid'))

    search_space = {
        'lr': hp_choice([0.001, 0.0001]),
        'l2_reg': hp_choice([0.001, 0.0001]),
        'win_size': hp_choice([7, 9]),
        'amp_factor': hp_choice([2, 4])
    }

    model_selection = GridSearch(backend, store, estimator_gen_fn, search_space, 10,
                                 evaluation_metric='loss',
                                 feature_columns=['id', 'time', 'non_wear', 'sleeping', 'label', 'data'],
                                 label_columns=['label'])
    model = model_selection.fit_on_prepared_data()


if __name__ == "__main__":
    main()
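# The GridSearch example above assumes an estimator_gen_fn defined earlier in the file;
# only its final `return keras_estimator` line is shown. A minimal sketch of such a
# generator, consuming the 'lr', 'l2_reg', and 'amp_factor' entries of the search space
# over the (100, 3) 'data' windows (the architecture, the binary output, and the way
# 'amp_factor' is used are illustrative only; 'win_size' is ignored here):
import tensorflow as tf

def estimator_gen_fn(params):
    reg = tf.keras.regularizers.l2(params['l2_reg'])
    model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(100, 3), name='data'),
        tf.keras.layers.Dense(64 * params['amp_factor'], activation='relu',
                              kernel_regularizer=reg),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    optimizer = tf.keras.optimizers.Adam(lr=params['lr'])
    keras_estimator = SparkEstimator(model=model,
                                     optimizer=optimizer,
                                     loss='binary_crossentropy',
                                     metrics=['acc'],
                                     batch_size=64)
    return keras_estimator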