Example #1
    def test_random_search(self):
        spark = SparkSession \
            .builder \
            .master("local[3]") \
            .appName("Python Spark SQL basic example") \
            .getOrCreate()

        # Load training data
        df = spark.read.format("libsvm").load("./tests/sample_libsvm_data.txt").repartition(8)
        df.printSchema()

        backend = SparkBackend(spark_context=spark.sparkContext, num_workers=3)
        store = LocalStore('/tmp')


        ######## Random Search ###########
        search_space = {'lr': hp_choice([0.01, 0.001, 0.0001])}

        random_search = RandomSearch(backend, store, estimator_gen_fn, search_space,
                                     num_models=3, num_epochs=1,
                                     validation=0.25,
                                     evaluation_metric='loss',
                                     feature_columns=['features'], label_columns=['label'])
        model = random_search.fit(df)

        output_df = model.transform(df)
        output_df.select('label', 'label__output').show(n=10)

        assert True
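
The test above references estimator_gen_fn and the Cerebro-style classes (SparkBackend, LocalStore, RandomSearch, hp_choice) without showing the imports or the generator itself. The block below is a minimal sketch of what it assumes, modeled on the TPE example that follows; the import paths follow the cerebro-system package layout and the two-layer Keras model is an assumption, not part of the original test.

import tensorflow as tf
from pyspark.sql import SparkSession
from cerebro.backend import SparkBackend      # assumed import paths; adjust if your
from cerebro.storage import LocalStore        # package layout differs
from cerebro.keras import SparkEstimator
from cerebro.tune import RandomSearch, hp_choice


def estimator_gen_fn(params):
    # Two dense layers over the 692 libsvm features, sigmoid output for the binary label.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(692,), name='features'),
        tf.keras.layers.Dense(100),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    return SparkEstimator(model=model,
                          optimizer=optimizer,
                          loss='binary_crossentropy',
                          metrics=['acc'],
                          batch_size=10)
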
Example #2
    def test_tpe(self):
        spark = SparkSession \
            .builder \
            .master("local[3]") \
            .appName("Python Spark SQL basic example") \
            .getOrCreate()

        # Load training data
        df = spark.read.format("libsvm").load(
            "./tests/sample_libsvm_data.txt").repartition(8)
        df.printSchema()

        backend = SparkBackend(spark_context=spark.sparkContext, num_workers=3)
        store = LocalStore('/tmp')

        def estimator_gen_fn(params):
            model = tf.keras.models.Sequential()
            model.add(tf.keras.layers.Input(shape=(692,), name='features'))
            model.add(tf.keras.layers.Dense(100, input_dim=692))
            model.add(tf.keras.layers.Dense(1, input_dim=100))
            model.add(tf.keras.layers.Activation('sigmoid'))

            optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
            loss = 'binary_crossentropy'

            keras_estimator = SparkEstimator(model=model,
                                             optimizer=optimizer,
                                             loss=loss,
                                             metrics=['acc'],
                                             batch_size=10)

            return keras_estimator

        search_space = {
            'lr': hp_choice([0.01, 0.001, 0.0001]),
            'dummy1': hp_uniform(0, 100),
            'dummy2': hp_quniform(0, 100, 1),
            'dummy3': hp_qloguniform(0, 100, 1),
        }

        hyperopt = TPESearch(backend=backend,
                             store=store,
                             estimator_gen_fn=estimator_gen_fn,
                             search_space=search_space,
                             num_models=3,
                             num_epochs=1,
                             validation=0.25,
                             evaluation_metric='loss',
                             feature_columns=['features'],
                             label_columns=['label'],
                             verbose=2)

        model = hyperopt.fit(df)
        output_df = model.transform(df)
        output_df.select('label', 'label__output').show(n=10)

        assert True
Example #3
def main():
    SPARK_MASTER_URL = 'spark://...' # Change the Spark master URL.
    H5_PRE_PROCESSED_DATA_DIR = 'file://...' # Change pre-processed data input path. Should be accessible from all Spark workers.
    OUTPUT_PATH = 'file:///...' # Change Petastorm output path. Should be accessible from all Spark workers.
    TRAIN_FRACTION = 0.7 # Fraction of the data used for training; the remainder becomes validation data.

    ROW_GROUP_SIZE_MB = 512 # Parquet row group size in MB.
    NUM_PARTITIONS = 100 # Number of Parquet partitions for the train and validation data each.
    
    spark = SparkSession \
            .builder \
            .master(SPARK_MASTER_URL) \
            .appName("Deep Postures Example - Petastorm Data Generation") \
            .getOrCreate()

    input_data = []
    if H5_PRE_PROCESSED_DATA_DIR.startswith('hdfs://'):
        args = "hdfs dfs -ls "+dir_in+" | awk '{print $8}'"
        proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)

        s_output, s_err = proc.communicate()
        input_data = ['hdfs://'+ path for path in s_output.split()]
    elif H5_PRE_PROCESSED_DATA_DIR.startswith('file://'):
        local_dir = H5_PRE_PROCESSED_DATA_DIR[len('file://'):]
        for dirname in os.listdir(local_dir):
            if not dirname.startswith('.'):
                input_data.append(str(os.path.join(H5_PRE_PROCESSED_DATA_DIR, dirname)))
    else:
        raise Exception('Unsupported file system in: {}'.format(H5_PRE_PROCESSED_DATA_DIR))

    random.shuffle(input_data)
    n_train = int(len(input_data) * TRAIN_FRACTION)
    train_data = input_data[:n_train]
    val_data = input_data[n_train:]

    backend = SparkBackend(spark_context=spark.sparkContext)
    store = LocalStore(OUTPUT_PATH, train_path=os.path.join(OUTPUT_PATH, 'train_data'), val_path=os.path.join(OUTPUT_PATH, 'val_data'))
    
    schema = Unischema('schema', [
        UnischemaField('id', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('time', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('data', np.float32, (100, 3), NdarrayCodec(), False),
        UnischemaField('non_wear', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('sleeping', np.int32, (), ScalarCodec(IntegerType()), False),
        UnischemaField('label', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'train_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(train_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'train_data'))


    with materialize_dataset(spark, os.path.join(OUTPUT_PATH, 'val_data'), schema, ROW_GROUP_SIZE_MB):
        rdd = spark.sparkContext.parallelize(val_data)
        rdd = rdd.flatMap(lambda x: load_h5(x)).map(lambda item: {'id': item[0], 'time': item[1], 'data': item[2], 'non_wear': item[3], 'sleeping': item[4], 'label': item[5]})
        rdd = rdd.map(lambda x: dict_to_spark_row(schema, x))

        df = spark.createDataFrame(rdd, schema=schema.as_spark_schema())
        df.orderBy("id", "time").coalesce(NUM_PARTITIONS).write.mode('overwrite').parquet(os.path.join(OUTPUT_PATH, 'val_data'))

if __name__ == "__main__":
    main()
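
The pipeline in Example #3 relies on a load_h5 helper that is not shown; whatever it does, it must return an iterable of (id, time, data, non_wear, sleeping, label) tuples that match the Unischema above. The sketch below is a hypothetical implementation assuming each input path is a single local HDF5 file with datasets named time, data, non_wear, sleeping, and label; the real pre-processed layout may differ, and hdfs:// paths would need a different reader than h5py.

import os
import h5py
import numpy as np


def load_h5(path):
    """Hypothetical reader: returns one tuple per (100, 3) window stored in an HDF5 file."""
    # Strip a file:// scheme so h5py receives a plain local path.
    local_path = path[len('file://'):] if path.startswith('file://') else path
    subject_id = os.path.splitext(os.path.basename(local_path))[0]
    with h5py.File(local_path, 'r') as f:
        times = f['time'][:]
        data = f['data'][:].astype(np.float32)        # expected shape (N, 100, 3)
        non_wear = f['non_wear'][:].astype(np.int32)
        sleeping = f['sleeping'][:].astype(np.int32)
        labels = f['label'][:].astype(np.int32)
    return [(subject_id, int(times[i]), data[i], int(non_wear[i]),
             int(sleeping[i]), int(labels[i]))
            for i in range(len(times))]
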
Example #4
test_rows = test_df.count()
print('Training: %d' % train_rows)
print('Validation: %d' % val_rows)
print('Test: %d' % test_rows)

# ================== #
# 2. MODEL Selection #
# ================== #

print('==============')
print('Model selection')
print('==============')

backend = SparkBackend(spark_context=spark.sparkContext,
                       num_workers=args.num_workers)
store = LocalStore(args.work_dir)


# Define estimator generating function.
# Input: Dictionary containing parameter values
# Output: SparkEstimator
def estimator_gen_fn(params):
    def exp_rmspe(y_true, y_pred):
        """Competition evaluation metric, expects logarithic inputs."""
        pct = tf.square((tf.exp(y_true) - tf.exp(y_pred)) / tf.exp(y_true))
        # Compute mean excluding stores with zero denominator.
        x = tf.reduce_sum(tf.where(y_true > 0.001, pct, tf.zeros_like(pct)))
        y = tf.reduce_sum(
            tf.where(y_true > 0.001, tf.ones_like(pct), tf.zeros_like(pct)))
        return tf.sqrt(x / y)
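
Example #4 is a fragment: exp_rmspe is defined inside estimator_gen_fn, and the rest of the generator (model, optimizer, SparkEstimator) is cut off. As a quick standalone sanity check, assuming TensorFlow 2.x eager mode and exp_rmspe lifted to module scope, made-up log-scale values give a small positive error while the zero target is excluded from the mean:

import tensorflow as tf

y_true = tf.constant([1.0, 2.0, 0.0])    # log-scale targets; 0.0 is excluded by the y_true > 0.001 mask
y_pred = tf.constant([1.1, 1.9, 0.5])
print(float(exp_rmspe(y_true, y_pred)))  # RMSPE over the first two entries only, roughly 0.1
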
Example #5
    return keras_estimator


def main():
    SPARK_MASTER_URL = 'spark://...' # Change the Spark master URL.
    DATA_STORE_PATH = 'file:///...' # Change data store path. Should be accessible from all Spark workers.
    
    spark = SparkSession \
            .builder \
            .master(SPARK_MASTER_URL) \
            .appName("Deep Postures Example") \
            .getOrCreate()

    backend = SparkBackend(spark_context=spark.sparkContext, num_workers=1)
    store = LocalStore(DATA_STORE_PATH, train_path=os.path.join(DATA_STORE_PATH, 'train'), val_path=os.path.join(DATA_STORE_PATH, 'valid'))

    search_space = {
        'lr': hp_choice([0.001, 0.0001]),
        'l2_reg': hp_choice([0.001, 0.0001]),
        'win_size': hp_choice([7, 9]),
        'amp_factor': hp_choice([2, 4])
    }

    model_selection = GridSearch(backend, store, estimator_gen_fn, search_space, num_epochs=10,
                                 evaluation_metric='loss',
                                 feature_columns=['id', 'time', 'non_wear', 'sleeping', 'label', 'data'],
                                 label_columns=['label'])
    model = model_selection.fit_on_prepared_data()

if __name__ == "__main__":
    main()
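
The estimator_gen_fn used by this grid search is elided; only its trailing return keras_estimator survives at the top of Example #5. The sketch below is a hypothetical stand-in that consumes only the (100, 3) 'data' windows and would pair with feature_columns=['data']; the actual DeepPostures CNN, and how it uses the extra id/time/non_wear/sleeping columns, is not shown in the original. The import path and the mapping of win_size and amp_factor onto the convolution are assumptions.

import tensorflow as tf
from cerebro.keras import SparkEstimator  # assumed import path


def estimator_gen_fn(params):
    # Hypothetical 1-D CNN over 100x3 accelerometer windows: 'win_size' sets the
    # convolution kernel width and 'amp_factor' scales the channel count.
    reg = tf.keras.regularizers.l2(params['l2_reg'])
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(100, 3), name='data'),
        tf.keras.layers.Conv1D(filters=32 * params['amp_factor'],
                               kernel_size=params['win_size'],
                               activation='relu',
                               kernel_regularizer=reg),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=reg),
    ])
    optimizer = tf.keras.optimizers.Adam(learning_rate=params['lr'])
    return SparkEstimator(model=model,
                          optimizer=optimizer,
                          loss='binary_crossentropy',
                          metrics=['acc'],
                          batch_size=64)
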