Example 1
import pyspark.sql
from pyspark.sql.functions import col, from_json

# LCR_RESULT_SCHEMA (a StructType describing the event payload) is defined elsewhere in the project.
def main(spark: pyspark.sql.SparkSession):
    try:
        # Subscribe to the 'lcr-events' Kafka topic, replaying it from the earliest offset.
        sdf = spark.readStream.format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'lcr-events') \
            .option('startingOffsets', 'earliest') \
            .load()

        # Parse the JSON message value against the schema, flatten the struct
        # into top-level columns, and stream the result to the console.
        sdf.select(from_json(col('value').cast('string'), schema=LCR_RESULT_SCHEMA).alias('data')) \
            .select('data.*') \
            .writeStream \
            .format('console') \
            .start()

        # Block until the streaming query terminates.
        spark.streams.awaitAnyTermination()
    finally:
        spark.stop()
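`from_json` needs a `StructType` describing the JSON payload; the snippet references `LCR_RESULT_SCHEMA`, which is defined elsewhere in its project. A minimal sketch of what such a schema declaration could look like (the field names and types here are purely illustrative assumptions):

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Hypothetical shape; the real LCR_RESULT_SCHEMA is project-specific.
LCR_RESULT_SCHEMA = StructType([
    StructField('event_id', StringType()),
    StructField('score', DoubleType()),
    StructField('event_time', TimestampType()),
])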
Example 2
import pyspark.sql

def test_deduplicate_no_keys(spark_session: pyspark.sql.SparkSession):
    # The last row duplicates the first one exactly and should be dropped.
    df = spark_session.createDataFrame(
        [(1, 'Account_1', 30.5), (1, 'Account_1', 30.6), (1, 'Account_2', 30.6), (1, 'Account_1', 30.5)],
        ['id', 'account', 'score']
    )

    deduplicator = Deduplicator()
    # With an empty key list, only fully identical rows count as duplicates.
    actual_df = deduplicator.deduplicate([], df).collect()
    # Row objects compare equal to plain tuples, so tuples suffice here.
    assert actual_df == [(1, 'Account_1', 30.5), (1, 'Account_1', 30.6), (1, 'Account_2', 30.6)]
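`Deduplicator` is the class under test and is not shown on this page. A minimal sketch consistent with the test above, assuming that an empty key list means "drop only fully identical rows" (via `DataFrame.dropDuplicates`):

from pyspark.sql import DataFrame

class Deduplicator:
    def deduplicate(self, keys, df: DataFrame) -> DataFrame:
        # With key columns, keep one row per key combination;
        # with no keys, drop only fully identical rows.
        return df.dropDuplicates(keys) if keys else df.dropDuplicates()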
Example 3
import pyspark.sql

def test_simple(spark_session: pyspark.sql.SparkSession):
    df_1 = spark_session.createDataFrame([(1, 'qwerty123'), (2, 'asdfgh123')],
                                         ['id', 'string'])

    # Strip the characters '1', '2' and '3', replacing them with the empty string.
    remover = IllegalCharRemover(['1', '2', '3'], '')

    df_2 = remover.remove_illegal_chars(df_1, 'string', 'string_filtered')

    fields = [field.name for field in df_2.schema.fields]

    # The source column is dropped and replaced by the filtered column.
    assert fields == ['id', 'string_filtered']
    assert df_2.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
Example 4
import pyspark.sql

def test_remove_special(spark_session: pyspark.sql.SparkSession):
    df_1 = spark_session.createDataFrame([(1, 'qwerty{\\/[]}^'),
                                          (2, 'asdfgh')], ['id', 'string'])

    # Regex metacharacters must be removed literally, not interpreted as a pattern.
    remover = IllegalCharRemover(['^', '\\', '/', '[', ']', '{', '}'], '')

    df_2 = remover.remove_illegal_chars(df_1, 'string', 'string_filtered')

    fields = [field.name for field in df_2.schema.fields]

    assert fields == ['id', 'string_filtered']
    assert df_2.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
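`IllegalCharRemover` itself is not shown in these examples. A minimal sketch that satisfies both tests above, assuming it builds a regex character class and applies `regexp_replace`, dropping the source column afterwards (the attribute names are assumptions):

import re

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, regexp_replace

class IllegalCharRemover:
    def __init__(self, chars, replacement):
        # Escape each character so regex metacharacters like ^ and \ match literally.
        self.pattern = '[' + re.escape(''.join(chars)) + ']'
        self.replacement = replacement

    def remove_illegal_chars(self, df: DataFrame, source_col: str, target_col: str) -> DataFrame:
        # Replace every illegal character, then drop the original column so only
        # the filtered column remains in the schema.
        return df.withColumn(
            target_col,
            regexp_replace(col(source_col), self.pattern, self.replacement)
        ).drop(source_col)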
Example 5
import pandas as pd
import pyspark.sql

def iris_spark(
        iris: pd.DataFrame,
        spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    # Convert the pandas iris DataFrame into a Spark DataFrame.
    return spark_session.createDataFrame(iris)
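Judging by its signature, `iris_spark` is meant to be a pytest fixture that chains an `iris` pandas fixture with a `spark_session` fixture (as provided, for example, by the pytest-spark plugin). A hedged sketch of how it could be wired up and consumed; the `@pytest.fixture` decorators, the CSV loader, and the test body are all assumptions:

import pandas as pd
import pyspark.sql
import pytest

@pytest.fixture
def iris() -> pd.DataFrame:
    # Hypothetical loader; the real project presumably defines its own iris fixture.
    return pd.read_csv('iris.csv')

@pytest.fixture
def iris_spark(iris: pd.DataFrame,
               spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    return spark_session.createDataFrame(iris)

def test_iris_has_rows(iris_spark):
    # Sanity check: the conversion preserved the pandas rows.
    assert iris_spark.count() > 0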