Example #1
import pyspark.sql

# spark_session is assumed to be supplied by a pytest fixture (e.g. the
# pytest-spark plugin); IllegalCharRemover is the class under test.
def test_remove_special(spark_session: pyspark.sql.SparkSession):
    df_1 = spark_session.createDataFrame([(1, 'qwerty{\\/[]}^'),
                                          (2, 'asdfgh')], ['id', 'string'])

    remover = IllegalCharRemover(['^', '\\', '/', '[', ']', '{', '}'], '')

    df_2 = remover.remove_illegal_chars(df_1, 'string', 'string_filtered')

    fields = [field.name for field in df_2.schema.fields]

    assert fields == ['id', 'string_filtered']
    assert df_2.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
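The IllegalCharRemover API itself is not shown on this page. A minimal sketch consistent with what the tests assert (the source column is replaced by the filtered output column), assuming the class simply wraps pyspark.sql.functions.regexp_replace, could look like this:

import re

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


class IllegalCharRemover:
    """Hypothetical sketch: strips blacklisted characters from a string column."""

    def __init__(self, chars, replacement):
        # Build a character class such as [\^\\/\[\]\{\}]; re.escape keeps
        # metacharacters like ^, [ and \ from being misread by the regex engine.
        self.pattern = '[' + ''.join(re.escape(c) for c in chars) + ']'
        self.replacement = replacement

    def remove_illegal_chars(self, df: DataFrame, source_col: str,
                             target_col: str) -> DataFrame:
        # Append the filtered column, then drop the source column, yielding
        # the schema the tests assert: ['id', 'string_filtered'].
        return df.withColumn(
            target_col,
            F.regexp_replace(F.col(source_col), self.pattern, self.replacement)
        ).drop(source_col)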
Example #2
import pyspark.sql

# Same spark_session fixture and IllegalCharRemover class as in Example #1.
def test_simple(spark_session: pyspark.sql.SparkSession):
    df_1 = spark_session.createDataFrame([(1, 'qwerty123'), (2, 'asdfgh123')],
                                         ['id', 'string'])

    remover = IllegalCharRemover(['1', '2', '3'], '')

    df_2 = remover.remove_illegal_chars(df_1, 'string', 'string_filtered')

    fields = [field.name for field in df_2.schema.fields]

    assert fields == ['id', 'string_filtered']
    assert df_2.collect() == [(1, 'qwerty'), (2, 'asdfgh')]
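Both tests depend on a spark_session fixture. If the pytest-spark plugin is not in use, an equivalent fixture in a local conftest.py (an assumed setup, not part of these examples) would provide it:

import pytest
import pyspark.sql


@pytest.fixture(scope='session')
def spark_session():
    # A small local session is enough for these unit tests.
    spark = (pyspark.sql.SparkSession.builder
             .master('local[2]')
             .appName('tests')
             .getOrCreate())
    yield spark
    spark.stop()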
Example #3
import pandas as pd
import pyspark.sql
import pytest

@pytest.fixture  # assumed: the snippet reads as a pytest fixture definition
def iris_spark(
        iris: pd.DataFrame,
        spark_session: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    return spark_session.createDataFrame(iris)
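The iris fixture it consumes is likewise not shown; one plausible definition, assuming scikit-learn is available, loads the dataset as a pandas DataFrame:

import pandas as pd
import pytest
from sklearn.datasets import load_iris


@pytest.fixture
def iris() -> pd.DataFrame:
    # load_iris(as_frame=True) returns a Bunch whose .frame attribute is a
    # pandas DataFrame holding the four feature columns plus the target.
    return load_iris(as_frame=True).frame

Note that the iris feature names contain spaces and parentheses; Spark accepts them when creating the DataFrame, but they need backtick quoting in SQL expressions.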