def test_keep_col(spark_session):
    """keep_col() should retain only the requested columns and still yield a Spark DF."""
    # NOTE(review): another test_keep_col is defined later in this file;
    # in a single module the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.keep_col(['city', 'population'])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_drop_col(spark_session):
    """drop_col() should remove a single column and still yield a Spark DF."""
    # NOTE(review): another test_drop_col is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.drop_col("country")
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_to_csv(spark_session):
    """to_csv() should write the frame out and leave a valid Spark DF behind."""
    # NOTE(review): another test_to_csv is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_other_df(spark_session))
        tf.to_csv("test.csv")
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_string_to_index(spark_session):
    """string_to_index() over two string columns should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.string_to_index(["city", "country"])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_iloc(spark_session):
    """iloc([0]) should select the first row, matching the sample DataFrame."""
    result_df = op.DataFrameTransformer(create_select_df(spark_session)).iloc([0]).df
    expected_df = create_select_sample_df(spark_session)
    assert expected_df.collect() == result_df.collect()
def test_transformer(spark_session):
    """Constructing a DataFrameTransformer should expose a pyspark DataFrame."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        assert isinstance(tf.get_data_frame, pyspark.sql.dataframe.DataFrame)
    except RuntimeError:
        logger.exception('Could not create transformer.')
        sys.exit(1)
def test_normalizer(spark_session):
    """normalizer() on a vector column should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_vector_df(spark_session))
        tf.normalizer(["features"])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_lookup(spark_session):
    """lookup() should map the listed values onto one canonical value."""
    # NOTE(review): another test_lookup is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.lookup('city', "Caracas", ['Caracas', 'Ccs'])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_move_col(spark_session):
    """move_col() should reposition 'city' after 'country' and keep a Spark DF."""
    # NOTE(review): another test_move_col is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.move_col('city', 'country', position='after')
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_remove_special_chars(spark_session):
    """remove_special_chars() over two columns should yield a Spark DF."""
    # NOTE(review): another test_remove_special_chars is defined later in this
    # file; the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.remove_special_chars(columns=['city', 'country'])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_clear_accents(spark_session):
    """clear_accents() over every column ('*') should yield a Spark DF."""
    # NOTE(review): another test_clear_accents is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.clear_accents(columns='*')
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def gbt(df, columns, input_col):
    """
    Runs a gradient boosting tree classifier for input DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict.
    :return: DataFrame with gradient boosting tree and prediction run.
    """
    # Validate arguments before touching the data.
    assert_spark_df(df)
    assert isinstance(columns, list), "Error, columns must be a list"
    assert isinstance(input_col, str), "Error, input column must be a string"

    # Everything selected except the target column is a feature.
    data = df.select(columns)
    feature_cols = data.columns
    feature_cols.remove(input_col)

    # Index the target, assemble features, and expose the target as 'label'
    # (the column name GBTClassifier trains against by default).
    transformer = op.DataFrameTransformer(data)
    transformer.string_to_index(input_cols=input_col)
    transformer.vector_assembler(input_cols=feature_cols)
    transformer.rename_col(columns=[(input_col + "_index", "label")])

    gbt_model = GBTClassifier().fit(transformer.df)
    return gbt_model.transform(transformer.df), gbt_model
def test_one_hot_encoder(spark_session):
    """one_hot_encoder() on the 'id' column should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_sql_df(spark_session))
        tf.one_hot_encoder(["id"])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_to_csv(spark_session):
    """to_csv() should write the frame out and leave a valid Spark DF behind."""
    try:
        tf = op.DataFrameTransformer(create_other_df(spark_session))
        tf.to_csv("test.csv")
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run to_csv().')
        sys.exit(1)
def test_collect(spark_session):
    """collect() on the transformer should match collect() on the raw DF."""
    sample = create_select_sample_df(spark_session)
    collected = op.DataFrameTransformer(sample).collect()
    assert collected == create_select_sample_df(spark_session).collect()
def test_normalize():
    """Time optimus normalizer() against the 311 dataset loaded from AWS."""
    df = load_data_311("aws")
    start = time.time()
    tf = op.DataFrameTransformer(df)
    # `columns` is expected to be defined at module level — TODO confirm.
    tf.normalizer(columns)
    tf.df.count()  # force evaluation so the timing is meaningful
    print("The optimus normalize() takes: " + str(time.time() - start) + " sec.")
def test_sql(spark_session):
    """sql() should apply a SELECT with derived columns and yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_sql_df(spark_session))
        tf.sql("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_set_col(spark_session):
    """set_col() should apply a cell-wise function to 'population'."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))

        def double_large(cell):
            # Double only populations above the 14M threshold.
            return cell * 2 if cell > 14000000 else cell

        tf.set_col(['population'], double_large, 'integer')
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_remove_special_chars(spark_session):
    """remove_special_chars() over two columns should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.remove_special_chars(columns=['city', 'country'])
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run remove_special_chars().')
        sys.exit(1)
def test_rename_col(spark_session):
    """rename_col() should rename 'city' to 'villes' and yield a Spark DF."""
    # NOTE(review): another test_rename_col is defined later in this file;
    # the later definition shadows this one under pytest.
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.rename_col([('city', 'villes')])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_replace_col(spark_session):
    """replace_col() should swap 'Tokyo' for 'Maracaibo' in the 'city' column."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.replace_col(search='Tokyo', change_to='Maracaibo', columns='city')
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run replace_col().')
        sys.exit(1)
def test_delete_row(spark_session):
    """delete_row() with a population-range condition should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        # Rows where population is in (6.5M, 30M].
        condition = (col('population') > 6500000) & (col('population') <= 30000000)
        tf.delete_row(condition)
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_clear_accents(spark_session):
    """clear_accents() over every column ('*') should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.clear_accents(columns='*')
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run clear_accents().')
        sys.exit(1)
def test_assembler(spark_session):
    """vector_assembler() over three columns should yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_assembler_df(spark_session))
        tf.vector_assembler(["hour", "mobile", "userFeatures"])
        assert_spark_df(tf.df)
    except RuntimeError:
        # Suite convention: abort the run on a runtime failure.
        sys.exit(1)
def test_keep_col(spark_session):
    """keep_col() should retain only the requested columns and still yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.keep_col(['city', 'population'])
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run keep_col().')
        sys.exit(1)
def test_min_max_scale():
    """Time optimus scale_vec_col() against the 311 dataset loaded from AWS."""
    df = load_data_311("aws")
    start = time.time()
    tf = op.DataFrameTransformer(df)
    # `columns` is expected to be defined at module level — TODO confirm.
    tf.scale_vec_col(columns, 'scaled')
    tf.df.count()  # force evaluation so the timing is meaningful
    print("The optimus min_max_scale() takes: " + str(time.time() - start) + " sec.")
def test_drop_col(spark_session):
    """drop_col() should remove a single column and still yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.drop_col("country")
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run drop_col().')
        sys.exit(1)
def test_lookup(spark_session):
    """lookup() should map the listed values onto one canonical value."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.lookup('city', "Caracas", ['Caracas', 'Ccs'])
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run lookup().')
        sys.exit(1)
def test_move_col(spark_session):
    """move_col() should reposition 'city' after 'country' and keep a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.move_col('city', 'country', position='after')
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run move_col().')
        sys.exit(1)
def test_rename_col(spark_session):
    """rename_col() should rename 'city' to 'villes' and yield a Spark DF."""
    try:
        tf = op.DataFrameTransformer(create_df(spark_session))
        tf.rename_col([('city', 'villes')])
        assert_spark_df(tf.get_data_frame)
    except RuntimeError:
        logger.exception('Could not run rename_col().')
        sys.exit(1)