def test_np_spark_compat_frame(self):
    # Use a randomly generated DataFrame
    pdf = pd.DataFrame(
        np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=["a", "b"]
    )
    pdf2 = pd.DataFrame(
        np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=["a", "b"]
    )
    kdf = ks.from_pandas(pdf)
    kdf2 = ks.from_pandas(pdf2)

    for np_name, spark_func in unary_np_spark_mappings.items():
        np_func = getattr(np, np_name)
        if np_name not in self.blacklist:
            try:
                # unary ufunc
                self.assert_eq(np_func(pdf), np_func(kdf), almost=True)
            except Exception as e:
                raise AssertionError("Test in '%s' function failed." % np_name) from e

    for np_name, spark_func in binary_np_spark_mappings.items():
        np_func = getattr(np, np_name)
        if np_name not in self.blacklist:
            try:
                # binary ufunc
                self.assert_eq(np_func(pdf, pdf), np_func(kdf, kdf), almost=True)
                self.assert_eq(np_func(pdf, 1), np_func(kdf, 1), almost=True)
            except Exception as e:
                raise AssertionError("Test in '%s' function failed." % np_name) from e

    # Test only the top 5 for now; the 'compute.ops_on_diff_frames' option adds
    # too much runtime.
    try:
        set_option("compute.ops_on_diff_frames", True)
        for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
            np_func = getattr(np, np_name)
            if np_name not in self.blacklist:
                try:
                    # binary ufunc
                    self.assert_eq(
                        np_func(pdf, pdf2).sort_index(),
                        np_func(kdf, kdf2).sort_index(),
                        almost=True,
                    )
                except Exception as e:
                    raise AssertionError("Test in '%s' function failed." % np_name) from e
    finally:
        reset_option("compute.ops_on_diff_frames")
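# The mappings exercised above exist because Koalas implements NumPy's
# __array_ufunc__ protocol, translating each ufunc into a Spark expression.
# A minimal, hedged illustration of the behavior under test, using only the
# public Koalas API (the throwaway frames here are illustrative):
import numpy as np
import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({"a": [1.0, 4.0, 9.0]})
kdf = ks.from_pandas(pdf)

# np.sqrt(kdf) dispatches to the Spark-backed implementation; the result
# should match the plain-pandas computation element-wise.
print(np.sqrt(kdf).to_pandas())
print(np.sqrt(pdf))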
def __init__(
    self,
    args,
    task_imls=None,
    input_format='dataframe',
    synonmys=['piano', 'rice', 'laptop'],
    output_pid_folder=False
):
    self.spark = spark_init(args.pid)
    # A distributed default index avoids the single-node sequence computation
    # when loading large inputs through Koalas.
    if input_format == 'koalas':
        ks.set_option('compute.default_index_type', 'distributed')
    path_dict = {
        'review': args.review_filename,
        'product': args.product_filename,
        'product_processed': args.product_processed_filename,
        'ml_features_train': args.ml_features_train_filename,
        'ml_features_test': args.ml_features_test_filename
    }
    self.task_imls = task_imls
    self.tests = PA2Test(self.spark, args.test_results_root)
    # Optionally nest outputs under a per-pid folder
    if output_pid_folder:
        output_root = os.path.join(args.output_root, args.pid)
    else:
        output_root = args.output_root
    self.data_io = PA2Data(self.spark, path_dict, output_root,
                           deploy=True, input_format=input_format)
    self.data_dict, self.count_dict = self.data_io.load_all(
        input_format=input_format, no_cache=True)
    self.task_names = TASK_NAMES
    self.synonmys = synonmys
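# 'compute.default_index_type' controls how Koalas attaches an index when none
# carries over, e.g. when converting a Spark DataFrame. A minimal sketch using
# only the public Koalas API; an active SparkSession `spark` is assumed:
import databricks.koalas as ks

# 'sequence' (the default) computes a gap-free sequential index, which can
# force work onto a single node; 'distributed' uses monotonically increasing
# but non-sequential ids and scales out.
ks.set_option('compute.default_index_type', 'distributed')

sdf = spark.createDataFrame([(1,), (2,), (3,)], ['v'])
kdf = sdf.to_koalas()  # index generated according to compute.default_index_type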
def test_get_set_reset_option(self):
    self.assertEqual(ks.get_option('test.config'), 'default')

    ks.set_option('test.config', 'value')
    self.assertEqual(ks.get_option('test.config'), 'value')

    ks.reset_option('test.config')
    self.assertEqual(ks.get_option('test.config'), 'default')
def test_unknown_option(self):
    with self.assertRaisesRegex(config.OptionError, 'No such option'):
        ks.get_option('unknown')
    with self.assertRaisesRegex(config.OptionError, "Available options"):
        ks.set_option('unknown', 'value')
    with self.assertRaisesRegex(config.OptionError, "test.config"):
        ks.reset_option('unknown')
def test_unknown_option(self):
    with self.assertRaisesRegex(config.OptionError, 'No such key'):
        ks.get_option('unknown')
    with self.assertRaisesRegex(config.OptionError, "No such key"):
        ks.set_option('unknown', 'value')
    with self.assertRaisesRegex(config.OptionError, "No such key"):
        ks.reset_option('unknown')
def test_get_set_reset_option_different_types(self):
    ks.set_option('test.config.list', [1, 2, 3, 4])
    self.assertEqual(ks.get_option('test.config.list'), [1, 2, 3, 4])

    ks.set_option('test.config.list', None)
    self.assertEqual(ks.get_option('test.config.list'), None)

    ks.set_option('test.config.float', None)
    self.assertEqual(ks.get_option('test.config.float'), None)

    ks.set_option('test.config.float', 5.0)
    self.assertEqual(ks.get_option('test.config.float'), 5.0)

    ks.set_option('test.config.int', 123)
    self.assertEqual(ks.get_option('test.config.int'), 123)
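# When an option only needs to change temporarily (as in the set/reset pairs
# above and the try/finally in test_np_spark_compat_frame), ks.option_context
# scopes the change to a `with` block and restores the previous values on
# exit, even on error:
import databricks.koalas as ks

with ks.option_context('display.max_rows', 10, 'compute.max_rows', 1000):
    print(ks.get_option('display.max_rows'))  # 10 inside the block
# outside the block, both options revert to their previous values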
def test_different_types(self):
    with self.assertRaisesRegex(ValueError, "was <class 'int'>"):
        ks.set_option('test.config.list', 1)
    with self.assertRaisesRegex(ValueError, "however, expected types are"):
        ks.set_option('test.config.float', 'abc')
    with self.assertRaisesRegex(ValueError, "[<class 'int'>]"):
        ks.set_option('test.config.int', 'abc')
    with self.assertRaisesRegex(ValueError, "(<class 'int'>, <class 'NoneType'>)"):
        ks.set_option('test.config.int.none', 'abc')
def test_different_types(self):
    with self.assertRaisesRegex(TypeError, "The configuration value for 'test.config'"):
        ks.set_option('test.config', 1)
    with self.assertRaisesRegex(TypeError, "was <class 'int'>"):
        ks.set_option('test.config.list', 1)
    with self.assertRaisesRegex(TypeError, "however, <class 'float'> is expected."):
        ks.set_option('test.config.float', 'abc')
    with self.assertRaisesRegex(TypeError, "however, <class 'int'> is expected."):
        ks.set_option('test.config.int', 'abc')
def test_check_func(self):
    # The regex matches the error message registered with the test option's
    # check function ("bigger then 0" is the message as defined, typo included).
    with self.assertRaisesRegex(ValueError, "bigger then 0"):
        ks.set_option('test.config.int', -1)
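# A minimal sketch of the validation pattern the test above exercises. This is
# illustrative, not Koalas's actual internals: the Option class and its field
# names here are hypothetical.
from typing import Any, Callable, Tuple

class Option:
    def __init__(self, key: str, default: Any, types: Tuple[type, ...],
                 check_func: Callable[[Any], bool] = lambda v: True,
                 check_msg: str = ''):
        self.key = key
        self.default = default
        self.types = types
        self.check_func = check_func
        self.check_msg = check_msg

    def validate(self, value: Any) -> None:
        # Type check first, then the value-level check; a set_option
        # implementation would call this before storing the value.
        if not isinstance(value, self.types):
            raise TypeError("The value for '%s' was %s; expected types are %s."
                            % (self.key, type(value), self.types))
        if not self.check_func(value):
            raise ValueError(self.check_msg)

# e.g. a positive-int option whose failed check raises the message above:
opt = Option('test.config.int', 1, (int,), lambda v: v > 0,
             "The value for 'test.config.int' should be bigger then 0.")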
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession
from datetime import datetime

ks.set_option('compute.default_index_type', 'distributed')

# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

# Load the dataset
df = ks.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

# Transform the dataset: one row per date/country
date_format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = ks.to_datetime(df['DateTime'], format=date_format)
df['DateTime'] = df['DateTime'].dt.normalize()

# 1st option: allow operations on two different dataframes
# (ks.set_option('compute.ops_on_diff_frames', True))
# or: group by the column instead (slightly different behavior from pandas,
# since the group-by column becomes an index and disappears from the projection)
# df = df.sort_values('DateTime').groupby(df['DateTime']).max()
df = df.sort_values('DateTime').groupby(['DateTime'], as_index=False).max()

# df = df.set_index(pd.DatetimeIndex(df['DateTime'])).drop(['DateTime'], axis=1)
df = df.set_index('DateTime')
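# Minimal illustration (synthetic data) of the as_index note above: with the
# default as_index=True the group key becomes the index, while as_index=False
# keeps it as a regular column, which is the projection the script wants.
import databricks.koalas as ks

kdf = ks.DataFrame({'DateTime': ['2020-07-01', '2020-07-01', '2020-07-02'],
                    'pct': [10, 30, 20]})
print(kdf.groupby('DateTime').max())                  # 'DateTime' is the index
print(kdf.groupby('DateTime', as_index=False).max())  # 'DateTime' stays a column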
def process_log_data(spark, input_data, output_data):
    """Process log data to create the users, time, and songplays tables."""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays
    df = log_kdf.filter(log_kdf.page == "NextSong")

    # extract columns for users table
    users_table = ks.sql("""
        SELECT DISTINCT userId, firstName, lastName, gender, level
        FROM {df}""")

    # write users table to parquet files
    (users_table.to_spark()
     .write.parquet(f'{output_data}/users', mode="overwrite"))

    # create timestamp column from original timestamp column
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ns')

    # create datetime column from original timestamp column
    df['datetime'] = ks.to_datetime(df['ts'])

    # extract columns to create time table
    time_table = ks.sql("""
        SELECT DISTINCT datetime as start_time,
               extract(day from datetime) as day,
               extract(week from datetime) as week,
               extract(month from datetime) as month,
               extract(year from datetime) as year,
               extract(hour from datetime) as hour
        FROM {df}
    """)

    # enable assignments that combine columns from different DataFrames
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday column
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    (time_table.to_spark()
     .write.partitionBy('year', 'month').parquet('time/'))

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'])

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = ks.sql("""
        SELECT DISTINCT
            row_number() over (ORDER BY e.userId) songplay_id,
            e.ts AS start_time,
            extract(month from e.ts) as month,
            extract(year from e.ts) as year,
            e.userId AS user_id,
            e.level AS level,
            s.song_id AS song_id,
            s.artist_id AS artist_id,
            e.sessionId as session_id,
            e.location AS location,
            e.userAgent AS user_agent
        FROM {log_kdf} as e
        JOIN {song_df} as s
          ON (e.artist = s.artist_name
              AND e.song = s.title
              AND e.length = s.duration)
        WHERE e.page = 'NextSong'
    """)

    # write songplays table to parquet files partitioned by year and month
    (songplays_table.to_spark()
     .write.partitionBy("year", "month").parquet(
         f'{output_data}/songplays', mode="overwrite"))
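# Why 'compute.ops_on_diff_frames' is set above: assigning a column from one
# Koalas DataFrame into another implies a join between two separate Spark
# plans, which Koalas rejects by default. A minimal, hedged illustration:
import databricks.koalas as ks

kdf1 = ks.DataFrame({'a': [1, 2, 3]})
kdf2 = ks.DataFrame({'b': [4, 5, 6]})

ks.reset_option('compute.ops_on_diff_frames')
try:
    kdf1['b'] = kdf2['b']   # raises ValueError while the option is off
except ValueError as e:
    print(e)

ks.set_option('compute.ops_on_diff_frames', True)
kdf1['b'] = kdf2['b']       # now performed via an index-based join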