def test_to_datetime(self):
    pdf = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
    kdf = ks.from_pandas(pdf)
    dict_from_pdf = pdf.to_dict()

    self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
    self.assert_eq(pd.to_datetime(dict_from_pdf), ks.to_datetime(dict_from_pdf))
    self.assert_eq(pd.to_datetime(1490195805, unit='s'),
                   ks.to_datetime(1490195805, unit='s'))
    self.assert_eq(pd.to_datetime(1490195805433502912, unit='ns'),
                   ks.to_datetime(1490195805433502912, unit='ns'))
    self.assert_eq(
        pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')),
        ks.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')))
def test_to_datetime(self): pdf = pd.DataFrame({ "year": [2015, 2016], "month": [2, 3], "day": [4, 5] }) kdf = ks.from_pandas(pdf) dict_from_pdf = pdf.to_dict() self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf)) self.assert_eq(pd.to_datetime(dict_from_pdf), ks.to_datetime(dict_from_pdf)) self.assert_eq(pd.to_datetime(1490195805, unit="s"), ks.to_datetime(1490195805, unit="s")) self.assert_eq( pd.to_datetime(1490195805433502912, unit="ns"), ks.to_datetime(1490195805433502912, unit="ns"), ) self.assert_eq( pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")), ks.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")), )
def test_to_datetime(self):
    pdf = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
    kdf = ks.from_pandas(pdf)
    dict_from_pdf = pdf.to_dict()

    self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
    self.assert_eq(pd.to_datetime(dict_from_pdf), ks.to_datetime(dict_from_pdf))
def test_to_datetime(self):
    pdf = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
    kdf = koalas.from_pandas(pdf)

    self.assert_eq(pd.to_datetime(pdf), koalas.to_datetime(kdf))

    s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 100)
    ds = koalas.from_pandas(s)

    self.assert_eq(pd.to_datetime(s, infer_datetime_format=True),
                   koalas.to_datetime(ds, infer_datetime_format=True))
def extract_time_features(df):
    # Convert the raw epoch-millisecond column to a timestamp, then derive time parts.
    df['timestamp'] = ks.to_datetime(df.ts, unit='ms')
    df['hour'] = df.timestamp.dt.hour
    df['dayofweek'] = df.timestamp.dt.dayofweek
    df['year'] = df.timestamp.dt.year
    df['month'] = df.timestamp.dt.month
    return df
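# A minimal usage sketch for the helper above (an assumption, not part of the original
# snippet): it presumes `ks` is databricks.koalas and that the frame carries an
# epoch-millisecond `ts` column; the sample values are hypothetical.
import databricks.koalas as ks

logs = ks.DataFrame({'ts': [1543537327796, 1543540121000]})
features = extract_time_features(logs)
print(features[['timestamp', 'hour', 'dayofweek', 'year', 'month']].to_pandas())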
def test_to_datetime(self):
    pser = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 100)
    kser = ks.from_pandas(pser)
    self.assert_eq(pd.to_datetime(pser, infer_datetime_format=True),
                   ks.to_datetime(kser, infer_datetime_format=True))
# %% [markdown]
# ## Column Manipulations

# %% [markdown]
# ### Change column type

# %%
df["INSPECTION DATE"] = df["INSPECTION DATE"].astype(str)

# %% [markdown]
# ### Creating New Columns
# Using __DataFrame.assign__, a new column can be created; it returns a new dataframe with
# the new column attached to the previous one. In the following, we convert the
# *INSPECTION DATE* column from __str__ to a __datetime__ column.

# %%
df_new = df.assign(inspection_date_dt=lambda x: ks.to_datetime(
    x["INSPECTION DATE"], format="%m/%d/%Y", errors="coerce"))
df_new.head(3)

# %%
df_new["inspection_date_dt"].head()

# %% [markdown]
# ### Filter By Datetime

# %%
df_new.loc[df_new["inspection_date_dt"].dt.year > 2017].head()

# %%
df_new["BORO"].value_counts()

# %% [markdown]
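# As a small aside (not part of the original notebook): with errors="coerce", values that
# do not match the given format become NaT instead of raising. The sample values below are
# hypothetical, only to illustrate the behaviour.

# %%
demo = ks.Series(["01/31/2018", "not a date", "12/01/2019"])
ks.to_datetime(demo, format="%m/%d/%Y", errors="coerce")  # "not a date" -> NaT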
from datetime import datetime

ks.set_option('compute.default_index_type', 'distributed')

# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

## Load the dataset
df = ks.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

## Reshape the dataset: one row per date/country
fmt = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = ks.to_datetime(df['DateTime'], format=fmt)
df['DateTime'] = df['DateTime'].dt.normalize()

#### First change: allow operations on two different dataframes (ks.set_option('compute.ops_on_diff_frames', True))
#### or do a groupby on the column (slightly different behaviour from pandas, since the
#### group-by column becomes an index and disappears from the projection)
# df = df.sort_values('DateTime').groupby(df['DateTime']).max()
df = df.sort_values('DateTime').groupby(['DateTime'], as_index=False).max()

# df = df.set_index(pd.DatetimeIndex(df['DateTime'])).drop(['DateTime'], axis=1)
df = df.set_index('DateTime')

#### Second change: "The method `pd.DataFrame.resample()` is not implemented yet."
#### (being tracked: https://github.com/databricks/koalas/issues/1562)
#### In that case we have to drop down to Spark directly, or go through pandas instead ...
df = df.to_pandas()
wearing_mask_in_public_data = df.resample('1D').pad()

#### Back to a Koalas dataframe
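# (Assumed continuation, not in the original snippet: the comment above announces the
# conversion back to Koalas, which would typically be done with ks.from_pandas.)
wearing_mask_in_public_data = ks.from_pandas(wearing_mask_in_public_data)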
def process_log_data(spark, input_data, output_data):
    """Process log_data to create the users, time, and songplays tables."""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays
    df = log_kdf[log_kdf.page == "NextSong"]

    # extract columns for users table
    users_table = ks.sql("""
        SELECT DISTINCT userId, firstName, lastName, gender, level
        FROM {df}""")

    # write users table to parquet files
    users_table.to_spark().write.parquet(f'{output_data}/users', mode="overwrite")

    # create timestamp column from original timestamp column
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ns')

    # create datetime column from original timestamp column
    df['datetime'] = ks.to_datetime(df['ts'])

    # extract columns to create time table
    time_table = ks.sql("""
        SELECT DISTINCT
            datetime as start_time,
            extract(day from datetime) as day,
            extract(week from datetime) as week,
            extract(month from datetime) as month,
            extract(year from datetime) as year,
            extract(hour from datetime) as hour
        FROM {df}
        """)

    # to enable join on table
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday column
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    time_table.to_spark().write.partitionBy('year', 'month').parquet('time/')

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'])

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = ks.sql("""
        SELECT DISTINCT
            row_number() over (ORDER BY e.userId) songplay_id,
            e.ts AS start_time,
            extract(month from e.ts) as month,
            extract(year from e.ts) as year,
            e.userId AS user_id,
            e.level AS level,
            s.song_id AS song_id,
            s.artist_id AS artist_id,
            e.sessionId as session_id,
            e.location AS location,
            e.userAgent AS user_agent
        FROM {log_kdf} as e
        JOIN {song_df} as s
            ON (e.artist = s.artist_name AND e.song = s.title AND e.length = s.duration)
        WHERE e.page = 'NextSong'
        """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.to_spark().write.partitionBy("year", "month").parquet(
        f'{output_data}/songplayes', mode="overwrite")
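# A hedged driver sketch for the ETL function above; the session settings and the
# input/output paths are illustrative assumptions, not taken from the original project.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("log-etl").getOrCreate()
    process_log_data(spark, input_data="data/", output_data="output")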