Example #1
    def test_to_datetime(self):
        pdf = pd.DataFrame({
            'year': [2015, 2016],
            'month': [2, 3],
            'day': [4, 5]
        })
        kdf = ks.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ks.to_datetime(dict_from_pdf))

        self.assert_eq(pd.to_datetime(1490195805, unit='s'),
                       ks.to_datetime(1490195805, unit='s'))
        self.assert_eq(pd.to_datetime(1490195805433502912, unit='ns'),
                       ks.to_datetime(1490195805433502912, unit='ns'))

        self.assert_eq(
            pd.to_datetime([1, 2, 3],
                           unit='D',
                           origin=pd.Timestamp('1960-01-01')),
            ks.to_datetime([1, 2, 3],
                           unit='D',
                           origin=pd.Timestamp('1960-01-01')))
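These test excerpts call pd, ks, and self.assert_eq without their surrounding setup. A minimal sketch of the assumed preamble (the test base class and its import path are assumptions, not shown in the source):

import pandas as pd
import databricks.koalas as ks
# assumed: koalas test base class that provides assert_eq
from databricks.koalas.testing.utils import ReusedSQLTestCase


class ToDatetimeTest(ReusedSQLTestCase):
    ...  # the test_to_datetime methods shown above would be defined here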
Example #2
    def test_to_datetime(self):
        pdf = pd.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5]
        })
        kdf = ks.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ks.to_datetime(dict_from_pdf))

        self.assert_eq(pd.to_datetime(1490195805, unit="s"),
                       ks.to_datetime(1490195805, unit="s"))
        self.assert_eq(
            pd.to_datetime(1490195805433502912, unit="ns"),
            ks.to_datetime(1490195805433502912, unit="ns"),
        )

        self.assert_eq(
            pd.to_datetime([1, 2, 3],
                           unit="D",
                           origin=pd.Timestamp("1960-01-01")),
            ks.to_datetime([1, 2, 3],
                           unit="D",
                           origin=pd.Timestamp("1960-01-01")),
        )
Example #3
    def test_to_datetime(self):
        pdf = pd.DataFrame({
            'year': [2015, 2016],
            'month': [2, 3],
            'day': [4, 5]
        })
        kdf = ks.from_pandas(pdf)
        dict_from_pdf = pdf.to_dict()

        self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
        self.assert_eq(pd.to_datetime(dict_from_pdf),
                       ks.to_datetime(dict_from_pdf))
Example #4
    def test_to_datetime(self):
        pdf = pd.DataFrame({'year': [2015, 2016],
                            'month': [2, 3],
                            'day': [4, 5]})
        kdf = koalas.from_pandas(pdf)

        self.assert_eq(pd.to_datetime(pdf), koalas.to_datetime(kdf))

        s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 100)
        ds = koalas.from_pandas(s)

        self.assert_eq(pd.to_datetime(s, infer_datetime_format=True),
                       koalas.to_datetime(ds, infer_datetime_format=True))
Example #5
def extract_time_features(df):
    # ts is expected to hold epoch timestamps in milliseconds
    df['timestamp'] = ks.to_datetime(df.ts, unit='ms')
    df['hour'] = df.timestamp.dt.hour
    df['dayofweek'] = df.timestamp.dt.dayofweek
    df['year'] = df.timestamp.dt.year
    df['month'] = df.timestamp.dt.month   
    return df
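A minimal usage sketch for extract_time_features, assuming a Koalas DataFrame with an epoch-millisecond column named ts (the sample values below are made up):

import databricks.koalas as ks

df = ks.DataFrame({'ts': [1541105830796, 1541106106796]})  # hypothetical millisecond timestamps
df = extract_time_features(df)
df[['timestamp', 'hour', 'dayofweek', 'year', 'month']].head()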
Example #6
    def test_to_datetime(self):
        pser = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 100)
        kser = ks.from_pandas(pser)

        self.assert_eq(pd.to_datetime(pser, infer_datetime_format=True),
                       ks.to_datetime(kser, infer_datetime_format=True))
Example #7
# %% [markdown]
# ## Column Manipulations

# %% [markdown]
# ### Change column type

# %%
df["INSPECTION DATE"] = df["INSPECTION DATE"].astype(str)

# %% [markdown]
# ### Creating New Columns
#  Using __DataFrame.assign__, a new column can be created; it returns a new dataframe with the new column attached to the previous one. In the following, we convert the *INSPECTION DATE* column from __str__ to a __datetime__ column (errors="coerce" turns unparsable dates into NaT).

# %%
df_new = df.assign(inspection_date_dt=lambda x: ks.to_datetime(
    x["INSPECTION DATE"], format="%m/%d/%Y", errors="coerce"))
df_new.head(3)

# %%
df_new["inspection_date_dt"].head()

# %% [markdown]
# ### Filter By Datetime

# %%
(df_new.loc[df_new["inspection_date_dt"].dt.year > 2017].head())

# %%
df_new["BORO"].value_counts()

Example #8
# %%
from datetime import datetime

ks.set_option('compute.default_index_type', 'distributed')

# %% [markdown]
# ## YouGov - Wearing Mask in public

# %%
start = datetime.now()

## Load the dataset
df = ks.read_csv("gs://dask-vs-koalas/wearing_face_mask_public.csv", sep=";")

## Reshape the dataset: one row per date/country
format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = ks.to_datetime(df['DateTime'], format=format)
df['DateTime'] = df['DateTime'].dt.normalize()

#### First change: allow operations on two different dataframes (ks.set_option('compute.ops_on_diff_frames', True))
#### or group by the column (slightly different behavior from pandas: the group_by column becomes an index and disappears from the projection)
# df = df.sort_values('DateTime').groupby(df['DateTime']).max()
df = df.sort_values('DateTime').groupby(['DateTime'], as_index=False).max()
# df = df.set_index(pd.DatetimeIndex(df['DateTime'])).drop(['DateTime'], axis=1)
df = df.set_index('DateTime')

#### Second change: the method `pd.DataFrame.resample()` is not implemented yet (tracked at https://github.com/databricks/koalas/issues/1562)
#### in this case we have to drop down to Spark directly, or go through pandas ...
df = df.to_pandas()
wearing_mask_in_public_data = df.resample('1D').pad()

#### Back to a Koalas dataframe
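The snippet ends on this comment; a plausible continuation (an assumption, not part of the source) is converting the resampled pandas result back into a Koalas DataFrame:

wearing_mask_in_public_data = ks.from_pandas(wearing_mask_in_public_data)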
Example #9
def process_log_data(spark, input_data, output_data):
    """process log_data to create users, time ,songsplay table"""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays
    df = log_kdf.filter(log_kdf.page == "NextSong")

    # extract columns for users table
    users_table = ks.sql(""" SELECT 
                           DISTINCT
                           userId,
                           firstName,
                           lastName,
                           gender,
                           level 
                           FROM {df}""")

    # write users table to parquet files
    (users_table.to_spark().write.parquet(f'{output_data}/users',
                                          mode="overwrite"))

    # create timestamp column from original timestamp column
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ns')

    # create datetime column from original timestamp column
    df['datetime'] = ks.to_datetime(df['ts'])

    # extract columns to create time table
    time_table = (ks.sql("""
            SELECT
            DISTINCT
           datetime as start_time,
           extract(day from datetime) as day,
           extract(week from datetime) as week,
           extract(month from datetime) as month,
           extract(year from datetime) as year,
           extract(hour from datetime) as hour
           from {df}
                        """))

    # to enable join on table
    # allow operations across different dataframes (needed to assign df.datetime into time_table)
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday columns
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    (time_table.to_spark().write.partitionBy('year', 'month').parquet('time/'))

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'])

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (ks.sql(""" SELECT 
                             DISTINCT
                             row_number() over (ORDER BY e.userId) songplay_id,
                             e.ts AS start_time,
                             extract(month from e.ts) as month,
                             extract(year from e.ts) as year,
                             e.userId AS user_id,
                             e.level AS level,
                             s.song_id AS song_id,
                             s.artist_id AS artist_id,
                             e.sessionId as session_id,
                             e.location AS location,
                             e.userAgent AS user_agent
                             FROM {log_kdf} as e join {song_df} as s ON
                             (e.artist = s.artist_name AND 
                             e.song = s.title AND 
                             e.length= s.duration)
                             WHERE e.page='NextSong'

             """))

    # write songplays table to parquet files partitioned by year and month
    (songplays_table.to_spark().write.partitionBy("year", "month").parquet(
        f'{output_data}/songplayes', mode="overwrite"))
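A hedged invocation sketch for process_log_data; the SparkSession setup and the paths are assumptions (the function currently hard-codes its input globs and only uses output_data when writing):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("sparkify_etl").getOrCreate()  # assumed app name
process_log_data(spark, input_data="data/", output_data="output")   # hypothetical paths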