def shuffle_set(ray_df):
    """Run the "shuffle" workload: re-index by pickup time, then peek at both ends.

    The frame is re-indexed on its ``tpep_pickup_datetime`` column, the new
    index is passed through ``pd.to_datetime``, and ``head()`` and ``tail()``
    are called on the result.  Both previews are thrown away.

    Args:
        ray_df: Data frame that has a ``tpep_pickup_datetime`` column
            (presumably a ``ray.dataframe`` or pandas DataFrame -- confirm
            against the caller).

    Returns:
        None.  Only the work performed matters to the caller.
    """
    # Shuffle: move the pickup timestamp column into the index.  The result is
    # bound to a new local, so the caller's name is never rebound (whether the
    # caller's object is left untouched depends on set_index returning a new
    # frame, as pandas does -- TODO confirm for Ray).
    by_pickup = ray_df.set_index('tpep_pickup_datetime')

    # Convert: replace the index with its datetime-parsed form.
    by_pickup.index = pd.to_datetime(by_pickup.index)

    # Accesses: touch the first and last rows; the values are not needed.
    by_pickup.head()
    by_pickup.tail()
def timeseries_set(ray_df):
    """Run the time-series workload: daily resample mean and 10-row rolling mean.

    The frame is re-indexed on ``tpep_pickup_datetime``, the index is passed
    through ``pd.to_datetime``, and two aggregations are computed on the
    ``passenger_count`` column: the mean of a ``'1d'`` resample and the mean
    of a ``rolling(10)`` window.  Neither aggregate is kept.

    Args:
        ray_df: Data frame with ``tpep_pickup_datetime`` and
            ``passenger_count`` columns (presumably a ``ray.dataframe`` or
            pandas DataFrame -- confirm against the caller).

    Returns:
        None.  Only the work performed matters to the caller.
    """
    # Shuffle and convert: index by pickup time, parsed as datetimes.
    by_pickup = ray_df.set_index('tpep_pickup_datetime')
    by_pickup.index = pd.to_datetime(by_pickup.index)

    # Resample: mean passenger count per '1d' bucket.
    by_pickup.passenger_count.resample('1d').mean()

    # Rolling aggregation: mean passenger count over a window of 10 rows.
    by_pickup.passenger_count.rolling(10).mean()
def aggregation_set(ray_df):
    """Run the aggregation workload: convert, count, group, filter, derive.

    Steps, in order:

    1. Overwrite ``ray_df.tpep_pickup_datetime`` with its
       ``pd.to_datetime`` conversion.  This assigns onto the frame that was
       passed in, so the caller sees the converted column.
    2. Take ``len(ray_df)``.
    3. Build a groupby keyed on ``passenger_count``.
    4. Keep the rows where both ``tip_amount`` and ``fare_amount`` are
       positive and add a ``tip_fraction`` column (tip / fare) to that
       filtered frame.

    The results of steps 2-4 are discarded.

    Args:
        ray_df: Data frame with ``tpep_pickup_datetime``,
            ``passenger_count``, ``tip_amount`` and ``fare_amount`` columns
            (presumably a ``ray.dataframe`` or pandas DataFrame -- confirm
            against the caller).

    Returns:
        None.
    """
    # Convert and len.  Attribute-style assignment is kept on purpose; the
    # column is replaced on the caller's frame.
    pickup_as_datetime = pd.to_datetime(ray_df.tpep_pickup_datetime)
    ray_df.tpep_pickup_datetime = pickup_as_datetime
    len(ray_df)

    # Groupby.  Only the groupby object is built: Ray does not support
    # selecting a column from it, so the follow-up aggregation
    # (mean of trip_distance per group) is not run.
    ray_df.groupby(ray_df.passenger_count)

    # Filter, new column.
    tipped = ray_df.tip_amount > 0
    charged = ray_df.fare_amount > 0
    paying_trips = ray_df[tipped & charged]
    paying_trips["tip_fraction"] = (
        paying_trips.tip_amount / paying_trips.fare_amount
    )
import ray.dataframe as pd
# Alternate import, left disabled (presumably for running the same steps
# against plain pandas -- confirm):
#import pandas as pd


def _preview(label, frame):
    """Print *label*, then the first three rows of *frame*."""
    print(label)
    print(frame.head(3))


# Probe script for time-series operations.  Every stage prints its message
# only after the operation has returned, so the last message on screen
# identifies the last operation that completed.
print('############ 12: Test Ray TimeSeries #############')

# Stage 1: load the CSV.
ray_df = pd.read_csv("yellow_tripdata_2015-01-01.csv")
_preview(' Read_CSV finished. Result:', ray_df)

# Stage 2: move the pickup timestamp column into the index.
ray_df = ray_df.set_index('tpep_pickup_datetime')
_preview(' set_index finished. Result:', ray_df)

# Stage 3: convert the index with to_datetime.
ray_df.index = pd.to_datetime(ray_df.index)
_preview(' to_datetime finished. Result:', ray_df)

# Stage 4: '1d' resample of passenger_count, then its mean.
ray_resamp = ray_df.passenger_count.resample('1d')
print(' resample finished')
ray_mn = ray_resamp.mean()
print(' mean of resample finished. Result:')
print(ray_mn)

# Stage 5: rolling window of 10 over passenger_count.  What gets printed here
# is the rolling object itself; its mean is computed afterwards and not
# printed.
ray_roll = ray_df.passenger_count.rolling(10)
print(' rolling aggregation finished. Result:')
print(ray_roll)
result = ray_roll.mean()
import ray.dataframe as pd
# Alternate import, left disabled (presumably for running the same steps
# against plain pandas -- confirm):
#import pandas as pd

# Column converted by this probe.
_PICKUP_COLUMN = 'tpep_pickup_datetime'

# Probe script: convert a single column with to_datetime using bracket-style
# column access, printing the first three rows before and after.
print('############ 3: Test Ray Convert [Column] toDateTime #############')

# Load the CSV and show what was read.
ray_df = pd.read_csv("yellow_1of3.csv")
print(' Read_CSV finished. Result:')
print(ray_df.head(3))

# Convert the column, then write it back under the same key.
_converted_pickup = pd.to_datetime(ray_df[_PICKUP_COLUMN])
ray_df[_PICKUP_COLUMN] = _converted_pickup

# The preview is printed first; the closing message refers back to it.
print(ray_df.head(3))
print(' to_datetime(df[column]) finished. Result above.')
"!!! to_datetime(df.index): Ray may yield ValueErrors in later parts !!!" ) # If skip this, may also need to skip day_of_week/hour calculation step 09 later. Remember TimeSeries section requires DateTime conversion if runToCompletion or True: # Always skip this print( "!!! Skipping this for now to guarantee run to completion of this script !!!" ) printTimer(total_t, True, True) else: start_t = datetime.datetime.now() # Always hangs? # ray_df['tpep_pickup_datetime'] = pd.to_datetime(ray_df['tpep_pickup_datetime']) # Alternative code 1 - Dot syntax for column seems to work here. But takes 17 seconds! ray_df.tpep_pickup_datetime = pd.to_datetime( ray_df.tpep_pickup_datetime) # # Alternative code 2 - Changes index. May yield ValueErrors in later parts # ray_df = ray_df.set_index('tpep_pickup_datetime') # ray_df.index = pd.to_datetime(ray_df.index) total_t = datetime.datetime.now() - start_t result = ray_df.head(topN) printRes(result, True) printTimer(total_t, True) if runPandas: start_t = datetime.datetime.now() pd_df['tpep_pickup_datetime'] = old_pd.to_datetime( pd_df['tpep_pickup_datetime']) total_t = datetime.datetime.now() - start_t