def shuffle_set(ray_df):
	"""Index the frame by pickup time, then read its first and last rows.

	Works on the frame returned by ``set_index``; the rows that are read
	are discarded and the function returns None.
	"""
	pickup_column = 'tpep_pickup_datetime'

	# Move the pickup-time column into the index, then parse the index
	# labels as datetimes.
	indexed = ray_df.set_index(pickup_column)
	parsed_index = pd.to_datetime(indexed.index)
	indexed.index = parsed_index

	# Touch both ends of the re-indexed frame; only the access matters.
	indexed.head()
	indexed.tail()
def timeseries_set(ray_df):
	"""Run the time-series workload on ``passenger_count``.

	Indexes the frame by pickup time, takes the mean of a '1d' resample
	and the mean of a 10-row rolling window. Both means are computed and
	discarded; the function returns None.
	"""
	# Index by pickup time and parse the index labels as datetimes.
	by_pickup = ray_df.set_index('tpep_pickup_datetime')
	by_pickup.index = pd.to_datetime(by_pickup.index)

	# Mean per '1d' resampling bucket.
	daily_buckets = by_pickup.passenger_count.resample('1d')
	daily_buckets.mean()

	# Mean over a rolling window of 10 rows.
	ten_row_window = by_pickup.passenger_count.rolling(10)
	ten_row_window.mean()
def aggregation_set(ray_df):
	"""Run the aggregation workload: convert, count, group, filter, derive.

	The pickup-time column is replaced with parsed datetimes on the frame
	that was passed in. Every other result is computed and discarded, and
	the function returns None.
	"""
	# Parse the pickup-time column (dot syntax on both sides) and take the
	# row count.
	parsed_pickups = pd.to_datetime(ray_df.tpep_pickup_datetime)
	ray_df.tpep_pickup_datetime = parsed_pickups
	len(ray_df)

	# Build the groupby keyed on passenger_count. Selecting a column from
	# it (trip_distance.mean()) is left out because Ray does not support it.
	ray_df.groupby(ray_df.passenger_count)

	# Keep rows with a positive tip and a positive fare, then add the
	# tip/fare ratio as a new column on the filtered frame.
	has_tip = ray_df.tip_amount > 0
	has_fare = ray_df.fare_amount > 0
	tipped_rides = ray_df[has_tip & has_fare]
	tipped_rides["tip_fraction"] = tipped_rides.tip_amount / tipped_rides.fare_amount
import ray.dataframe as pd
#import pandas as pd

# Step-by-step probe of the time-series operations on a Ray dataframe
# (`pd` is `ray.dataframe` here). A marker is printed after each stage so the
# console output shows how far the run got.
print('############ 12: Test Ray TimeSeries #############')

# Load the trip data from a CSV given by a relative path.
ray_df = pd.read_csv("yellow_tripdata_2015-01-01.csv")

print('    Read_CSV finished. Result:')
print(ray_df.head(3))

# Move the pickup-time column into the index.
ray_df = ray_df.set_index('tpep_pickup_datetime') 

print('    set_index finished. Result:')
print(ray_df.head(3))

# Parse the index labels as datetimes.
ray_df.index = pd.to_datetime(ray_df.index)
print('    to_datetime finished. Result:')
print(ray_df.head(3))

# Build the '1d' resampler over passenger_count; no aggregate is taken yet.
ray_resamp = ray_df.passenger_count.resample('1d')
print('    resample finished')

# Aggregate each resampling bucket with mean().
ray_mn = ray_resamp.mean()
print('    mean of resample finished. Result:')
print(ray_mn)

# Build a rolling window of 10 rows over passenger_count. What gets printed
# below is the rolling object itself, not an aggregate of it.
ray_roll = ray_df.passenger_count.rolling(10)
print('    rolling aggregation finished. Result:')
print(ray_roll)

# NOTE(review): the rolling mean is computed but never printed in the visible
# part of this script — confirm whether a final print is missing.
result = ray_roll.mean()
# Example #5
# 0
import ray.dataframe as pd
#import pandas as pd

# Probe of converting a single column to datetimes on a Ray dataframe
# (`pd` is `ray.dataframe` here), using bracket syntax for the column.
print('############ 3: Test Ray Convert [Column] toDateTime #############')

# Load the trip data from a CSV given by a relative path.
ray_df = pd.read_csv("yellow_1of3.csv")

print('    Read_CSV finished. Result:')
print(ray_df.head(3))

# Replace the pickup-time column with its parsed datetimes, then show the
# first rows so the converted column can be inspected.
ray_df['tpep_pickup_datetime'] = pd.to_datetime(ray_df['tpep_pickup_datetime']) 
print(ray_df.head(3))

# The completion marker is printed after the sample, hence "Result above".
print('    to_datetime(df[column]) finished. Result above.')
# Example #6
# 0
    "!!!     to_datetime(df.index): Ray may yield ValueErrors in later parts !!!"
)
# Ray step: convert the pickup-time column to datetimes.
# If this step is skipped, the day_of_week/hour calculation (step 09) may also
# need to be skipped later. Remember that the TimeSeries section requires the
# DateTime conversion.
if runToCompletion or True:  # `or True` makes this branch unconditional: the step is always skipped
    print(
        "!!!     Skipping this for now to guarantee run to completion of this script !!!"
    )
    # NOTE(review): total_t is not assigned in this branch, so the value passed
    # here is whatever an earlier part of the script left in it — confirm that
    # this is what printTimer should report for a skipped step.
    printTimer(total_t, True, True)
else:
    # Not reachable while the condition above contains `or True`.
    start_t = datetime.datetime.now()

    # Bracket syntax for the column: always hangs?
    # ray_df['tpep_pickup_datetime'] = pd.to_datetime(ray_df['tpep_pickup_datetime'])

    # Alternative code 1 - Dot syntax for the column seems to work here, but
    # was observed to take 17 seconds.
    ray_df.tpep_pickup_datetime = pd.to_datetime(
        ray_df.tpep_pickup_datetime)  # dot-syntax column access on both sides

    # Alternative code 2 - Converts via the index instead of the column.
    # May yield ValueErrors in later parts.
    # ray_df = ray_df.set_index('tpep_pickup_datetime')
    # ray_df.index = pd.to_datetime(ray_df.index)

    # Elapsed time covers only the conversion; the head() call below is
    # outside the timed span.
    total_t = datetime.datetime.now() - start_t
    result = ray_df.head(topN)
    printRes(result, True)
    printTimer(total_t, True)

if runPandas:
    start_t = datetime.datetime.now()
    pd_df['tpep_pickup_datetime'] = old_pd.to_datetime(
        pd_df['tpep_pickup_datetime'])
    total_t = datetime.datetime.now() - start_t