Example #1
def test_to_datetime_errors(data):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        gd_data = pd_data

    try:
        pd.to_datetime(pd_data)
    except Exception as e:
        with pytest.raises(type(e), match=re.escape(str(e))):
            cudf.to_datetime(gd_data)
    else:
        raise AssertionError("Was expecting `pd.to_datetime` to fail")
Example #2
def convert_datestring_to_days(df):
    import cudf

    df["d_date"] = (cudf.to_datetime(
        df["d_date"],
        format="%Y-%m-%d").astype("datetime64[s]").astype("int64") / 86400)
    df["d_date"] = df["d_date"].astype("int64")
    return df
Example #3
def test_to_datetime_errors(data):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        gd_data = pd_data

    exception_type = None
    try:
        pd.to_datetime(pd_data)
    except Exception as e:
        exception_type = type(e)

    if exception_type is None:
        raise TypeError("Was expecting `pd.to_datetime` to fail")

    with pytest.raises(exception_type):
        cudf.to_datetime(gd_data)
Example #4
def test_to_datetime_units(data, unit):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        gd_data = pd_data

    expected = pd.to_datetime(pd_data, unit=unit)
    actual = cudf.to_datetime(gd_data, unit=unit)

    assert_eq(actual, expected)
Example #5
def test_to_datetime_format(data, format, infer_datetime_format):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        gd_data = pd_data

    expected = pd.to_datetime(pd_data,
                              format=format,
                              infer_datetime_format=infer_datetime_format)
    actual = cudf.to_datetime(gd_data,
                              format=format,
                              infer_datetime_format=infer_datetime_format)

    assert_eq(actual, expected)
Example #6
def test_to_datetime_not_implemented():

    with pytest.raises(NotImplementedError):
        cudf.to_datetime([], exact=False)

    with pytest.raises(NotImplementedError):
        cudf.to_datetime([], origin="julian")

    with pytest.raises(NotImplementedError):
        cudf.to_datetime([], yearfirst=True)
Example #7
def make_capwin_dataset(**kwargs):
    def drop_index(df):
        return df.reset_index(drop=True)

    def smoosh(df):
        size = sum([df[x].dtype.itemsize for x in df])
        data = drop_index(drop_index(df).stack()).data
        dtype = cudf.utils.dtypes.min_unsigned_type(0, size * 8)
        return cudf.core.column.NumericalColumn(data, dtype=dtype)

    def add_edge_colors(edges, category):
        colors = drop_index(
            category_to_color(
                edges[category],
                color_palette=[
                    #  ADDRESS   AUTH KEYS CREDENTIALS       EMAIL      FALSE
                    4294967091,
                    4294410687,
                    4293138972,
                    4281827000,
                    33554431
                ]).astype(np.uint32))
        edge_colors = cudf.DataFrame({
            "src": drop_index(colors),
            "dst": drop_index(colors)
        })
        return edges.assign(
            color=smoosh(edge_colors).astype(np.uint64),
            src_color=colors,
        )

    df = cudf.read_csv("data/pii_sample_for_viz.csv")
    df = df[["src_ip", "dest_ip", "pii", "timestamp"]]
    df["timestamp"] = cudf.to_datetime(df["timestamp"],
                                       format="%m/%d/%y %H:%M")
    # Create graph
    graph, nodes, edges = from_cudf_edgelist(df, "src_ip", "dest_ip")
    # Add vis components
    nodes = nodes.rename({"node": "name"}, axis=1, copy=False)
    nodes = annotate_nodes(graph, nodes, edges)
    # add edge colors
    edges = add_edge_colors(edges, "pii")
    print(edges.query("src_color != 33554431")["src"].value_counts())
    print(edges.query("src_color != 33554431")["dst"].value_counts())
    # add edge names
    edges["name"] = edges["src_ip"] + " -> " + edges["dest_ip"] + \
        ("\nPII: " + edges["pii"]).replace("\nPII: FALSE", "")
    return graph, nodes, edges
Example #8
def test_cudf_to_datetime(data, dayfirst, infer_datetime_format):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        if type(pd_data).__module__ == np.__name__:
            gd_data = cp.array(pd_data)
        else:
            gd_data = pd_data

    expected = pd.to_datetime(pd_data,
                              dayfirst=dayfirst,
                              infer_datetime_format=infer_datetime_format)
    actual = cudf.to_datetime(gd_data,
                              dayfirst=dayfirst,
                              infer_datetime_format=infer_datetime_format)

    assert_eq(actual, expected)
Example #9
def sse_numba(symbol, look_back):
    t1 = time.time()
    #print("start")
    #print("Symbol--------------------")
    #print(symbol)
    #print(look_back)
    
    
    forward_days = 5
    num_periods  = 205
    train_fraction = float(2.0 / 3.0)
    #print("train_fraction-------",train_fraction)
    if train_fraction > 1. or train_fraction < 0.:
        raise ValueError("ERROR:: train_fraction must have a value between 0 and 1")

    test_fraction = (1 - train_fraction)
    #print("test fraction--------",test_fraction)
    assert(test_fraction <= 1.0 and test_fraction > 0.)

    use_stateful_model = True
    #print(use_stateful_model)
    #file = "stock_market_data-AAPL.csv"
    file = "stock_market_data-%s.csv"%symbol
    dataset = cudf.read_csv(file)
    print(dataset)
    #Drop the unnamed index column ("Unnamed: 0") from the dataset
    dataset.drop("Unnamed: 0", axis=1, inplace=True)
    #Assert that open, high, low, close are numeric types
    t3 = time.time()
    print("time with numba :", t3-t1)

    assert (dataset["Open"].dtype  == np.dtype('float64') or dataset["Open"].dtype  == np.dtype('float32'))
    assert (dataset["High"].dtype  == np.dtype('float64') or dataset["High"].dtype  == np.dtype('float32'))
    assert (dataset["Low"].dtype   == np.dtype('float64') or dataset["Low"].dtype   == np.dtype('float32'))
    assert (dataset["Close"].dtype == np.dtype('float64') or dataset["Close"].dtype == np.dtype('float32'))

    dataset["Date"] = cudf.to_datetime(dataset["Date"])

    #assert(pd.api.types.is_datetime64_any_dtype(dataset["Date"]))
    dataset_close = dataset[["Date", "Close"]]
    print(dataset_close)
    print(type(dataset_close))
    
    dataset_close_diffed = difference(dataset_close["Close"], 1)

    num_test_start_points  = int(np.floor(dataset_close["Close"].shape[0] * test_fraction))

    num_train_start_points = dataset_close.shape[0] - num_test_start_points

    data_scaler, dataset_train_close, dataset_test_close = prepare_data(
        dataset_close["Close"],
        n_test=int(np.floor(dataset_close["Close"].shape[0] * test_fraction)),
        n_lag=int(look_back),
        n_seq=int(forward_days))
    #print(data_scaler)
    #print(dataset_train_close)
    #print(dataset_test_close)
    X_train_close = [] #np.empty(shape=(0, look_back, 1))
    y_train_close = [] #np.empty(shape=(0, forward_days, 1))
    X_test_close = [] #np.empty(shape=(0, look_back, 1))
    y_test_close = [] #np.empty(shape=(0, forward_days, 1))

    #subscript t in this case is to denote training dataset
    X_t, y_t = dataset_train_close[:, 0:look_back], dataset_train_close[:, look_back:]
    X_train_close = X_t.reshape(X_t.shape[0], 1, X_t.shape[1])
    y_train_close = y_t
    print(y_train_close)
    print(type(y_train_close))
    y_train_close = np.array(y_train_close).astype(np.float32)
    y_train_close = y_train_close.astype(np.float32)
    print(type(y_train_close))
    X_train_close = np.array(X_train_close).astype(np.float32)
    print(type(X_train_close))


    #v is for validation, just another suffix for "test" dataset
    X_v, y_v = dataset_test_close[:, 0:look_back], dataset_test_close[:, look_back:]
    X_test_close = X_v.reshape(X_v.shape[0], 1, X_v.shape[1])
    y_test_close = y_v
    print(y_test_close)
    print(type(y_test_close))
    y_test_close = np.asarray(y_test_close).astype(np.float32)
    print(type(y_test_close))



    multistep_model_close = build_lstm_network(X_seq_length=X_train_close.shape[1],
                                              X_feature_length=X_train_close.shape[2],
                                              y_seq_length=y_train_close.shape[1],
                                              num_sequences_per_batch=1,
                                              num_cell_units=12,
                                              keep_state=use_stateful_model)
    #print(multistep_model_close)
    t2 = time.time()
    #Set the number of epochs for training
    NUM_EPOCHS_TRAINING = 28
    #print(NUM_EPOCHS_TRAINING)
    if use_stateful_model:
        training_history = []
        for i in range(NUM_EPOCHS_TRAINING):
            print("i = {}".format(i))
            training_history.append(multistep_model_close.fit(X_train_close, y_train_close, epochs=1, batch_size=1, verbose=1, shuffle=False, use_multiprocessing=True, workers=8))
            multistep_model_close.reset_states()
    else:
        training_history = multistep_model_close.fit(X_train_close, y_train_close, epochs=NUM_EPOCHS_TRAINING, batch_size=0, validation_split=0.25, verbose=1, shuffle=False, use_multiprocessing=True, workers=8)
    print("it's over")
    
    forecasts = make_forecasts(multistep_model_close, 
                               n_batch=1,
                               train=X_train_close,
                               test=X_test_close,
                               n_test_instances=X_test_close.shape[0],
                               n_lag= 200,
                               n_seq=forward_days)
    #print("forecasts----------------------")
    #print(forecasts)
    inverted_forecasts = inverse_transform(dataset_close["Close"],
                                           forecasts,
                                           scaler=data_scaler,
                                           n_test=X_test_close.shape[0] + forward_days - 1)
    
    #print("inverted_forecasts------------------------------")
    #print(inverted_forecasts)
    #print(dataset_test_close)
    #actual = [row[200:] for row in dataset_test_close]
    actual = Actual(look_back,dataset_test_close)
    #print("actual")
    #print(actual)
    inverted_actual = inverse_transform(dataset_close["Close"],
                                        actual,
                                        scaler=data_scaler,
                                        n_test=X_test_close.shape[0] + forward_days - 1)
  
    #print("inverted_actual-------------------------------------")
    #print(inverted_actual)

    
    
    
    multistep_model_rmse_metrics = evaluate_forecasts(inverted_actual,
                                                      inverted_forecasts,
                                                      n_lag=look_back,
                                                      n_seq=forward_days,
                                                      metric="rmse")

    multistep_model_mape_metrics = evaluate_forecasts(inverted_actual,
                                                      inverted_forecasts,
                                                      n_lag=look_back,
                                                      n_seq=forward_days,
                                                      metric="mape")

    prediction_start_dates = dataset_close[num_train_start_points:]["Date"].values
    print(prediction_start_dates)
    print(type(prediction_start_dates))
    prediction_start_dates = cupy.reshape(prediction_start_dates, (prediction_start_dates.shape[0], 1))#.astype(cupy.float32)
    print(prediction_start_dates)
    print(type(prediction_start_dates))
    price_on_start_dates = dataset_close[num_train_start_points:]["Close"].values
    price_on_start_dates = np.reshape(price_on_start_dates, (price_on_start_dates.shape[0], 1)).astype(np.float64)
    actual_price_start_date_plus_five = dataset_close["Close"].values[num_train_start_points+forward_days:]
    forecast_shift = len(inverted_forecasts) - actual_price_start_date_plus_five.shape[0]
    for j in range(forecast_shift):
        print(actual_price_start_date_plus_five)
        X1 = ["nan"]
        #actual_price_start_date_plus_five = tuple(actual_price_start_date_plus_five)
        #actual_price_start_date_plus_five = cupy.concatenate([actual_price_start_date_plus_five, X1], axis=None)
        actual_price_start_date_plus_five += int(10)
        print("-------------------------------------------------------")
        print(actual_price_start_date_plus_five)
    actual_price_start_date_plus_five = np.reshape(actual_price_start_date_plus_five, (actual_price_start_date_plus_five.shape[0], 1))
    assert(actual_price_start_date_plus_five.shape[0] == len(inverted_forecasts))
    inverted_forecasts_as_np = np.array(inverted_forecasts)
    inverted_forecasts_with_dates = np.concatenate((prediction_start_dates, 
                                                    price_on_start_dates,
                                                    actual_price_start_date_plus_five,
                                                    inverted_forecasts_as_np), 
                                                    axis=1)
    df_forecast_five_day = cudf.DataFrame(data=inverted_forecasts_with_dates,
                                        columns=["Date",
                                                 "Actual_close",
                                                 "Actual_close_plus_5",
                                                 "Pred_close_plus_1",
                                                 "Pred_close_plus_2",
                                                 "Pred_close_plus_3",
                                                 "Pred_close_plus_4",
                                                 "Pred_close_plus_5"])

    df_forecast_five_day["Date"] = cudf.to_datetime(df_forecast_five_day["Date"], unit="ns")
    numeric_columns = [q for q in df_forecast_five_day.columns.tolist() if q != "Date"]
    df_forecast_five_day = df_forecast_five_day.round(decimals=2)
    ud_array = []
    up_array = []
    dn_array = []
    for index, row in df_forecast_five_day.iterrows():
        up   = ((row["Actual_close_plus_5"] - row["Actual_close"] >= 0) and ((row["Pred_close_plus_5"] - row["Actual_close"] >= 0)))
        down = ((row["Actual_close_plus_5"] - row["Actual_close"] <= 0) and ((row["Pred_close_plus_5"] - row["Actual_close"] <= 0)))
        ud = int(up or down)
        ud_array.append(ud)
        if up:
            up_array.append(up)
        elif down:
            dn_array.append(down)
    ud_array_as_np = np.array(ud_array)
    ud_array_as_np = np.reshape(ud_array_as_np, (ud_array_as_np.shape[0], 1)).astype(np.int32)
    df_forecast_five_day["ud"] = ud_array_as_np
    batting_average = float(df_forecast_five_day[df_forecast_five_day["ud"] == 1].shape[0] / df_forecast_five_day.shape[0])
    #print("last 6 rows------------------------")
    #print(df_forecast_five_day.tail(6))
    result = df_forecast_five_day.tail(6)

    t3 = time.time()
    print("time with numba :", t3-t2)
    
    
    return result 
Example #10
def run_everything():
    # ### 2.1 Prepare weather data
    # First, we will download the weather data.

    # In[2]:

    t_start = timer()

    filename = 'data/weather2011-2012.csv'

    # cuDF DataFrames are a tabular data structure that resides on the GPU. We interface with cuDF DataFrames in the same way we interface with Pandas DataFrames that reside on the CPU - with a few deviations. Load the data from the CSV file into a cuDF DataFrame.

    # In[3]:

    t_file_start = timer()
    weather = cudf.read_csv(filename)
    t_file = timer() - t_file_start

    # #### 2.1.1 Inspecting a cuDF DataFrame
    #
    # There are several ways to inspect a cuDF DataFrame. The first method is to enter the cuDF DataFrame directly into the REPL. This shows us an overview of the DataFrame, including its type and metadata such as the number of rows or columns.

    # In[4]:

    # A second way to inspect a cuDF DataFrame is to wrap the object in a Python `print` call, e.g. `print(weather)`. This shows the rows and columns of the DataFrame with simple formatting.
    #
    # For very large DataFrames, we often want to see just the first few rows. We can use the `head` method of a cuDF DataFrame to view the first N rows.

    # In[5]:
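
    # Not part of the original notebook: a minimal sketch of the inspection
    # calls described above. In a notebook one could simply evaluate
    # `weather`; here the calls are commented out like the script's other
    # inspection prints.
    #print(weather)
    #print(weather.head())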

    # #### 2.1.2 Columns
    #
    # cuDF DataFrames store metadata such as information about columns or data types. We can access the columns of a cuDF DataFrame using the `.columns` attribute.

    # In[6]:
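
    # Not part of the original notebook: a minimal sketch of listing the
    # column names, commented out like the other inspection calls.
    #print(weather.columns)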

    # We can modify the columns of a cuDF DataFrame by modifying the `columns` attribute. We can do this by setting that attribute equal to a list of strings representing the new columns. Let's shorten the two longest column names!

    # In[7]:

    ### TODO rename the relative temperature column to RTemp, and the relative humidity to Humidity
    #weather.columns = ['Hour', 'Temperature', 'Relative Temperature', 'Rel. Humidity', 'Wind', 'Weather']
    weather.columns = [
        'Hour', 'Temperature', 'RTemp', 'Humidity', 'Wind', 'Weather'
    ]

    # #### 2.1.3 Series
    #
    # cuDF DataFrames are composed of rows and columns. Each column is represented using an object of type `Series`. For example, if we subset a cuDF DataFrame using just one column, we get back an object of type `cudf.dataframe.series.Series`.

    # In[8]:

    humidity = weather['Humidity']
    #print(type(humidity))
    #print(humidity)

    # We also see a column of values on the left-hand side with values 0, 1, 2, 3. These values represent the index of the Series.
    # The DataFrame and Series objects both have an index attribute that will be useful for joining tables and also for selecting data.
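
    # In[ ]:

    # Not part of the original notebook: a minimal sketch showing that the
    # Series index can be inspected directly, commented out like the other
    # inspection calls.
    #print(humidity.index)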

    # #### 2.1.4 Data Types
    #
    # We can also inspect the data types of the columns of a cuDF DataFrame using the `dtypes` attribute.

    # In[9]:

    #print(weather.dtypes)

    # We can modify the data types of the columns of a cuDF DataFrame by passing in a cuDF Series with a modified data type.

    # In[10]:

    weather['Humidity'] = weather['Humidity'].astype(np.float64)
    #print(weather.dtypes)

    # The 'Weather' column provides a description of the weather conditions. We should mark it as a categorical column.

    # In[11]:

    weather['Weather'] = weather['Weather'].astype('category')

    # After this step the numerical category codes can be accessed using the `.cat.codes` attribute of the column. We will not actually need the category labels, so we simply replace the 'Weather' column with the category codes.

    # In[12]:

    weather['Weather'] = weather['Weather'].cat.codes

    # The data type of the 'Hour' column is `object`, which means a string. Let's convert it to a datetime value! This cannot be done with the `astype` method; you should use the [cudf.to_datetime](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.to_datetime) function!

    # In[13]:

    ### TODO convert the 'Hour' column from string to datetime
    weather['Hour'] = cudf.to_datetime(weather['Hour'])

    # #### 2.1.2 Prepare features
    # ##### Operations with cudf Series
    # We can perform mathematical operations on the Series data type. We will scale the Humidity and Temperature variables so that they lie in the [0, 1] range (some ML algorithms work better if the input data is scaled this way).

    # In[14]:

    weather['Humidity'] = weather['Humidity'] / 100.0

    # We will scale the temperature using the following formula T = (T - Tmin) / (Tmax - Tmin). First we select the min and max values.

    # In[15]:

    T = weather['Temperature']

    # Select the minimum temperature
    Tmin = T.min()

    ### TODO select the maximum temperature (1 line of code)
    Tmax = T.max()

    #print(Tmin, Tmax)

    # We could simply use the Tmin and Tmax values and apply the above formula on the series.
    #
    # ##### User defined functions (UDF)
    # We can write custom functions to operate on the data. When cuDF executes a UDF, it gets just-in-time (JIT) compiled into a CUDA kernel (either explicitly or implicitly) and is run on the GPU. Let's write a function that scales the temperature!

    # In[16]:

    def scale_temp(T):
        # Note that the Tmin and Tmax variables are stored during compilation time and remain constant afterwards
        T = (T - Tmin) / (Tmax - Tmin)
        return T

    # The applymap function will call scale_temp on every element of the Series

    # In[17]:

    weather['Temperature'] = weather['Temperature'].applymap(scale_temp)

    # Let's do the same min-max scaling for the wind data

    # In[18]:

    ### TODO calculate the minimum and maximum values of the 'Wind' column (2 lines of code)
    Wmin = weather['Wind'].min()
    Wmax = weather['Wind'].max()

    #print(Wmin, Wmax)

    ### TODO define a scale_wind function and apply it on the Wind column (~ 2-3 lines of code)
    def scale_wind(w):
        return (w - Wmin) / (Wmax - Wmin)

    ### TODO apply the scale_wind function on the 'Wind' column
    weather['Wind'] = weather['Wind'].applymap(scale_wind)

    # Let's inspect the table: the Temperature, Wind and Humidity columns should now have values in the [0, 1] range.

    # In[19]:

    weather.describe()

    # ##### Dropping Columns
    #
    # The relative temperature column is correlated with the temperature, so it will not give the ML model much extra information. We want to remove this column from our `DataFrame`. We can do so using the `drop_column` method. Note that this method removes a column in-place - meaning that the `DataFrame` we act on will be modified.

    # In[20]:

    weather.drop_column('RTemp')

    # If we want to remove a column without modifying the original DataFrame, we can use the `drop` method. This method will return a new DataFrame without that column (or columns).
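
    # In[ ]:

    # Not part of the original notebook: a minimal sketch of `drop`, which
    # returns a new DataFrame and leaves `weather` unchanged; 'Wind' is just
    # an example column here. Commented out like the other illustrative calls.
    #weather_without_wind = weather.drop('Wind')
    #print(weather_without_wind.columns)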

    # ##### Index
    #
    # Like `Series` objects, each `DataFrame` has an index attribute.

    # In[21]:
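
    # Not part of the original notebook: a minimal sketch of inspecting the
    # DataFrame index (a RangeIndex by default), commented out like the
    # other inspection calls.
    #print(weather.index)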

    # We can use the index values to subset the `DataFrame`. Let's use this to plot the first 48 values. Before plotting we have to transfer the data from GPU memory to system memory. We use the `to_array` method to return a copy of the data as a numpy array.

    # In[22]:

    selection = weather[weather.index < 48]
    #plt.plot(selection['Hour'].to_array(), selection['Temperature'].to_array())
    #plt.xlabel('Hour')
    #plt.ylabel('Temperature [C]')

    # We can also change the index. Our dataset has one entry for each hour, so one could set the 'Hour' column as the index by calling
    # ```
    # weather = weather.set_index('Hour')
    # ```
    #
    # We do not perform this change now.

    # In[ ]:

    #weather = weather.set_index('Hour')

    # ### 2.2 Prepare bike sharing data
    # We start by downloading the data

    # In[24]:

    files = [
        'data/2011-capitalbikeshare-tripdata.csv',
        'data/2012Q1-capitalbikeshare-tripdata.csv',
        'data/2012Q2-capitalbikeshare-tripdata.csv',
        'data/2012Q3-capitalbikeshare-tripdata.csv',
        'data/2012Q4-capitalbikeshare-tripdata.csv'
    ]

    # Let's read the first file to get an idea of the dataset

    # In[25]:

    #cudf.read_csv(files[0])

    # We are only interested in the events when a bicycle was rented. Let us read the first column from all files by specifying the `usecols` argument to [read_csv](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.io.csv.read_csv). We can use the `parse_dates` argument to parse the date string into a datetime variable, or the [to_datetime](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.to_datetime) function that we have used for the weather dataset. After all the tables are read we will concatenate them.
    #
    # Note: one has to specify a list of columns [ column1, column2 ] for the `usecols` argument.

    # In[26]:

    def read_bike_data(files):
        # Reads a list of files and concatenates them
        tables = []
        for filename in files:
            ### TODO read column 1 ('Start date') from the CSV file, and convert it to datetime format
            ### (1-2 lines of code)
            tmp_df = cudf.read_csv(filename, usecols=[1])

            ### END TODO
            tables.append(tmp_df)

        merged_df = cudf.concat(tables, ignore_index=True)

        # # Sanity checks
        # if merged_df.columns != ['Start date']:
        #     raise ValueError("Error: incorrect set of columns read")
        # if merged_df['Start date'].dtype != 'datetime64[ns]':
        #     raise TypeError("Start date should be converted to datetime type")

        return merged_df

    # In[27]:

    t_file_start = timer()
    bikes_raw = read_bike_data(files)
    t_file += timer() - t_file_start

    bikes_raw['Start date'] = cudf.to_datetime(bikes_raw['Start date'])

    # We want to count the number of rental events in every hour. We will define a new feature where we remove the minutes and seconds part of the timestamp. Since pandas has a convenient `floor` function for this, we will convert the column to a pandas Series, transform it with the floor operation, and then put it back on the GPU.

    # In[28]:

    bikes_raw['Hour'] = bikes_raw['Start date'].to_pandas().dt.floor('h')

    # We will aggregate the number of bicycle rental events for each hour. We use the [groupby](https://docs.rapids.ai/api/cudf/nightly/api.html#groupby) function.

    # In[44]:

    bikes = bikes_raw.groupby('Hour').agg('count')
    bikes.columns = ['cnt']
    bikes.head(5)

    # In[32]:

    # bikes_raw_pd = bikes_raw.to_pandas()

    # In[45]:

    #bikes_pd = bikes_raw_pd.groupby('Hour').agg('count')
    #bikes_pd.columns = ['cnt']
    #bikes_pd.head(5)

    # Let's add a column to the new dataset: the date without the time of the day. We can derive that similarly to the 'Hour' feature above. After the groupby operation, 'Hour' became the index of the dataset, so we will apply the `floor` operation to the index.

    # In[64]:

    bikes['date'] = bikes.index.to_pandas().floor('D')

    # It will be useful to define a set of additional features: hour of the day, day of month, month and year (see https://docs.rapids.ai/api/cudf/nightly/api.html#datetimeindex)

    # In[53]:

    bikes['hr'] = bikes.index.hour

    ### TODO add year and month features (~ 2 lines of code)
    bikes['year'] = bikes.index.year
    bikes['month'] = bikes.index.month

    # #### Visualize data
    # It is good practice to visualize the data. We will have to use the to_array() method to convert the cuDF Series objects to numpy arrays that can be plotted.

    # In[56]:

    #plt.plot(bikes.index.to_array(), bikes['cnt'].to_array())

    # It is hard to see much apart from the global trend. Let's have a look at how the 'cnt' variable behaves as a function of the 'month' and 'hr' features. We will use [boxplot](https://seaborn.pydata.org/generated/seaborn.boxplot.html) from the Seaborn package.

    # In[57]:

    #fig, axes = plt.subplots(nrows=1,ncols=2)
    #fig.set_size_inches(12, 5)
    #sns.boxplot(data=bikes.to_pandas(), y="cnt",x="month",orient="v",ax=axes[0])
    #sns.boxplot(data=bikes.to_pandas(), y="cnt",x="hr",orient="v",ax=axes[1])
    #axes[0].set(xlabel='Months', ylabel='Count',title="Box Plot On Count Across months")
    #axes[1].set(xlabel='Hour Of The Day', ylabel='Count',title="Box Plot On Count Across Hour Of The Day")
    #plt.show()

    # #### 3.2.1 Combine weather data with bike rental data

    # In[65]:

    gdf_bw = bikes.merge(weather,
                         left_index=True,
                         right_on='Hour',
                         how='inner')

    # inspect the merged table
    #gdf_bw

    # We can see that the data is not sorted after the merge; use the [sort_values](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.core.dataframe.DataFrame.sort_values) method to sort it.

    # In[66]:

    ### TODO sort the table according to the index (1 line of code)
    gdf_bw = gdf_bw.sort_values(by='Hour')

    # Inspect the sorted table
    #gdf_bw

    # ### 3.3 Add working day feature
    #
    # Apart from the weather, an important factor that influences people's daily activities is whether it is a working day or not. In this section we will create a working day feature. First we add the weekday as a new feature column.
    # We can use the [weekday](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.core.series.DatetimeProperties.weekday) attribute of the [datetime](https://docs.rapids.ai/api/cudf/nightly/api.html#datetimeindex) column.

    # In[67]:

    gdf_bw['Weekday'] = gdf_bw['date'].dt.weekday

    # Next, create a table with all the holidays in Washington DC in 2011-2012

    # In[68]:

    holidays = cudf.DataFrame({
        'date': [
            '2011-01-17', '2011-02-21', '2011-04-15', '2011-05-30',
            '2011-07-04', '2011-09-05', '2011-11-11', '2011-11-24',
            '2011-12-26', '2012-01-02', '2012-01-16', '2012-02-20',
            '2012-04-16', '2012-05-28', '2012-07-04', '2012-09-03',
            '2012-11-12', '2012-11-22', '2012-12-25'
        ],
        'Description': [
            "Martin Luther King Jr. Day", "Washington's Birthday",
            "Emancipation Day", "Memorial Day", "Independence Day",
            "Labor Day", "Veterans Day", "Thanksgiving", "Christmas Day",
            "New Year's Day", "Martin Luther King Jr. Day",
            "Washington's Birthday", "Emancipation Day", "Memorial Day",
            "Independence Day", "Labor Day", "Veterans Day", "Thanksgiving",
            "Christmas Day"
        ]
    })

    # Print the dataframe
    #holidays

    # We convert the date from string to datetime type, and drop the description column. Additionally we add a new column marked 'Holiday'. This will be useful to mark the holidays after we merge the tables.

    # In[69]:

    holidays['date'] = cudf.to_datetime(holidays['date'])
    holidays.drop_column('Description')
    holidays['Holiday'] = 1
    #holidays

    # Now we are ready to merge the tables.

    # In[76]:

    ### TODO merge the tables on the column 'date', using a left merge
    gdf = gdf_bw.merge(holidays, on='date', how='left')

    # inspect the result
    #gdf

    # We reset the index to 'Hour' and sort the table accordingly. Notice that most of the rows in the 'Holiday' column are filled with `<NA>`; only the dates that appeared in the holiday table are filled with 1. We shall fill the empty fields with zero.

    # In[77]:

    gdf = gdf.set_index('Hour')
    gdf = gdf.sort_index()

    ### TODO fill empty holiday values with zero
    gdf['Holiday'] = gdf['Holiday'].fillna(0)

    #gdf

    # Next, we create a workingday feature. One could do that simply with the following operation.
    # ```
    # gdf['Workingday'] = (gdf['Weekday'] < 5) & (gdf['Holiday']!=1)
    # ```
    # But we could do it with user defined functions too. Previously we have only used UDFs to process elements of a Series. Now we will process rows of a DataFrame and
    # combine the 'Weekday' and 'Holiday' columns to calculate the new feature 'Workingday'.

    # In[78]:

    def workday_kernel(Weekday, Holiday, Workingday):
        for i, (w, h) in enumerate(zip(Weekday, Holiday)):
            Workingday[i] = w < 5 and h != 1

    # In[79]:

    gdf = gdf.apply_rows(workday_kernel,
                         incols=['Weekday', 'Holiday'],
                         outcols=dict(Workingday=np.float64),
                         kwargs=dict())

    # More on user defined functions in our [blog](https://medium.com/rapids-ai/user-defined-functions-in-rapids-cudf-2d7c3fc2728d) and in the [documentation](https://docs.rapids.ai/api/cudf/nightly/guide-to-udfs.html).
    #
    # https://numba.pydata.org/numba-doc/dev/reference/pysupported.html

    # After this step we will no longer need the 'Holiday' and 'date' columns, so we can drop them

    # In[80]:

    gdf = gdf.drop(['Holiday', 'date'])

    # ### 2.4 One-hot encoding
    #
    # We now have all the data in a single table, but we still want to change its encoding. We're going to create one-hot encoded variables, also known as dummy variables, for each of the time variables as well as the weather situation.
    #
    #
    # A summary from https://machinelearningmastery.com/why-one-hot-encode-data-in-machine-learning/:
    #
    # "The integer values have a natural ordered relationship between each other and machine learning algorithms may be able to understand and harness this relationship.
    # For categorical variables where no such ordinal relationship exists, the integer encoding is not enough.
    #
    # In fact, using this encoding and allowing the model to assume a natural ordering between categories may result in poor performance or unexpected results (predictions halfway between categories).
    #
    # In this case, a one-hot encoding can be applied to the integer representation. This is where the integer encoded variable is removed and a new binary variable is added for each unique integer value.
    # "
    #
    # We start by one-hot encoding the 'Weather' column using the [one_hot_encoding](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.core.dataframe.DataFrame.one_hot_encoding) method from cuDF DataFrame. This is very similar to the [get_dummies](https://docs.rapids.ai/api/cudf/nightly/api.html#cudf.core.reshape.get_dummies) function (which might be more familiar to Pandas users), but one_hot_encoding works on a single input column and performs the operation in place.

    # In[81]:

    codes = gdf['Weather'].unique()
    gdf = gdf.one_hot_encoding('Weather', 'Weather_dummy', codes)
    # Inspect the results
    #gdf.head(3)

    # We're going to drop the original variable as well as one of the new dummy variables so we don't create collinearity (more about this problem [here](https://towardsdatascience.com/one-hot-encoding-multicollinearity-and-the-dummy-variable-trap-b5840be3c41a)).

    # In[82]:

    gdf = gdf.drop(['Weather', 'Weather_dummy_1'])

    # We create a copy of the dataset. It will make it easier to start over in case something goes wrong during the next exercise.

    # In[83]:

    gdf_backup = gdf.copy()

    # In[85]:

    dummies_list = ['month', 'hr', 'Weekday']

    gdf = gdf_backup.copy()

    for item in dummies_list:
        ### Todo implement one-hot encoding for item
        codes = gdf[item].unique()
        gdf = gdf.one_hot_encoding(item, item + '_dummy', codes)
        gdf = gdf.drop('{}_dummy_1'.format(item))
        gdf = gdf.drop(item)  # drop the original item

    gdf['year'] = gdf['year'] - 2011
    # gdf.drop_column('year')  # will be needed later for the train-test split

    # ### 2.5 Save the prepared dataset

    # In[91]:

    t_file_start = timer()
    gdf.to_csv('data/bike_sharing.csv')
    t_file += timer() - t_file_start

    # ## 3. Predict bike rentals with cuML
    #
    # cuML is a GPU accelerated machine learning library. cuML's Python API mirrors the [Scikit-Learn](https://scikit-learn.org/stable/) API.
    #
    # cuML currently requires all data to be of the same type, so the loop below converts all values to floats

    # In[92]:

    t_cudf_stop = timer()

    # In[93]:

    t_cuml_start = t_cudf_stop

    for col in gdf.columns:
        gdf[col] = gdf[col].astype('float64')

    # ### 3.1 Prepare training and test data
    # It is customary to denote the input feature matrix with X, and the target that we want to predict with y. We separate the target column 'cnt' from the rest of the table.

    # In[94]:

    y = gdf['cnt']
    X = gdf.drop('cnt')

    # Let's split the data randomly into a train and a test set

    # In[95]:

    X_train, X_test, y_train, y_test = cuml.preprocessing.model_selection.train_test_split(
        X, y)

    #test = gdf.query('yr == 1') #.drop(dummies_list)
    #train = gdf.query('yr == 0') #.drop(dummies_list)

    # ### 3.2 Linear regression

    # In[111]:

    reg = cuml.LinearRegression()
    reg.fit(X_train, y_train)

    # In[129]:

    #X_train_np = X_train.as_matrix() #to_pandas().to_numpy()
    #y_train_np = y_train.to_array()
    #X_test_np = X_test.as_matrix() #to_pandas().to_numpy()
    #y_test_np = y_test.to_array()

    # In[100]:

    #import sklearn

    # In[109]:

    #reg_skl = sklearn.linear_model.LinearRegression()
    #reg_skl.fit(X_train_np, y_train_np)

    # We can make predictions with the trained model

    # In[ ]:

    y_hat = reg.predict(X_test)

    # We can visualize how well the model works. Let's plot data for May 2012:

    # In[112]:
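
    # Not part of the original notebook: a hedged sketch of the plot
    # described above. It assumes matplotlib is available as `plt` (as in
    # the commented plots earlier) and simply compares predictions with the
    # actual counts on the test set rather than isolating May 2012. Kept
    # commented out like the other plots in this script.
    #plt.plot(y_test.to_array(), label='actual')
    #plt.plot(y_hat.to_array(), label='predicted')
    #plt.legend()
    #plt.show()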

    # In[114]:

    train_score = reg.score(X_train, y_train)
    ### TODO calculate test score (the score on X_test, ~ 1 line of code)
    test_score = reg.score(X_test, y_test)

    #print('train score', train_score)
    #print('test score', test_score)

    # ### 3.3  Save and load the trained model
    # We can pickle any cuML model

    # In[115]:

    t_file_start = timer()
    pickle_file = 'my_model.pickle'

    with open(pickle_file, 'wb') as pf:
        pickle.dump(reg, pf)

    # Load the saved model

    # In[116]:

    with open(pickle_file, 'rb') as pf:
        loaded_model = pickle.load(pf)

    t_file_cuml = timer() - t_file_start

    #print('Loaded model   score', loaded_model.score(X_test, y_test))
    #print('Original model score', reg.score(X_test, y_test))

    # ### 3.4 Ridge regression with hyperparameter tuning
    # We're going to do a small hyperparameter search for alpha, checking 100 different values. This is fast to do with RAPIDS. Also notice that we are appending the results of each Ridge model onto the dictionary containing our earlier results, so we can more easily see which model is the best at the end.

    # In[117]:

    output = {'score_OLS': test_score}

    for alpha in np.arange(0.01, 1, 0.01):  #alpha value has to be positive
        ridge = cuml.Ridge(alpha=alpha, fit_intercept=True)
        ### TODO fit the model and calculate the test score (2 lines of code)
        ridge.fit(X_train, y_train)
        score = ridge.score(X_test, y_test)
    ### END EXERCISE ###
        output['score_RIDGE_{}'.format(alpha)] = score

    # Here we see that our regularized model does better than the rest, including OLS with all the variables.

    # In[118]:

    #print('Max score: {}'.format(max(output, key=output.get)))

    # ### 3.5 Additional cuML models (Optional)
    # #### 3.5.1 Support vector regression

    # In[127]:

    # reg = cuml.svm.SVR(kernel='rbf', gamma=0.1, C=100, epsilon=0.1)
    # reg.fit(X_train, y_train)
    # reg.score(X_train, y_train)
    # reg.score(X_test, y_test)

    # In[130]:

    #reg = sklearn.svm.SVR(kernel='rbf', gamma=0.1, C=100, epsilon=0.1)
    #reg.fit(X_train_np, y_train_np)
    #reg.score(X_train_np, y_train_np)
    #reg.score(X_test_np, y_test_np)

    # #### 3.5.2 KNN Regression

    # In[134]:

    knn = cuml.neighbors.KNeighborsRegressor(n_neighbors=5)
    knn.fit(X_train, y_train, convert_dtype=True)
    pred = knn.predict(X_test)
    knn.score(X_test, y_test)

    #knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=5)
    #knn.fit(X_train_np, y_train_np,)
    #pred = knn.predict(X_test_np)
    #knn.score(X_test_np, y_test_np)

    t_stop = timer()
    t_cudf = t_cudf_stop - t_start
    t_cuml = t_stop - t_cuml_start  #- t_file_cuml
    return t_cudf, t_cuml, t_file, t_file_cuml
Example #11
def test_tf4rec():
    inputs = {
        "user_session": np.random.randint(1, 10000, NUM_ROWS),
        "product_id": np.random.randint(1, 51996, NUM_ROWS),
        "category_id": np.random.randint(0, 332, NUM_ROWS),
        "event_time_ts": np.random.randint(1570373000, 1670373390, NUM_ROWS),
        "prod_first_event_time_ts": np.random.randint(1570373000, 1570373382, NUM_ROWS),
        "price": np.random.uniform(0, 2750, NUM_ROWS),
    }
    df = nvt.dispatch._make_df(inputs)

    # categorify features

    cat_feats = (["user_session", "product_id", "category_id"] >>
                 nvt.ops.Categorify() >> nvt.ops.LambdaOp(lambda col: col + 1))

    # create time features
    sessionTs = ["event_time_ts"]

    sessionTime = (
        sessionTs >> nvt.ops.LambdaOp(lambda col: to_datetime(col, unit="s"))
        >> nvt.ops.Rename(name="event_time_dt"))

    sessionTime_weekday = (
        sessionTime >> nvt.ops.LambdaOp(lambda col: col.dt.weekday) >>
        nvt.ops.Rename(name="et_dayofweek"))

    def get_cycled_feature_value_sin(col, max_value):
        value_scaled = (col + 0.000001) / max_value
        value_sin = np.sin(2 * np.pi * value_scaled)
        return value_sin

    def get_cycled_feature_value_cos(col, max_value):
        value_scaled = (col + 0.000001) / max_value
        value_cos = np.cos(2 * np.pi * value_scaled)
        return value_cos

    weekday_sin = (sessionTime_weekday >>
                   (lambda col: get_cycled_feature_value_sin(col + 1, 7)) >>
                   nvt.ops.Rename(name="et_dayofweek_sin"))
    weekday_cos = (sessionTime_weekday >>
                   (lambda col: get_cycled_feature_value_cos(col + 1, 7)) >>
                   nvt.ops.Rename(name="et_dayofweek_cos"))
    from nvtabular.ops import Operator

    # custom op for item recency
    class ItemRecency(Operator):
        def transform(self, columns, gdf):
            for column in columns.names:
                col = gdf[column]
                item_first_timestamp = gdf["prod_first_event_time_ts"]
                delta_days = (col - item_first_timestamp) / (60 * 60 * 24)
                gdf[column + "_age_days"] = delta_days * (delta_days >= 0)
            return gdf

        def output_column_names(self, columns):
            return ColumnSelector(
                [column + "_age_days" for column in columns.names])

        def dependencies(self):
            return ["prod_first_event_time_ts"]

    recency_features = ["event_time_ts"] >> ItemRecency()
    recency_features_norm = (
        recency_features >> nvt.ops.LogOp() >> nvt.ops.Normalize() >>
        nvt.ops.Rename(name="product_recency_days_log_norm"))

    time_features = (sessionTime + sessionTime_weekday + weekday_sin +
                     weekday_cos + recency_features_norm)

    # Smoothing price long-tailed distribution
    price_log = (["price"] >> nvt.ops.LogOp() >> nvt.ops.Normalize() >>
                 nvt.ops.Rename(name="price_log_norm"))

    # Relative Price to the average price for the category_id
    def relative_price_to_avg_categ(col, gdf):
        epsilon = 1e-5
        col = ((gdf["price"] - col) / (col + epsilon)) * (col > 0).astype(int)
        return col

    avg_category_id_pr = (["category_id"] >> nvt.ops.JoinGroupby(
        cont_cols=["price"], stats=["mean"]) >>
                          nvt.ops.Rename(name="avg_category_id_price"))
    relative_price_to_avg_category = (avg_category_id_pr >> nvt.ops.LambdaOp(
        relative_price_to_avg_categ, dependency=["price"]) >> nvt.ops.Rename(
            name="relative_price_to_avg_categ_id"))

    groupby_feats = (["event_time_ts"] + cat_feats + time_features +
                     price_log + relative_price_to_avg_category)

    # Define Groupby Workflow
    groupby_features = groupby_feats >> nvt.ops.Groupby(
        groupby_cols=["user_session"],
        sort_cols=["event_time_ts"],
        aggs={
            "product_id": ["list", "count"],
            "category_id": ["list"],
            "event_time_dt": ["first"],
            "et_dayofweek_sin": ["list"],
            "et_dayofweek_cos": ["list"],
            "price_log_norm": ["list"],
            "relative_price_to_avg_categ_id": ["list"],
            "product_recency_days_log_norm": ["list"],
        },
        name_sep="-",
    )

    SESSIONS_MAX_LENGTH = 20
    MINIMUM_SESSION_LENGTH = 2

    groupby_features_nonlist = groupby_features["user_session",
                                                "product_id-count"]

    groupby_features_list = groupby_features[
        "price_log_norm-list", "product_recency_days_log_norm-list",
        "et_dayofweek_sin-list", "et_dayofweek_cos-list", "product_id-list",
        "category_id-list", "relative_price_to_avg_categ_id-list", ]

    groupby_features_trim = (groupby_features_list >> nvt.ops.ListSlice(
        0, SESSIONS_MAX_LENGTH) >> nvt.ops.Rename(postfix="_seq"))

    # calculate session day index based on 'event_time_dt-first' column
    day_index = ((groupby_features["event_time_dt-first"]) >>
                 nvt.ops.LambdaOp(lambda col: (col - col.min()).dt.days + 1) >>
                 nvt.ops.Rename(f=lambda col: "day_index"))

    selected_features = groupby_features_nonlist + groupby_features_trim + day_index

    filtered_sessions = selected_features >> nvt.ops.Filter(
        f=lambda df: df["product_id-count"] >= MINIMUM_SESSION_LENGTH)

    dataset = nvt.Dataset(df)

    workflow = nvt.Workflow(filtered_sessions)
    workflow.fit(dataset)
    sessions_gdf = workflow.transform(dataset).to_ddf().compute()

    assert not sessions_gdf.isnull().any().all()