Example 1
def test_cv():
    df = pd.read_pickle(os.path.join(root, '..', 'data', 'ta', 'base1', 'AAPL.pkl'))
    assert isinstance(df, pd.DataFrame)
    npDates = df["date"].unique()
    df.set_index(["date"], drop=True, inplace=True)
    assert df.shape == df.loc[npDates.tolist()].shape
    cv = TimeSeriesSplit(n_splits=5)
    for (train, test) in cv.split(npDates):
        train_size = len(df.loc[npDates[train]])
        test_size = len(df.loc[npDates[test]])
    # after the loop, train/test refer to the final fold, whose indices
    # together cover every date, so the row counts must sum to len(df)
    assert len(df) == train_size + test_size
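A minimal self-contained sketch of the same date-level splitting idea on synthetic data (the names below are illustrative, not taken from the test above):

import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

rng = np.random.default_rng(0)
dates = pd.date_range("2020-01-01", periods=12, freq="D")
frame = pd.DataFrame({
    "date": np.repeat(dates, 3),  # three rows per date
    "value": rng.normal(size=36),
}).set_index("date")

unique_dates = frame.index.unique()
for train, test in TimeSeriesSplit(n_splits=5).split(unique_dates):
    train_rows = frame.loc[unique_dates[train]]
    test_rows = frame.loc[unique_dates[test]]
# the final fold's train and test dates together cover the whole frame
assert len(train_rows) + len(test_rows) == len(frame)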
Example 2
 def test_cv(self):
     X, y = load_boston(return_X_y=True)
     X_train, _, y_train, _ = train_test_split(X, y, test_size=0.1, random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
            metrics='l1', verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
            metrics='l1', verbose_eval=False,
            callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, stratified=False, verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, metrics='l2', verbose_eval=False)
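Since TimeSeriesSplit.split() returns a one-shot generator, materialize it as a list when the folds must be inspected or reused; a short sketch assuming X_train as above:

folds = list(TimeSeriesSplit(n_splits=3).split(X_train))
for k, (trn, val) in enumerate(folds):
    print("fold %d: train=%d rows, valid=%d rows" % (k, len(trn), len(val)))
# the same list can then be passed to lgb.cv(..., folds=folds)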
Example 3
def test_time_series_cv():
    X = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]]

    # Should fail if there are more folds than samples
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next,
                         TimeSeriesSplit(n_splits=7).split(X))

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets
    splits = tscv.split(X[:-1])
    train, test = next(splits)
    assert_array_equal(train, [0, 1])
    assert_array_equal(test, [2, 3])

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2, 3])
    assert_array_equal(test, [4, 5])

    splits = TimeSeriesSplit(2).split(X)

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2])
    assert_array_equal(test, [3, 4])

    train, test = next(splits)
    assert_array_equal(train, [0, 1, 2, 3, 4])
    assert_array_equal(test, [5, 6])

    # Check get_n_splits returns the correct number of splits
    splits = TimeSeriesSplit(2).split(X)
    n_splits_actual = len(list(splits))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
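For reference, TimeSeriesSplit sizes each test fold as n_samples // (n_splits + 1), which is exactly what the assertions above exercise; a quick standalone check:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(7).reshape(-1, 1)
for train, test in TimeSeriesSplit(n_splits=2).split(X):
    print(train, test)
# with 7 samples and 2 splits, each test fold has 7 // 3 = 2 samples:
# [0 1 2] [3 4]
# [0 1 2 3 4] [5 6]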
Example 4
def timeSeriesSplit(cso = False):
    state = {0: 'NSW', 1: 'QLD', 2: 'SA', 3: 'TAS', 4: 'VIC'}
    year = {0: '2015', 1: '2016', 2: '2017'}
    
    df_nsw = pd.DataFrame()
    df_qld = pd.DataFrame()
    df_sa = pd.DataFrame()
    df_tas = pd.DataFrame()
    df_vic = pd.DataFrame()
    
    df = {'NSW': df_nsw, 'QLD': df_qld, 'SA': df_sa, 'TAS': df_tas, 'VIC': df_vic}
    
    df_nsw_test = pd.DataFrame()
    df_qld_test = pd.DataFrame()
    df_sa_test = pd.DataFrame()
    df_tas_test = pd.DataFrame()
    df_vic_test = pd.DataFrame()
    
    df_test = {'NSW': df_nsw_test, 'QLD': df_qld_test, 'SA': df_sa_test, 'TAS': df_tas_test, 'VIC': df_vic_test}
    
    for st in state.values():
        for ye in year.values():
            for mn in range(1, 13):
                dataset = pd.read_csv('./datasets/train/' + st + '/PRICE_AND_DEMAND_' + ye + str(mn).zfill(2) + '_' + st + '1.csv')
                df[st] = pd.concat([df[st], dataset.iloc[:, 1:3]])
        df[st] = df[st].set_index('SETTLEMENTDATE')
       
    for st in state.values():
        dataset = pd.read_csv('./datasets/test/' + st + '/PRICE_AND_DEMAND_201801_' + st + '1.csv')
        df_test[st] = pd.concat([df_test[st], dataset.iloc[:, 1:3]])
        df_test[st] = df_test[st].set_index('SETTLEMENTDATE')
       
    # numpy array
    list_hourly_load_NSW = np.array(df['NSW'])
    list_hourly_load_QLD = np.array(df['QLD'])
    list_hourly_load_SA = np.array(df['SA'])
    list_hourly_load_TAS = np.array(df['TAS'])
    list_hourly_load_VIC = np.array(df['VIC'])
       
    # the length of the sequence for predicting the future value
    sequence_length = 84
    x_size = 36
    hidden = 10
    y_size = 48
    
    # normalizing
    matrix_load_NSW = list_hourly_load_NSW / np.linalg.norm(list_hourly_load_NSW)
    matrix_load_QLD = list_hourly_load_QLD / np.linalg.norm(list_hourly_load_QLD)
    matrix_load_SA = list_hourly_load_SA / np.linalg.norm(list_hourly_load_SA)
    matrix_load_TAS = list_hourly_load_TAS / np.linalg.norm(list_hourly_load_TAS)
    matrix_load_VIC = list_hourly_load_VIC / np.linalg.norm(list_hourly_load_VIC)
    
    # trim each series so its length is an exact multiple of sequence_length
    # (floor division keeps the array intact when it already divides evenly)
    matrix_load_NSW = matrix_load_NSW[:(len(matrix_load_NSW) // sequence_length) * sequence_length]
    matrix_load_QLD = matrix_load_QLD[:(len(matrix_load_QLD) // sequence_length) * sequence_length]
    matrix_load_SA = matrix_load_SA[:(len(matrix_load_SA) // sequence_length) * sequence_length]
    matrix_load_TAS = matrix_load_TAS[:(len(matrix_load_TAS) // sequence_length) * sequence_length]
    matrix_load_VIC = matrix_load_VIC[:(len(matrix_load_VIC) // sequence_length) * sequence_length]
    
    matrix_load_NSW = matrix_load_NSW.reshape(-1, sequence_length)
    matrix_load_QLD = matrix_load_QLD.reshape(-1, sequence_length)
    matrix_load_SA = matrix_load_SA.reshape(-1, sequence_length)
    matrix_load_TAS = matrix_load_TAS.reshape(-1, sequence_length)
    matrix_load_VIC = matrix_load_VIC.reshape(-1, sequence_length)
    
    # shuffle the windowed samples; note that shuffling discards the temporal
    # ordering that the TimeSeriesSplit below nominally assumes
    np.random.shuffle(matrix_load_NSW)
    np.random.shuffle(matrix_load_QLD)
    np.random.shuffle(matrix_load_SA)
    np.random.shuffle(matrix_load_TAS)
    np.random.shuffle(matrix_load_VIC)
    
    # the training set
    X_NSW = matrix_load_NSW[:, :x_size]
    X_QLD = matrix_load_QLD[:, :x_size]
    X_SA = matrix_load_SA[:, :x_size]
    X_TAS = matrix_load_TAS[:, :x_size]
    X_VIC = matrix_load_VIC[:, :x_size]
    
    # the last y_size columns are the targets used to compute the mean-squared-error loss
    y_NSW = matrix_load_NSW[:, x_size:]
    y_QLD = matrix_load_QLD[:, x_size:]
    y_SA = matrix_load_SA[:, x_size:]
    y_TAS = matrix_load_TAS[:, x_size:]
    y_VIC = matrix_load_VIC[:, x_size:]
    
    tscv = TimeSeriesSplit(n_splits=5)
    
    X = {'NSW': X_NSW, 'QLD': X_QLD, 'SA': X_SA, 'TAS': X_TAS, 'VIC': X_VIC}
    y = {'NSW': y_NSW, 'QLD': y_QLD, 'SA': y_SA, 'TAS': y_TAS, 'VIC': y_VIC}
    
    for st in state.values():
        print("State: ", st)
        i = 1
        for train_index, test_index in tscv.split(X[st]):
            X_train, X_test = X[st][train_index], X[st][test_index]
            y_train, y_test = y[st][train_index], y[st][test_index]
            
            print("Train and validation from state ", st, " split ", i)
            net = nt.Network([x_size, hidden, y_size], nt.Activation.tanh, nt.QuadraticCost)
            if cso:
                fname = "kernelBiasTimeSeries" + st + ".npy"
                if not path.exists(fname):
                    print("Weights and biases initialization for state ",st, " in progress...")
                    randInt = np.random.randint(X_train.shape[0])
                    net.cso(100,X_train[randInt].reshape(x_size,1),y_train[randInt].reshape(y_size,1),
                                net.multiObjectiveFunction,-0.6,0.6,net.dim ,100)
                    net.set_weight_bias(np.array(net.get_Gbest()))
                    np.save(fname, np.array(net.get_Gbest()))
                net.set_weight_bias(np.load(fname))

            if cso:
                fname = "results_" + st + "_TS_" + str(i) + "CSO"
            else:
                fname = "results_" + st + "_TS_" + str(i) + "GD"
            num_epochs = 1500
            lmbda = 2
            
            evaluation_cost, eval_mape, eval_rmse, eval_mae, training_cost, training_mape, training_rmse, training_mae = net.SGD(
                    X_train.transpose(),y_train.transpose(), num_epochs, 
                    10, 0.01, 
                    X_test.transpose(), y_test.transpose(), 
                    lmbda, monitor_evaluation_cost = True,
                    monitor_evaluation_accuracy = True,
                    monitor_training_cost = True,
                    monitor_training_accuracy = True,
                    output2D = True)
            
            f = open(fname, "w")
            json.dump([evaluation_cost, eval_mape, eval_rmse, eval_mae, training_cost, training_mape, training_rmse, training_mae], f)
            f.close()
                
#            make_plots(fname, num_epochs,
#                       training_cost_xmin = 0,
#                       test_accuracy_xmin = 0,
#                       test_cost_xmin = 0, 
#                       training_accuracy_xmin = 0)
            i = i+1
    def run_xgb_model(self):
        import xgboost as xgb
        from xgboost import XGBRegressor

        X = self.df.drop('spx', axis=1).iloc[:-1, :]
        y = self.df.spx.shift(-1).dropna()

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.33,
                                                            random_state=1,
                                                            shuffle=False)

        #DM_train = xgb.DMatrix(data=X_train, label=y_train)
        #DM_test = xgb.DMatrix(data=X_test, label=y_test)

        xgbm = xgb.XGBRegressor()
        xgbm.fit(X_train, y_train)

        gbm_param_grid = {
            'learning_rate': [.01, .1, .5, .9],
            'n_estimators': [200, 300],
            'subsample': [0.3, 0.5, 0.9],
            'max_depth': [2, 3],
            'reg_lambda': [0]
        }

        # note: early stopping against (X_test, y_test) leaks test-set
        # information into training; a separate validation fold is safer
        fit_params = {
            "early_stopping_rounds": 25,
            "eval_metric": "rmse",
            "eval_set": [(X_train, y_train), (X_test, y_test)]
        }

        #evals_result = {}
        #eval_s = [(X_train, y_train), (X_test, y_test)]

        tscv = TimeSeriesSplit(n_splits=2)
        xgb_Gridcv = GridSearchCV(estimator=xgbm,
                                  param_grid=gbm_param_grid,
                                  cv=tscv,
                                  refit=True,
                                  verbose=0)

        xgb_Gridcv.fit(X_train, y_train, **fit_params)
        ypred = xgb_Gridcv.predict(X_test)

        print(xgb_Gridcv.score(X_train, y_train))
        print(xgb_Gridcv.score(X_test, y_test))

        results = xgb_Gridcv.best_estimator_.evals_result()
        epochs = len(results['validation_0']['rmse'])
        x_axis = range(0, epochs)
        fig, ax = plt.subplots()
        ax.plot(x_axis, results['validation_0']['rmse'], label='Train')
        ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
        ax.legend()
        plt.ylabel('RMSE')
        plt.title('XGBoost Regression Error')
        plt.show()

        print('best parameters', xgb_Gridcv.best_params_)
        print('Lowest RMSE', np.sqrt(np.abs(xgb_Gridcv.best_score_)))

        y_actual = pd.DataFrame(y_test)
        y_pred = pd.DataFrame(ypred)

        y_pred.index = y_actual.index
        pred = pd.concat([y_actual, y_pred], axis=1)
        pred.columns = ['actual', 'pred']
        #pred.sort_index(inplace=True)
        pred_vals = pred.shift(1)
        pred_vals_diff = pred_vals.pct_change()
        pred_vals_diff.index = pred_vals.index
        pred_vals_diff.columns = ['actual', 'pred']
        pred_vals_diff.dropna(inplace=True)
        return pred_vals_diff
#from sklearn.tree.tree import DecisionTreeClassifier
#from sklearn.tree.export import export_graphviz
#import mglearn
#import graphviz
# =============================================================================
# AIC to measure the forecasts
def aic(y, y_pred, k):
    resid = np.array([y - y_pred]).T
    rss = np.sum(resid**2)
    # for Gaussian errors: AIC = n * ln(RSS / n) + 2k
    AIC = 2 * k + len(y) * np.log(rss / len(y))
    return AIC


# F-test to compare restricted and unrestricted models
def F(y1, y1_pred, y2, y2_pred, p1, p2):
    resid1 = np.array([y1 - y1_pred]).T
    rss1 = np.sum(resid1**2)
    resid2 = np.array([y2 - y2_pred]).T
    rss2 = np.sum(resid2**2)
    F_stat = ((rss1 - rss2) / (p2 - p1) / (rss2 / (len(y2) - p2)))
    return F_stat
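
# A toy usage sketch of the two helpers above, on synthetic residuals
# (purely illustrative values):
y_toy = np.array([1.0, 2.0, 3.0, 4.0])
pred_restricted = y_toy + 0.1    # restricted model, p1 = 1 parameter
pred_full = y_toy + 0.05         # unrestricted model, p2 = 2 parameters

print("AIC restricted:  ", aic(y_toy, pred_restricted, k=1))
print("AIC unrestricted:", aic(y_toy, pred_full, k=2))
print("F-statistic:     ", F(y_toy, pred_restricted, y_toy, pred_full, 1, 2))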


# =============================================================================
tsplit = TimeSeriesSplit(n_splits=5, max_train_size=250)
tsplit2 = TimeSeriesSplit(n_splits=3)
pca = PCA(n_components=3, whiten=1, random_state=42)
scaler = StandardScaler()
scaler2 = StandardScaler()
# =============================================================================
Example 7
def hts(y, h = 1, nodes = [[2]], method='OLS', freq = 'D', transform = None, include_history = True, cap = None, capF = None, changepoints = None, \
        n_changepoints = 25, yearly_seasonality = 'auto', weekly_seasonality = 'auto', daily_seasonality = 'auto', holidays = None, seasonality_prior_scale = 10.0, \
        holidays_prior_scale = 10.0, changepoint_prior_scale = 0.05, mcmc_samples = 0, interval_width = 0.80, uncertainty_samples = 0, skipFitting = False, numThreads = 0):
    '''
    Parameters
    ----------------
     y - dataframe of time-series data, or if you want to skip fitting, a dictionary of prophet base forecast dataframes
               Layout:
                   0th Col - Time instances
                   1st Col - Total of TS
                   2nd Col - One of the children of the Total TS
                   3rd Col - The other child of the Total TS
                   ...
                   ... Rest of the 1st layer
                   ...
                   Xth Col - First Child of the 2nd Col
                   ...
                   ... All of the 2nd Col's Children
                   ...
                   X+Yth Col - First Child of the 3rd Col
                   ...
                   ..
                   .   And so on...
    
     h - number of step ahead forecasts to make (int)
    
     nodes - a list or list of lists of the number of child nodes at each level
     Ex. if the hierarchy is one total with two child nodes that comprise it, the nodes input would be [2]
     
     method - String  the type of hierarchical forecasting method that the user wants to use. 
                Options:
                "OLS" - optimal combination by Ordinary Least Squares (Default), 
                "WLSS" - optimal combination by Structurally Weighted Least Squares
                "WLSV" - optimal combination by Error Variance Weighted Least Squares
                "FP" - forecasted proportions (top-down)
                "PHA" - proportions of historical averages (top-down)
                "AHP" - average historical proportions (top-down)
                "BU" - bottom-up (simple addition)
                "cvSelect" - select which method is best for you based on 3-fold cross validation (longer run time)
     
     freq - (Time Frequency) input for the forecasting function of Prophet 
     
     transform - (None or "BoxCox") Do you want to transform your data before fitting the prophet function? If yes, type "BoxCox"
     
     include_history - (Boolean) input for the forecasting function of Prophet
                
     cap - (Dataframe or Constant) carrying capacity of the input time series.  If it is a dataframe, then
                                   the number of columns must equal len(y.columns) - 1
                                   
     capF - (Dataframe or Constant) carrying capacity of the future time series.  If it is a dataframe, then
                                    the number of columns must equal len(y.columns) - 1
     
     changepoints - (DataFrame or List) changepoints for the model to consider fitting. If it is a dataframe, then
                                        the number of columns must equal len(y.columns) - 1
     
     n_changepoints - (constant or list) changepoints for the model to consider fitting. If it is a list, then
                                         the number of items must equal len(y.columns) - 1
                                         
     skipFitting - (Boolean) if y is already a dictionary of dataframes, set this to True, and DO NOT run with method = "cvSelect" or transform = "BoxCox"
     
     numThreads - (int) number of threads you want to use when running cvSelect. Note: 14 has shown to decrease runtime by 10 percent 
                                 
     All other inputs - see Prophet
     
    Returns
    -----------------
     ynew - a dictionary of DataFrames with predictions, seasonalities and trends that can all be plotted
    
    '''

    # Function Definitions
    ##
    #  "Creating the summing matrix" funciton
    ##
    def SummingMat(nodes):
        '''
         This function creates a summing matrix for the bottom up and optimal combination approaches
         All the inputs are the same as above
         The output is a summing matrix, see Rob Hyndman's "Forecasting: principles and practice" Section 9.4
        '''
        numAtLev = list(map(sum, nodes))
        numLevs = len(numAtLev)
        top = np.ones(numAtLev[-1])  #Create top row, which is just all ones
        blMat = np.identity(
            numAtLev[-1])  #Create Identity Matrix for Bottom level Nodes
        finalMat = blMat
        ##
        # These two loops build the matrix from bottom to top
        ##
        for lev in range(numLevs - 1):
            summing = nodes[-(lev + 1)]
            count = 0
            a = 0
            num2sumInd = 0
            B = np.zeros([numAtLev[-1]])
            for num2sum in summing:
                num2sumInd += num2sum
                a = blMat[count:num2sumInd, :]
                count += num2sum
                if np.all(B == 0):
                    B = a.sum(axis=0)
                else:
                    B = np.vstack((B, a.sum(axis=0)))
            finalMat = np.vstack((B, finalMat))
            blMat = B
        ##
        # Append the Top array to the Matrix and then return it
        ##
        finalMat = np.vstack((top, finalMat))
        return finalMat
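    ##
    # Worked example (illustrative): for nodes=[[2]] (one total series with
    # two children), SummingMat returns
    #   [[1. 1.]    row 0: total  = child1 + child2
    #    [1. 0.]    row 1: child1
    #    [0. 1.]]   row 2: child2
    ##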

    ##
    # Error Handling
    ##
    if h < 1:
        sys.exit(
            'you must set h (number of step-ahead forecasts) to a positive number'
        )
    if method not in [
            'OLS', 'WLSS', 'WLSV', 'FP', 'PHA', 'AHP', 'BU', 'cvSelect'
    ]:
        sys.exit(
            "not a valid method input, must be one of the following: 'OLS','WLSS','WLSV','FP','PHA','AHP','BU','cvSelect'"
        )
    if len(nodes) < 1:
        sys.exit("nodes input should at least be of length 1")
    if cap is not None and not isinstance(cap, (int, float, pd.DataFrame)):
        sys.exit(
            "cap should be a constant (float or int) or a DataFrame, or not specified"
        )
    if capF is not None and not isinstance(capF, (int, float, pd.DataFrame)):
        sys.exit(
            "capF should be a constant (float or int) or a DataFrame, or not specified"
        )
    if not isinstance(y, dict):
        if sum(list(map(sum, nodes))) != len(y.columns) - 2:
            sys.exit(
                "The sum of the nodes list does not equal the number of columns - 2, dataframe should contain a time column in the 0th pos. Double check node input"
            )
        if isinstance(cap, pd.DataFrame):
            if len(cap.columns) != len(y.columns) - 1:
                sys.exit(
                    "If cap is a DataFrame, it should have a number of columns equal to the input Dataframe - 1"
                )
        if isinstance(capF, pd.DataFrame):
            if len(capF.columns) != len(y.columns) - 1:
                sys.exit(
                    "If capF is a DataFrame, it should have a number of columns equal to the input Dataframe - 1"
                )
    if cap is not None and method not in ["BU", "FP", "AHP", "PHA"]:
        print(
            "Consider using BU, FP, AHP, or PHA.  The other methods can create negatives which would cause problems for the log() function"
        )
    ##
    # Transform Variables
    ##
    if transform is not None:
        if transform == 'BoxCox':
            y2 = y.copy()
            import warnings
            warnings.simplefilter("error", RuntimeWarning)
            boxcoxT = [None] * (len(y.columns.tolist()) - 1)
            try:
                for column in range(len(y.columns.tolist()) - 1):
                    y2.iloc[:, column + 1], boxcoxT[column] = boxcox(
                        y2.iloc[:, column + 1])
                y = y2
            ##
            # Does a Natural Log Transform if scipy's boxcox can't deal
            ##
            except RuntimeWarning:
                print(
                    "It looks like scipy's boxcox function couldn't deal with your data. Proceeding with Natural Log Transform"
                )
                for column in range(len(y.columns.tolist()) - 1):
                    y.iloc[:, column + 1] = boxcox(y.iloc[:, column + 1],
                                                   lmbda=0)
                    boxcoxT[column] = 0
        else:
            print(
                "Nothing will be transformed because the input was not = to 'BoxCox'"
            )
    else:
        boxcoxT = None
    ##
    # Run specified approach
    ##
    if method == 'cvSelect':
        ##
        # Run all of the Methods and let 3 fold CV chose which is best for you
        ##
        methodList = ['WLSV', 'WLSS', 'OLS', 'FP', 'PHA', 'AHP', 'BU']
        sumMat = SummingMat(nodes)
        tscv = TimeSeriesSplit(n_splits=3)
        MASE1 = []
        MASE2 = []
        MASE3 = []
        MASE4 = []
        MASE5 = []
        MASE6 = []
        MASE7 = []
        ##
        # Split into train and test, using time series split, and predict the test set
        ##
        y1 = y.copy()
        if boxcoxT is not None:
            for column in range(len(y.columns.tolist()) - 1):
                y1.iloc[:, column + 1] = inv_boxcox(y1.iloc[:, column + 1],
                                                    boxcoxT[column])

        for trainIndex, testIndex in tscv.split(y.iloc[:, 0]):
            if numThreads != 0:
                pool = ThreadPool(numThreads)
                results = pool.starmap(fitForecast, zip([y.iloc[trainIndex, :]]*7, [len(testIndex)]*7, [sumMat]*7, [nodes]*7, methodList, [freq]*7, [include_history]*7, [cap]*7, [capF]*7, [changepoints]*7, [n_changepoints]*7, \
                                    [yearly_seasonality]*7, [weekly_seasonality]*7, [daily_seasonality]*7, [holidays]*7, [seasonality_prior_scale]*7, [holidays_prior_scale]*7,\
                                    [changepoint_prior_scale]*7, [mcmc_samples]*7, [interval_width]*7, [uncertainty_samples]*7,  [boxcoxT]*7, [skipFitting]*7))
                pool.close()
                pool.join()
                ynew1, ynew2, ynew3, ynew4, ynew5, ynew6, ynew7 = results
            else:
                ynew1, ynew2, ynew3, ynew4, ynew5, ynew6, ynew7 = [
                    fitForecast(y.iloc[trainIndex, :], len(testIndex), sumMat, nodes, m, freq,
                                include_history, cap, capF, changepoints, n_changepoints,
                                yearly_seasonality, weekly_seasonality, daily_seasonality,
                                holidays, seasonality_prior_scale, holidays_prior_scale,
                                changepoint_prior_scale, mcmc_samples, interval_width,
                                uncertainty_samples, boxcoxT, skipFitting)
                    for m in methodList
                ]
            for key in ynew1.keys():
                for MASE, ynew_i in zip(
                        [MASE1, MASE2, MASE3, MASE4, MASE5, MASE6, MASE7],
                        [ynew1, ynew2, ynew3, ynew4, ynew5, ynew6, ynew7]):
                    MASE.append(
                        np.mean(
                            abs(ynew_i[key].yhat[-len(testIndex):].values -
                                y1.iloc[testIndex, key + 1].values)))
        ##
        # If the method has the minimum Average MASE, use it on all of the data
        ##
        choices = [
            np.mean(MASE1),
            np.mean(MASE2),
            np.mean(MASE3),
            np.mean(MASE4),
            np.mean(MASE5),
            np.mean(MASE6),
            np.mean(MASE7)
        ]
        choice = methodList[choices.index(min(choices))]
        ynew = fitForecast(y, h, sumMat, nodes, choice, freq, include_history, cap, capF, changepoints, n_changepoints, \
                           yearly_seasonality, weekly_seasonality, daily_seasonality, holidays, seasonality_prior_scale, holidays_prior_scale,\
                           changepoint_prior_scale, mcmc_samples, interval_width, uncertainty_samples, boxcoxT, skipFitting)
        print(choice)

    else:
        if skipFitting == True:
            theDictionary = y
            i = 0
            for key in y.keys():
                if i == 0:
                    y = pd.DataFrame(theDictionary[key].ds)
                y[i] = theDictionary[key].yhat
                i += 1
        sumMat = SummingMat(nodes)
        ynew = fitForecast(y, h, sumMat, nodes, method, freq, include_history, cap, capF, changepoints, n_changepoints, \
                           yearly_seasonality, weekly_seasonality, daily_seasonality, holidays, seasonality_prior_scale, holidays_prior_scale,\
                           changepoint_prior_scale, mcmc_samples, interval_width, uncertainty_samples, boxcoxT, skipFitting)
    ##
    # Inverse boxcox the data
    ##
    if transform is not None:
        if transform == 'BoxCox':
            for column in range(len(y.columns.tolist()) - 1):
                y.iloc[:, column + 1] = inv_boxcox(y.iloc[:, column + 1],
                                                   boxcoxT[column])
    ##
    # Put the values back in the dictionary for skipFitting
    ##
    if skipFitting == True:
        i = 0
        for key in theDictionary.keys():
            for column in theDictionary[key].columns:
                if column == 'yhat':
                    continue
                ynew[key][column] = theDictionary[key][column]
    ##
    # Rename keys so that dictionary can be easily understood
    ##

    i = -2
    for column in y:
        i += 1
        if i == -1:
            continue
        else:
            ynew[column] = ynew.pop(i)

    return ynew
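
# An illustrative call of hts (a sketch: it assumes this module's
# dependencies, e.g. Prophet and the fitForecast helper, are importable;
# the toy frame and column names below are made up):
import pandas as pd

toy = pd.DataFrame({
    'ds': pd.date_range('2020-01-01', periods=60, freq='D'),
    'child_a': [float(i) for i in range(60)],
    'child_b': [1.0] * 60,
})
toy.insert(1, 'total', toy['child_a'] + toy['child_b'])  # column after time must be the total
forecasts = hts(toy, h=7, nodes=[[2]], method='BU', freq='D')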
df_complete = df_copy.drop(columns=drop_columns)
df_complete = fill_missing(df_complete)

# Num features to use
print("Nr Features:", df_until_now.shape[1])
nr_features = df_until_now.shape[1]
columns = df_until_now.columns

# To supervisioned
X, Y = to_supervised(df_until_now, timesteps, multisteps, nr_features)

print("Shape X:", X.shape)
print("Shape Y:", Y.shape)

# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits)
cvscores = list()
split_num = 1
current_mae = 100
best_model = ''
for train_index, test_index in tscv.split(X):
    print(10*'-' + ' Begin Time Series Split Nº' + str(split_num) + ' ' + 10*'-')
    # Get values from the time series split
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    
    # Create model
    model = build_model(timesteps, nr_features, multisteps)
    
    # Experiment the model
    lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='mae', factor=0.5, patience=patience, min_lr=0.00005)
Example 9
    def google_op_models(self):
        df = self.create_raw_dataset('GOOG')
        df = self.add_all_indicators(df, 'Close', 'High', 'Low', 'Volume')
        df = self.add_opinions(df)
        df['target'] = df['Adj Close']

        # use positional assignment to avoid chained-indexing pitfalls
        target_col = df.columns.get_loc('target')
        for x in range(len(df['Adj Close']) - 1):
            if df['Adj Close'].iloc[x] < df['Adj Close'].iloc[x + 1]:
                df.iloc[x, target_col] = 1
            else:
                df.iloc[x, target_col] = -1

        df = df[99:-1].reset_index().drop('index', axis=1)

        X = df[[
            'macd_op', 'macd_op2', 'macd_op3', 'roc_op', 'stoch_op', 'rsi_op',
            'wr_op', 'cci_op', 'adi_op'
        ]]
        y = df['target']

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            shuffle=False)
        tscv = TimeSeriesSplit(n_splits=5)

        svm_param_grid = {
            'C': [2**x for x in range(-5, 5)],
            'gamma': [2**x for x in range(-7, 1)],
            'kernel': ['rbf']
        }
        svm_grid = GridSearchCV(SVC(),
                                svm_param_grid,
                                verbose=1,
                                cv=tscv,
                                n_jobs=-1).fit(X_train, y_train)
        svm_model = SVC(kernel='rbf', C=1, gamma=1).fit(X_train, y_train)
        svm_pred = svm_model.predict(X_test)
        svm_acc = accuracy_score(y_test, svm_pred)

        rf_param_grid = {
            'bootstrap': [False],
            'max_depth': [None],
            'max_features': [None],
            'min_samples_leaf': [200, 250, 300],
            'min_samples_split': [2, 4, 8, 10],
            'n_estimators': [100]
        }
        rfgrid = GridSearchCV(RandomForestClassifier(),
                              param_grid=rf_param_grid,
                              cv=tscv,
                              scoring='accuracy',
                              n_jobs=-1,
                              verbose=1).fit(X_train, y_train)
        rf_model = RandomForestClassifier(bootstrap=False,
                                          n_estimators=200,
                                          min_samples_leaf=8,
                                          min_samples_split=8).fit(
                                              X_train, y_train)
        rf_pred = rf_model.predict(X_test)
        rf_acc = accuracy_score(y_test, rf_pred)

        knn_param_grid = {'n_neighbors': [x for x in range(1, 100)]}  # k must be >= 1
        knngrid = GridSearchCV(KNeighborsClassifier(),
                               param_grid=knn_param_grid,
                               cv=tscv,
                               scoring='accuracy',
                               verbose=1,
                               n_jobs=-1).fit(X_train, y_train)
        knn_model = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)
        knn_pred = knn_model.predict(X_test)
        knn_acc = accuracy_score(y_test, knn_pred)

        print(
            'Prediction Accuracy of Google stock with the opinions approach: \n'
        )
        print(f'SVM Model Accuracy : {100*svm_acc:.2f}%')
        print(f'RF Model Accuracy : {100*rf_acc:.2f}%')
        print(f'KNN Model Accuracy : {100*knn_acc:.2f}%')
        trade_svm = self.tradetestreturn(
            svm_pred, df['Adj Close'][3072:].reset_index()['Adj Close'])
        trade_rf = self.tradetestreturn(
            rf_pred, df['Adj Close'][3072:].reset_index()['Adj Close'])
        trade_knn = self.tradetestreturn(
            knn_pred, df['Adj Close'][3072:].reset_index()['Adj Close'])

        print(f'SVM trade test Net Profit: ${trade_svm:.2f}')
        print(f'RF trade test Net Profit: ${trade_rf:.2f}')
        print(f'KNN trade test Net Profit: ${trade_knn:.2f}')
Example 10
# In[51]:

# cols_to_drop=['V300','V309','V111','C3','V124','V106','V125','V315','V134','V102','V123','V316','V113',
#               'V136','V305','V110','V299','V289','V286','V318','V103','V304','V116','V29','V284','V293',
#               'V137','V295','V301','V104','V311','V115','V109','V119','V321','V114','V133','V122','V319',
#               'V105','V112','V118','V117','V121','V108','V135','V320','V303','V297','V120']

# print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))

# X = X.drop(cols_to_drop, axis=1)
# test_X = test_X.drop(cols_to_drop, axis=1)

# In[52]:

folds = TimeSeriesSplit(n_splits=5)

aucs = list()
feature_importances = pd.DataFrame()
feature_importances['feature'] = X.columns

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(X, y)):
    start_time = time()
    print('Training on fold {}'.format(fold + 1))

    trn_data = lgb.Dataset(X.iloc[trn_idx], label=y.iloc[trn_idx])
    val_data = lgb.Dataset(X.iloc[test_idx], label=y.iloc[test_idx])
    #     clf = lgb.train(params, trn_data, num_boost_round = 10000, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds=500)
    clf = lgb.train(params,
                    trn_data,
Example 11
    fname = sys.argv[1]
    width = int(sys.argv[2])

    df = (feather.read_dataframe(fname)
                 .set_index('Datetime'))

    df_roll = window_stack(df, width=width)

    mem = df_roll.memory_usage(index=True, deep=True)
    print(mem)
    print(mem.sum()*1e-9)

    # Split target (time t) and variables (times t-1 to t-width+1)
    y = df_roll['t']
    X = df_roll.drop(columns='t', level='time')

    # Split train-test, approximately 12 and 4 months respectively
    X_train, X_test = X[:'2011-07-31'], X['2011-08-01':]
    y_train, y_test = y[:'2011-07-31'], y['2011-08-01':]

    enet = MultiOutputRegressor(ElasticNetCV(cv=TimeSeriesSplit(n_splits=5), l1_ratio=0.5), n_jobs=10)
    with timer():
        enet.fit(X_train, y_train)

    y_test_pred = pd.DataFrame(enet.predict(X_test), index=y_test.index, columns=y_test.columns)
    res = pd.concat((y_test, y_test_pred), axis=1, keys=['Actual', 'Pred'])

    with open('model_{}.pkl'.format(width), 'wb') as f:
        pickle.dump({'model': enet, 'pred': res}, f)
Example 12
    def train(self, replay_file=os.path.join('data_examples', 'btc_price_2017-09-13T03:45:28+00:00.csv')):
        # DATA PART #######################
        # removing the columns where the last price did not move. It biases the model.
        prices = pd.read_csv(replay_file, index_col=0, parse_dates=True)
        prices['last'] = prices['last'].astype(float)
        prices['last'] = compute_returns(prices['last'])
        prices = prices[prices['last'] != 0]
        # splitting training, cv, test set
        prices_train, prices_cv, prices_test = split_prices(prices)

        # RUN PART #######################
        running_difference_tr = deque(maxlen=100)
        running_accuracy_tr = deque(maxlen=100)
        running_difference = deque(maxlen=100)
        running_accuracy = deque(maxlen=100)
        running_difference_cv = deque(maxlen=100)
        running_accuracy_cv = deque(maxlen=100)

        tscv = TimeSeriesSplit(n_splits=self.steps)

        for i, (train_index, cv_index) in enumerate(tscv.split(prices_train)):
            prices_train_fold = prices_train.iloc[train_index, :]
            prices_cv_fold = prices_train.iloc[cv_index, :]

            # gradient update
            x_train, t_train, y_train = get_batch(self.batch_size, prices_train_fold, self.sequence_length)
            st = time()
            _, te_loss_tr, be_loss_tr = self.sess.run([self.train_step, self.loss, self.benchmark_loss],
                                                      feed_dict={self.x_: x_train,
                                                                 self.y_: y_train,
                                                                 self.t_: t_train})  # gradient update.

            running_difference_tr.append(be_loss_tr - te_loss_tr)
            running_accuracy_tr.append(te_loss_tr < be_loss_tr)
            print(
                'steps = {0} | time {1:.3f} | te_loss_tr = {2:.6f}, be_loss_tr = {3:.6f}, r_diff_tr = {4:.6f}, r_acc_tr = {5:.3f}'.format(
                    str(i).zfill(6), time() - st, te_loss_tr, be_loss_tr, np.mean(running_difference_tr),
                    np.mean(running_accuracy_tr)))
            self.file_logger.write(
                [i, te_loss_tr, be_loss_tr, np.mean(running_difference_tr), np.mean(running_accuracy_tr)])

            # cross validation after gradient update step
            x_test, t_test, y_test = get_batch(self.batch_size, prices_cv_fold, self.sequence_length)
            te_loss, be_loss = self.sess.run([self.loss, self.benchmark_loss],
                                             feed_dict={self.x_: x_test,
                                                        self.y_: y_test,
                                                        self.t_: t_test})
            running_difference.append(be_loss - te_loss)
            running_accuracy.append(te_loss < be_loss)
            print(
                'steps = {0} | time {1:.3f} | te_loss_cv = {2:.6f}, be_loss_cv = {3:.6f}, r_diff_cv = {4:.6f}, r_acc_cv = {5:.3f}'.format(
                    str(i).zfill(6), time() - st, te_loss, be_loss, np.mean(running_difference),
                    np.mean(running_accuracy)))
            self.file_logger.write([i, te_loss, be_loss, np.mean(running_difference), np.mean(running_accuracy)])

        # cross validation after done training
        for i in range(self.cv_steps):
            x_cv, t_cv, y_cv = get_batch(self.batch_size, prices_cv, self.sequence_length)
            cv_loss, be_loss = self.sess.run([self.loss, self.benchmark_loss],
                                             feed_dict={self.x_: x_cv,
                                                        self.y_: y_cv,
                                                        self.t_: t_cv})

            running_difference_cv.append(be_loss - cv_loss)
            running_accuracy_cv.append(cv_loss < be_loss)
        print(
            'CV | cv_loss = {0:.6f}, be_loss = {1:.6f}, r_diff = {2:.6f}, r_acc = {3:.3f}'.format(
                cv_loss, be_loss, np.mean(running_difference_cv), np.mean(running_accuracy_cv)))
        self.file_logger.write([i, cv_loss, be_loss, np.mean(running_difference_cv), np.mean(running_accuracy_cv)])

        self.file_logger.close()
Example 13
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df)
df = pd.DataFrame(scaler.transform(df), columns=df.columns)

min_sample_leaf = round(y.shape[0] * 0.0001)
min_sample_split = min_sample_leaf * 10
model = RandomForestRegressor(n_estimators=500,
                              min_samples_leaf=min_sample_leaf,
                              min_samples_split=min_sample_split,
                              random_state=42,
                              max_depth=None,
                              n_jobs=-1,
                              max_features=5)

skf = TimeSeriesSplit(n_splits=5)

y_pred_score = np.empty(shape=[0])
y_true = np.empty(shape=[0])
predicted_index = np.empty(shape=[0])

for train_index, test_index in skf.split(df, y):
    print('iter')
    X_train, X_test = df.loc[train_index].values, df.loc[test_index].values
    y_train, y_test = y[train_index], y[test_index]
Example 14
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Neural Network model

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            model = MLPRegressor(
                max_iter=25,
                hidden_layer_sizes=(64, 64),
                learning_rate_init=0.001,
                batch_size=16,
                alpha=0,
                learning_rate="adaptive",
                activation="relu",
                solver="adam",
                warm_start=True,
                shuffle=False,
                random_state=42,
                verbose=False,
            )
            if MULTI:
                model = MultiOutputRegressor(
                    model,
                    n_jobs=N_JOBS,
                )
            pipeline = Pipeline(
                [
                    ("var", VarianceThreshold()),
                    ("scale", MinMaxScaler()),
                    ("model", model),
                ]
            )

            if self.tune_model:
                # set up cross validation for time series; pass the splitter
                # itself (an integer cv would make the search fall back to KFold)
                tscv = TimeSeriesSplit(n_splits=3)

                # set up the tuner
                str_ = ""
                if MULTI:
                    str_ = "estimator__"
                parameters = {
                    f"model__{str_}hidden_layer_sizes": (
                        (32, 32),
                        (64, 64),
                        (128, 128),
                    ),
                    f"model__{str_}batch_size": (16, 32),
                    f"model__{str_}learning_rate_init": (0.0001, 0.001, 0.01),
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=tscv,
                    random_state=0,
                    n_jobs=1 if MULTI else N_JOBS,
                )

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self,
                        "_model",
                        grid.fit(X, Y).best_estimator_,  # search for the best model
                    )
            else:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")  # ignore common warning
                    object.__setattr__(
                        self, "_model", pipeline.fit(X, Y)  # train the model
                    )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
    model_nm_dict = {}
    for option in parser.options(section):
        model_nm_dict[option] = [
            i for i in ast.literal_eval(parser.get(section, option))
        ]

# retrieve the selected models
for key, name in model_nm_dict.items():
    register_opt_estimators[key] = joblib.load(
        './mlp/optimised_models/register_opt_' + str(key) + '.pkl')
    guest_opt_estimators[key] = joblib.load(
        './mlp/optimised_models/guest_opt_' + str(key) + '.pkl')

## Cross validation metrics
# set up the time series cv splitter
tscv = TimeSeriesSplit(n_splits=3)
counter = 0
# dictionary to store cv metrics in
cv_metrics_dict = {}
user_pred_dict = {
    'registered_users': 'pred_reg_user',
    'guest_users': 'pred_gs_user',
    # 'target': 'pred_target'
}
## Run cross_validation
for train_split_index, val_index in tscv.split(X_train):
    X_train_cv = X_train[train_split_index].copy()
    y_train_cv = y_train[train_split_index].copy()
    X_val_cv = X_train[val_index].copy()
    y_val_cv = y_train[val_index].copy()
    counter += 1
        include_flags=False,
        policy_category=PolicyCategory.HEALTH_INDICATORS,
        normalize=norm_data)
    train_x, train_y, test_x, test_y = countryPolicyCarbonData.split_train_test(
        fill_nan=False)
    train_features = pd.concat([train_features, train_x])
    test_features = pd.concat([test_features, test_x])
    train_labels = pd.concat([train_labels, train_y])
    test_labels = pd.concat([test_labels, test_y])

print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)
print(test_labels.shape)
# Train model with 5 fold cross validation
tss = TimeSeriesSplit()
_, n_features = train_features.shape
cnn = DeepLearningModel(training_config,
                        num_features=n_features,
                        num_outputs=1)
print(cnn.model.summary())
losses = []
start = time.time()
for train_idx, test_idx in tss.split(train_features):
    X, X_val = train_features.iloc[train_idx], train_features.iloc[test_idx]
    Y, Y_val = train_labels.iloc[train_idx], train_labels.iloc[test_idx]
    features, labels = utils.data_sequence_generator(
        X, Y, training_config['time_steps'])
    val_f, val_l = utils.data_sequence_generator(X_val, Y_val,
                                                 training_config['time_steps'])
    h = cnn.train_with_validation_provided(features, labels, val_f, val_l)
Example 17
    lis = []
    for i in range(1, n):
        pred_index = [n - i]
        if (n - i - max_train_size - period) >= 0:
            train_index = [
                j
                for j in range(n - i - max_train_size - period, n - i - period)
            ]
            lis.append((train_index, pred_index))
    lis.reverse()
    return lis
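
# The hand-rolled split above (a fixed-length training window, with each
# single prediction index separated from its window by `period` observations)
# resembles what newer scikit-learn exposes directly; a sketch assuming
# sklearn >= 0.24, with an illustrative gap value:
from sklearn.model_selection import TimeSeriesSplit
tscv_equiv = TimeSeriesSplit(n_splits=300, max_train_size=120, test_size=1, gap=5)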


stock_num = dv.get_ts('close_adj').shape[1]
time_index = X.unstack().index.values
tscv = TimeSeriesSplit(max_train_size=5, n_splits=300)  # note: the loop below uses the custom split() rather than this splitter
pred = []
i = 0
for train_index, pred_index in split(X.unstack().index.values,
                                     max_train_size=120,
                                     period=period):
    i += 1
    indexer = [slice(None)] * 2
    indexer[X.index.names.index('trade_date')] = time_index[train_index]
    indexer2 = [slice(None)] * 2
    indexer2[X.index.names.index('trade_date')] = time_index[pred_index]
    #clf = RFR(max_depth=3,min_samples_leaf=9,max_leaf_nodes=4)
    #clf = SVR(C = 1)
    #clf = LinearRegression()
    #clf = Ridge()
    clf = LogisticRegression()
Example 18
 def test_cv(self):
     X, y = load_boston(return_X_y=True)
     X_train, _, y_train, _ = train_test_split(X,
                                               y,
                                               test_size=0.1,
                                               random_state=42)
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
     params_with_metric = {'metric': 'l2', 'verbose': -1}
     lgb.cv(params_with_metric,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            stratified=False,
            shuffle=False,
            metrics='l1',
            verbose_eval=False)
     # shuffle = True, callbacks
     lgb.cv(params,
            lgb_train,
            num_boost_round=10,
            nfold=3,
            stratified=False,
            shuffle=True,
            metrics='l1',
            verbose_eval=False,
            callbacks=[
                lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
            ])
     # self defined folds
     tss = TimeSeriesSplit(3)
     folds = tss.split(X_train)
     lgb.cv(params_with_metric,
            lgb_train,
            num_boost_round=10,
            folds=folds,
            stratified=False,
            verbose_eval=False)
     # lambdarank
     X_train, y_train = load_svmlight_file(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(
         os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {
         'objective': 'lambdarank',
         'verbose': -1,
         'eval_at': 3
     }
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     # ... with NDCG (default) metric
     cv_res = lgb.cv(params_lambdarank,
                     lgb_train,
                     num_boost_round=10,
                     nfold=3,
                     stratified=False,
                     verbose_eval=False)
     self.assertEqual(len(cv_res), 2)
     self.assertFalse(np.isnan(cv_res['ndcg@3-mean']).any())
     # ... with l2 metric
     cv_res = lgb.cv(params_lambdarank,
                     lgb_train,
                     num_boost_round=10,
                     nfold=3,
                     stratified=False,
                     metrics='l2',
                     verbose_eval=False)
     self.assertEqual(len(cv_res), 2)
     self.assertFalse(np.isnan(cv_res['l2-mean']).any())
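The assertEqual(len(cv_res), 2) checks pass because, in the LightGBM version this test targets, lgb.cv returns a dict with one '<metric>-mean' and one '<metric>-stdv' list (one entry per boosting round); a small sketch, assuming the regression lgb_train defined earlier in the test:

cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=5,
                nfold=3, stratified=False, verbose_eval=False)
print(sorted(cv_res.keys()))  # e.g. ['l2-mean', 'l2-stdv']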
Example 19
df[t + '_1d_r'] = np.log(df[t] / df[t].shift(1))
for lag in range(1, lags + 1):
    df[t + '_' + str(lag) + 'd_r'] = df[t + '_1d_r'].shift(lag)

# Define Model X and y
df[t + '_y'] = np.sign(np.log(
    df[t].shift(-1) /
    df[t]))  # dependent variable = 1 day future return on a binary basis
df.dropna(inplace=True)
X = df.filter(regex='_r').copy()
y = df[t + '_y']
y.head(5)

# train/validation split:
tscv = TimeSeriesSplit(
    n_splits=2
)  # generate train/cv indices => this generates 2 sets of train/cv indices
train_idx = list(tscv.split(df))[1][0]  # take the second set of train indices
X = X.iloc[train_idx]
y = y.iloc[train_idx]

# Model Training: Train simple Logit Model
model = LogisticRegression()
model.fit(X, y)
model.score(X, y)

#### Approach 1:Pickle
# import library
import pickle

pkl_file = "LOG_model.pkl"
Example 20
cv = PredefinedSplit(test_fold)

# Check that we only have a single train-test split, and the size
train_idx, test_idx = next(cv.split())
print(
    f"Splits: {cv.get_n_splits()}, Train size: {len(train_idx)}, Test size: {len(test_idx)}"
)

# Alternatively, we could want to use the [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html#sklearn.model_selection.TimeSeriesSplit) cross-validator, which allows us to do several "into the future folds" for predictions

# In[14]:

from sklearn.model_selection import TimeSeriesSplit

# Here we just do 3-fold timeseries CV
cv = TimeSeriesSplit(max_train_size=None, n_splits=3)

# Let us check the sizes of the folds. Note that you can keep train size constant with max_train_size if needed
for i, (train_index, test_index) in enumerate(cv.split(X)):
    print(
        f"Split {i+1} / {cv.get_n_splits()}:, Train size: {len(train_index)}, Test size: {len(test_index)}"
    )
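
# As noted above, the training window can instead be held constant via
# max_train_size; an illustrative variant using the same X:
cv_fixed = TimeSeriesSplit(max_train_size=1000, n_splits=3)
for i, (tr, te) in enumerate(cv_fixed.split(X)):
    print(f"Split {i+1}: Train size: {len(tr)}, Test size: {len(te)}")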

# ## Optimal xgBoost parameters
# After a few days of running xgBoost, it found the following optimal parameters. Again, note that these gave me a 0.9769 score on [these features](https://www.kaggle.com/nanomathias/feature-engineering-importance-testing) and not the raw features, by training on the entire training set.

# In[ ]:

{
    'colsample_bylevel': 0.1,
    'colsample_bytree': 1.0,
Example 21
def Convultional(data, string):

    tscv = TimeSeriesSplit(max_train_size=None, n_splits=5)
    a = []

    for train_index, test_index in tscv.split(data.scaled_dataset):

        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, y_train = data.Nueral_Network(data.scaled_dataset,
                                               data.scaled_dataset[:, -1], 0,
                                               data.train_set, data.timesteps)
        X_val, y_val = data.Nueral_Network(data.scaled_dataset,
                                           data.scaled_dataset[:, -1],
                                           data.train_set, data.validation_set,
                                           data.timesteps)
        X_test, y_test = data.Nueral_Network(data.scaled_dataset,
                                             data.scaled_dataset[:, -1],
                                             data.validation_set,
                                             data.test_set, data.timesteps)

        # Defines the model's input shape, the loss function, and the metric
        # used for the error function. The 'data' object passed into the
        # module as an argument calls each lot's pre-processing module to
        # obtain the training, testing, and validation data sets

        input_shape = X_train.shape[-2:]
        loss = tf.keras.losses.MeanAbsoluteError()
        metric = tf.keras.metrics.MeanAbsolutePercentageError()

        # Reshapes the y_test numpy array so it can be passed into the mean_absolute_percentage_error function
        # Reverses the scaler to re-obtain the actual values of the data

        y_test_reshaped = y_test.reshape(-1, 1)
        y_test_inv = data.scaler.inverse_transform(y_test_reshaped)

        # Sets the number of samples to use in each iteration and shuffles the data to prevent over-fitting

        batch_size = 64
        shuffle_size = 64

        val = tf.data.Dataset.from_tensor_slices((X_val, y_val))
        val = val.cache().shuffle(shuffle_size).batch(batch_size).prefetch(1)

        train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
        train = train.cache().shuffle(shuffle_size).batch(
            batch_size).prefetch(1)

        # Builds the model. filters defines the number of sliding windows (feature maps) that move over the time series data,
        # kernel_size defines the size of each window,
        # strides defines how many inputs the window moves after each convolution,
        # and padding handles null values that may result from the other parameters.
        # After the convolutional layer, the data's dimensions are reduced by the Flatten() layer and passed to a
        # traditional MLP network with one 50-unit hidden layer and a single-unit output layer

        CNN = tf.keras.models.Sequential([
            Conv1D(filters=100,
                   kernel_size=2,
                   strides=1,
                   padding='causal',
                   activation='relu',
                   input_shape=input_shape),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(1),
        ])

        optimizer = tf.keras.optimizers.Adam(learning_rate=.0001, amsgrad=True)
        CNN.compile(loss=loss, optimizer=optimizer, metrics=metric)
        tf.keras.backend.set_epsilon(1)

        Model = CNN.fit(train, epochs=100, validation_data=val)

        # predict is a built-in Keras method that applies the trained network to new data
        # The forecast's scaled values are then transformed back to real values and passed to the MAPE function

        forecast = CNN.predict(X_test)
        CNN_forecast = data.scaler.inverse_transform(forecast)
        MAPE = mean_absolute_percentage_error(y_test_inv, CNN_forecast)

        a.append(np.array(MAPE))

        # MAPE and Loss are plotted

        plot_model_mape(Model, string)
        plot_model_loss(Model, string)

        # The model and the weights are saved as JSON and h5 files

    CNN_JSON = CNN.to_json()
    with open(
            "Project/Saved_Models/Buildings/" + string + "/CNN/" + string +
            "_CNN_LSTM.json", "w") as json_file:
        json_file.write(CNN_JSON)

    CNN.save_weights('Project/Saved_Models/Buildings/' + string +
                     '/CNN_LSTM/' + string + '_CNN_LSTM.h5')

    print('CNN forecast MAPE of hour-ahead electricity demand: {}'.format(a))
    return CNN
Esempio n. 22
0
def fit_feature_importance_cross_validation(ticker,
                                            feature_label_list,
                                            forest,
                                            X_data,
                                            y_data,
                                            splits=3):
    from sklearn.model_selection import TimeSeriesSplit
    # An example of TimeSeriesSplit:
    #
    #   >>> for train_index, test_index in tscv.split(X):
    #   ...     print("TRAIN:", train_index, "TEST:", test_index)
    #   ...     X_train, X_test = X[train_index], X[test_index]
    #   ...     y_train, y_test = y[train_index], y[test_index]
    #   TRAIN: [0] TEST: [1]
    #   TRAIN: [0 1] TEST: [2]
    #   TRAIN: [0 1 2] TEST: [3]

    # Initializes time series split object
    time_series_cv = TimeSeriesSplit(n_splits=splits)
    split_cnt = 1

    # Create time series split indices. Trains and tests
    # model on split data
    for train_index, test_index in time_series_cv.split(X_data):

        X_train, X_test = X_data[train_index], X_data[test_index]
        y_train, y_test = y_data[train_index], y_data[test_index]

        forest.fit(X_train, y_train)
        importances = forest.feature_importances_
        std = np.std(
            [tree.feature_importances_ for tree in forest.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        # Print progress
        print(" Cross Valid " + str(split_cnt) + " for %s Finished" % ticker)
        split_cnt = split_cnt + 1

        # Print the feature ranking
        print("Feature ranking for %s:" % ticker)

        for f in range(X_train.shape[1]):
            print("No.%d feature %d %s (%f)" % (f + 1, indices[f],
                                                feature_label_list[indices[f]],
                                                importances[indices[f]]))

        # Plot the feature importances of the forest
        plt.figure()
        plt.title("Feature importance for %s:" % ticker)
        plt.bar(range(X_train.shape[1]),
                importances[indices],
                color="g",
                yerr=std[indices],
                align="center")
        plt.xticks(range(X_train.shape[1]), indices)
        plt.xlim([-1, X_train.shape[1]])
        plt.grid()
        plt.show()
Esempio n. 23
0
    def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int],
            time_col: str) -> object:
        """
        Fits the model to the data

        :param ts_df The time series data to be used for fitting the model
        :type ts_df pd.DataFrame

        :param target_col The column name of the target time series that needs to be modeled.
        All other columns will be considered as exogenous variables (if applicable to method)
        :type target_col str

        :param cv: Number of folds to use for cross validation.
        Number of observations in the Validation set for each fold = forecast period
        If None, a single fold is used
        :type cv Optional[int]

        :param time_col: Name of the time column in the dataset (needed by Prophet)
        Time column can also be the index, in which case, this would be the name of the index
        :type time_col str

        :rtype object
        """
        # use all available threads/cores

        self.time_col = time_col
        self.original_target_col = target_col
        self.original_preds = [
            x for x in list(ts_df) if x not in [self.original_target_col]
        ]

        if len(self.original_preds) == 0:
            self.univariate = True
        else:
            self.univariate = False

        # print(f"Prophet Is Univariate: {self.univariate}")

        ts_df = copy.deepcopy(ts_df)

        ##### if you are going to use matplotlib with prophet data, it gives an error unless you do this.
        pd.plotting.register_matplotlib_converters()

        #### You have to import Prophet if you are going to build a Prophet model #############
        actual = 'y'
        timecol = 'ds'
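        # Prophet requires these exact column names: 'ds' for the datestamp
        # column and 'y' for the numeric target.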

        data = self.prep_col_names_for_prophet(ts_df=ts_df, test=False)

        if self.univariate:
            dft = data[[timecol, actual]]
        else:
            dft = data[[timecol, actual] + self.original_preds]

        ##### For most Financial time series data, 80% conf interval is enough...
        if self.verbose >= 1:
            print(
                '    Fit-Predict data (shape=%s) with Confidence Interval = %0.2f...'
                % (dft.shape, self.conf_int))
        ### Make Sure you lower your desired interval width from the normal 95% to a more realistic 80%
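        ### (For reference, the width is set on the Prophet constructor,
        ### e.g. Prophet(interval_width=0.80) for an 80% interval.)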
        start_time = time.time()

        if self.univariate is False:
            for name in self.original_preds:
                self.model.add_regressor(name)

        print("  Starting Prophet Fit")

        if self.seasonality:
            prophet_seasonality, prophet_period, fourier_order, prior_scale = get_prophet_seasonality(
                self.time_interval, self.seasonal_period)
            self.model.add_seasonality(name=prophet_seasonality,
                                       period=prophet_period,
                                       fourier_order=fourier_order,
                                       prior_scale=prior_scale)
            print(
                '       Adding %s seasonality to Prophet with period=%d, fourier_order=%d and prior_scale=%0.2f'
                % (prophet_seasonality, prophet_period, fourier_order,
                   prior_scale))
        else:
            print(
                '      No seasonality assumed since seasonality flag is set to False'
            )

        with SuppressStdoutStderr():
            self.model.fit(dft)
            self.train_df = copy.deepcopy(dft)

        print("  End of Prophet Fit")

        num_obs = dft.shape[0]
        NFOLDS = self.get_num_folds_from_cv(cv)

        if self.verbose >= 2:
            print(f"NumObs: {num_obs}")
            print(f"NFOLDS: {NFOLDS}")

        #########################################################################################
        # NOTE: This change to the FB recommendation will cause the cv folds from facebook to
        # be incompatible with the folds from the other models (in terms of periods of evaluation
        # as well as number of observations in each period). Hence the final comparison will
        # be biased since it will not compare the same folds.

        # The original implementation was giving issues under certain conditions, hence this change
        # to FB recommendation has been made as a temporary (short term) fix.
        # The root cause issue will need to be fixed eventually at a later point.
        #########################################################################################

        ### Prophet's Time Interval translates into frequency based on the following pandas date_range alias:
        #  Link: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
        ## This is done using the get_prophet_time_interval() function later.
        if self.time_interval in self.list_of_valid_time_ints:
            time_int = copy.deepcopy(self.time_interval)
        else:
            time_int = self.get_prophet_time_interval(for_cv=False)

        # First  Fold -->
        #   Train Set: 0:initial
        #   Test Set: initial:(initial+horizon)
        # Second Fold -->
        #   Train Set: (period):(initial+period)
        #   Test Set: (initial+period):(initial+horizon+ period)
        # Format: '850 D'
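        # For instance (illustrative numbers only): with initial='850 D',
        # period='100 D' and horizon='50 D', the first fold trains on days
        # 0-849 and tests on days 850-899, while the second fold trains on
        # days 100-949 and tests on days 950-999.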

        print("  Starting Prophet Cross Validation")
        ################################################################################
        if self.forecast_period <= 5:
            #### Set a minimum of 5 for the number of rows in test!
            self.forecast_period = 5
        ### In case the number of forecast_period is too high, just reduce it so it can fit into num_obs
        if NFOLDS * self.forecast_period > num_obs:
            self.forecast_period = int(num_obs / (NFOLDS + 1))
            print('Lowering forecast period to %d to enable cross_validation' %
                  self.forecast_period)
        ###########################################################################################
        #cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period)
        max_trainsize = len(dft) - self.forecast_period
        cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size=max_trainsize)
        y_preds = pd.DataFrame()
        print('Max. iterations using expanding window cross validation = %d' %
              NFOLDS)
        start_time = time.time()
        rmse_folds = []
        norm_rmse_folds = []
        y_trues = pd.DataFrame()
        for fold_number, (train_index, test_index) in enumerate(cv.split(dft)):
            train_fold = dft.iloc[train_index]
            test_fold = dft.iloc[test_index]
            horizon = len(test_fold)
            print(
                f"\nFold Number: {fold_number+1} --> Train Shape: {train_fold.shape[0]} Test Shape: {test_fold.shape[0]}"
            )

            #########################################
            #### Define the model with fold data ####
            #########################################

            model = Prophet(growth="linear")

            ############################################
            #### Fit the model with train_fold data ####
            ############################################

            kwargs = {
                'iter': 100  # Stan expects an integer iteration count
            }  ## this limits iterations and hence speeds up Prophet
            model.fit(train_fold, **kwargs)

            #################################################
            #### Predict using model with test_fold data ####
            #################################################

            future_period = model.make_future_dataframe(freq=time_int,
                                                        periods=horizon)
            forecast_df = model.predict(future_period)
            ### Now compare the actuals with predictions ######
            y_pred = forecast_df['yhat'][-horizon:]
            if fold_number == 0:
                y_preds = copy.deepcopy(y_pred)
            else:
                # Series.append was removed in pandas 2.0; concat is the replacement
                y_preds = pd.concat([y_preds, y_pred])
            rmse_fold, rmse_norm = print_dynamic_rmse(test_fold[actual],
                                                      y_pred,
                                                      test_fold[actual])
            print('Cross Validation window: %d completed' %
                  (fold_number + 1, ))
            rmse_folds.append(rmse_fold)
            norm_rmse_folds.append(rmse_norm)

        ######################################################
        ### This is where you consolidate the CV results #####
        ######################################################
        fig = model.plot(forecast_df)
        rmse_mean = np.mean(rmse_folds)
        print('Average CV RMSE over %d windows (macro) = %0.5f' %
              (fold_number + 1, rmse_mean))
        y_trues = dft[-y_preds.shape[0]:][actual]
        cv_micro = np.sqrt(mean_squared_error(y_trues.values, y_preds.values))
        print('Average CV RMSE of all predictions (micro) = %0.5f' % cv_micro)

        try:
            if self.verbose >= 2:
                quick_ts_plot(y_trues, y_preds)
            else:
                pass
        except Exception:
            print('Error: Not able to plot Prophet CV results')

        forecast_df_folds = copy.deepcopy(y_preds)
        print("  End of Prophet Cross Validation")
        print('Time Taken = %0.0f seconds' % ((time.time() - start_time)))

        if self.verbose >= 1:
            print("Prophet CV DataFrame")
            #print(performance_metrics(df_cv).head())
        if self.verbose >= 2:
            print("Prophet plotting CV Metrics")
            #_ = plot_cross_validation_metric(df_cv, metric=self.scoring)
            #plt.show()

        #num_obs_folds = df_cv.groupby('cutoff')['ds'].count()

        # https://stackoverflow.com/questions/54405704/check-if-all-values-in-dataframe-column-are-the-same
        #a = num_obs_folds.to_numpy()
        #all_equal = (a[0] == a).all()

        #if not all_equal:
        #print("WARNING: All folds did not have the same number of observations in the validation sets.")
        #print("Num Test Obs Per fold")
        #print(num_obs_folds)

        #rmse_folds = []
        #norm_rmse_folds = []
        #forecast_df_folds = []

        #df_cv_grouped = df_cv.groupby('cutoff')
        #for (_, loop_df) in df_cv_grouped:
        #    rmse, norm_rmse = print_dynamic_rmse(loop_df['y'], loop_df['yhat'], dft['y'])
        #    rmse_folds.append(rmse)
        #    norm_rmse_folds.append(norm_rmse)
        #    forecast_df_folds.append(loop_df)

        # print(f"RMSE Folds: {rmse_folds}")
        # print(f"Norm RMSE Folds: {norm_rmse_folds}")
        # print(f"Forecast DF folds: {forecast_df_folds}")

        # forecast = self.predict(simple=False, return_train_preds=True)

        # ####  We are going to plot Prophet's forecasts differently since it is better
        # dfa = plot_prophet(dft, forecast);
        # # Prophet makes Incredible Predictions Charts!
        # ###  There can't be anything simpler than this to make Forecasts!
        # #self.model.plot(forecast);  # make sure to add semi-colon in the end to avoid plotting twice
        # # Also their Trend, Seasonality Charts are Spot On!
        # try:
        #     self.model.plot_components(forecast)
        # except:
        #     print('Error in FB Prophet components forecast. Continuing...')

        #rmse, norm_rmse = print_dynamic_rmse(dfa['y'], dfa['yhat'], dfa['y'])
        print('---------------------------')
        print('Final Prophet CV results:')
        print('---------------------------')
        rmse, norm_rmse = print_dynamic_rmse(y_trues, y_preds, y_trues)

        #return self.model, forecast, rmse, norm_rmse
        return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
Esempio n. 24
0
def grid_search(df,
                lambda2_range,
                sigma2_range,
                burn_in=300,
                n_splits=15,
                return_mean_vld_error=False,
                verbose=False):
    """Find the best Kalman filter parameters via grid search cross-validation.

    This function perform a grid search of the optimal (lambda2, r)
    parameters of the pykalman.KalmanFilter on input data where:
    
    transition_matrix      -> F = [[2,-1], [1, 0]] (double-integrated random-walk model)
    transition_covariance  -> Q = [[lambda2, 0], [0, 0]]
    observation_covariance -> R = [sigma2]
    observation_model      -> H = [1, 0]
    
    as in [1]. In this function lambda2 and sigma2 are not estimated
    using the Bayesian framework described in [1], but they are
    obtained via cross-validation. The optimization is ran on ...
    

    Parameters
    -------------------
    df : DataFrame, the output returned by gluco_extract(..., return_df=True)
    burn_in : number, the number of samples at the beginning of the time-series
              that should be split off to perform the grid search (default=300)
    n_splits : number, the number of splits of the time-series cross-validation
               schema (default=15). Your prediction horizon will be
               `floor(n_samples / (n_splits + 1))`
    [....]
    return_mean_vld_error : bool, return the average validation error (default=False)
    verbose : bool, print debug messages (default=False)

    Returns
    -------------------
    [...]
    
    References
    -------------------
    [1] Facchinetti, Andrea, Giovanni Sparacino, and Claudio Cobelli.
    "An online self-tunable method to denoise CGM sensor data."
    IEEE Transactions on Biomedical Engineering 57.3 (2010): 634-641.
    """
    n_samples = df.shape[0]

    # Argument check
    if n_samples < burn_in:
        raise Exception('The number of burn in samples %d should be '
                        'smaller than the total number of samples '
                        '%d' % (burn_in, n_samples))

    # State-space model
    F = np.array([[2, -1],
                  [1, 0]])  # transition matrix (double integration model)
    H = np.array([1, 0])  # measures matrix

    # Isolate the burn in samples
    time_series = df.iloc[:burn_in]

    # Parameter grid definition
    param_grid = ParameterGrid({
        'lambda2': lambda2_range,  # see state covariance Q
        'sigma2': sigma2_range
    })  # noise variance

    # Time-series cross validation split
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Initialize the cross-validation error matrix of size
    # (len(lambda2_range), len(sigma2_range))
    mean_vld_error = np.zeros((len(lambda2_range), len(sigma2_range)))
    std_vld_error = np.zeros_like(mean_vld_error)

    # Positions dictionary
    d_lambda = dict(zip(lambda2_range, np.arange(len(lambda2_range))))
    d_sigma = dict(zip(sigma2_range, np.arange(len(sigma2_range))))

    # Iterate through the parameters lambda2, sigma2
    # i, j index will be used to access the mean_vld_error matrix
    for param in param_grid:
        if verbose: print('trying params {} ...'.format(param))
        l2, s2 = param['lambda2'], param['sigma2']

        Q = np.array([[l2, 0], [0, 0]])  # transition_covariance
        R = s2  # observation (co)variance

        # Init the vld_error vector for the current order
        vld_error = np.zeros(n_splits)

        # Iterate through the CV splits
        for cv_count, (tr_index,
                       vld_index) in enumerate(tscv.split(time_series)):
            if cv_count == 0:  # init X0 and P0 via EM on the first chunk of data
                y_0 = time_series.iloc[np.hstack(
                    (tr_index, vld_index))].values.ravel()

                # Init KalmanFilter object
                kf = CGMKalmanFilter(F=F, Q=Q, R=R, X0=None, P0=None)
                kf.em(y_0,
                      em_vars=('initial_state_mean',
                               'initial_state_covariance'))
            else:
                y_tr = time_series.iloc[tr_index].values.ravel()
                y_vld = time_series.iloc[vld_index].values.ravel()
                y_pred, X_new, P_new, kf = forecast(kf=kf,
                                                    n_steps=len(y_vld),
                                                    H=H,
                                                    y=y_tr,
                                                    return_first_kf=True)

                # Save vld error
                vld_error[cv_count] = mean_squared_error(y_pred, y_vld)

        # Save mean and standard deviation of cross-validation error
        # (excluding NaNs)
        i, j = d_lambda[l2], d_sigma[s2]
        mean_vld_error[i, j] = np.nanmean(vld_error)
        std_vld_error[i, j] = np.nanstd(vld_error)

    # Get the optimal orders from the score that we want to optimize
    final_index = mean_vld_error
    i_opt, j_opt = np.argwhere(final_index == np.nanmin(mean_vld_error))[0]

    # Multiple returns
    ret = [lambda2_range[i_opt], sigma2_range[j_opt]]
    if return_mean_vld_error:
        ret.append(mean_vld_error)
    return ret
Esempio n. 25
0
        0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3,
        1.4, 1.5
    ],
    'node_to_node__bias_scaling': [0.0],
    'node_to_node__bi_directional': [False],
    'node_to_node__continuation': [True],
    'node_to_node__activation': ['tanh'],
    'node_to_node__wash_out': [0],
    'node_to_node__random_state': [42],
    'regressor__alpha': [1e-5],
    'random_state': [42]
}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)

ts_split = TimeSeriesSplit()
grid_search = GridSearchCV(ESNRegressor(),
                           cv=ts_split,
                           param_grid=param_grid,
                           scoring=scorer,
                           n_jobs=-1).fit(X=X_train, y=y_train)

print(grid_search.best_params_)

esn = grid_search.best_estimator_
# esn.set_params(**{'node_to_node__leakage': 0.1})
# esn.fit(X=X_train, y=y_train)
esn.predict(X=unit_impulse)
fig = plt.figure()
im = plt.imshow(np.abs(esn._node_to_node._hidden_layer_state[:, 1:].T),
                vmin=0,
Esempio n. 26
0
logging.debug('\n\n=== Bagging times =========')
bag_times = config['bagging_times']
logging.debug(bag_times)

logging.debug('\n\n=== random_seed_average times =========')
random_seed_average_times = config['random_seed_average_times']
logging.debug(random_seed_average_times)

logging.debug('\n\n=== N Folds =========')
n_fold = config['n_fold']
logging.debug(n_fold)

logging.debug('\n\n=== Folds Type =========')
folds_type = {
    'time_series': TimeSeriesSplit(n_fold),
    'k_fold': KFold(n_fold),
    'group_k_fold': GroupKFold(n_fold),
    'train_test_split_time_series': 'train_test_split_time_series'
}
folds = folds_type[config['folds_type']]
logging.debug(config['folds_type'])
if config['folds_type'] == 'group_k_fold':
    split_groups = train['DT_M']
else:
    split_groups = None

logging.debug('\n\n=== train shape =========')
logging.debug(train.shape)
print('train shape', train.shape)
Esempio n. 27
0
def hyper_params_search(df,
                        target_name,
                        scorer,
                        wrapper,
                        n_iter,
                        n_splits,
                        n_jobs,
                        verbose,
                        seed):
    """
    Use the dataframe 'df' to search for the best
    params for the model 'wrapper'.
    The CV split is performed using the TimeSeriesSplit
    class.
    We can define the size of the test set using the formula
    ``n_samples//(n_splits + 1)``,
    where ``n_samples`` is the number of samples. Hence,
    we can define
    n_splits = (n - test_size) // test_size
    :param df: train data
    :type df: pd.DataFrame
    :param wrapper: predictive model
    :type wrapper: sklearn model wrapper
    :param n_iter: number of hyperparameter searches
    :type n_iter: int
    :param n_splits: number of splits for the cross-validation
    :type n_splits: int
    :param n_jobs: number of concurrent workers
    :type n_jobs: int
    :param verbose: param to print iteration status
    :type verbose: bool, int
    :param target_name: name of the target column in 'df'
    :type target_name: str
    :return: the fitted search object
    :rtype: sklearn.model_selection.RandomizedSearchCV or GridSearchCV
    """

    X = df.drop(columns=[target_name]).values
    y = df[target_name].values

    time_split = TimeSeriesSplit(n_splits=n_splits)

    if wrapper.search_type == 'random':
        model_search = RandomizedSearchCV(estimator=wrapper.ModelClass,
                                          param_distributions=wrapper.param_grid,
                                          n_iter=n_iter,
                                          cv=time_split,
                                          verbose=verbose,
                                          n_jobs=n_jobs,
                                          scoring=scorer,
                                          random_state=seed)
    elif wrapper.search_type == 'grid':
        model_search = GridSearchCV(estimator=wrapper.ModelClass,
                                    param_grid=wrapper.param_grid,
                                    cv=time_split,
                                    verbose=verbose,
                                    n_jobs=n_jobs,
                                    scoring=scorer)
    else:
        raise Exception('search type method not registered')

    model_search = model_search.fit(y=y,
                                    X=X)

    return model_search
Esempio n. 28
0
def rf_gridcv(df,
              fld='Ex',
              pth='',
              name=None,
              fi_plts=False,
              test_size=None,
              newmodel=False,
              zave=False,
              err_metric='mae'):
    '''
    Grid search with cross validation
    Training and test splits are not random
    '''

    from preprocess import train_test_seq
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_absolute_error, mean_squared_error

    tst_sz = test_size if test_size is not None else 0.2
    totsz = df.shape[0]
    train_size = 1 - tst_sz

    title = name if name is not None else 'df'

    if zave:
        flds = ['Ex2l', 'Ey2l']
        # Specify the hyperparameter space
        parameters = {
            'rf__max_depth': [4, 8],
            'rf__max_features': ['auto', 'sqrt', None],
            'rf__min_samples_leaf': [4, 8, 16],
            'rf__n_estimators': [8, 16, 32, 64]
        }

    else:
        flds = ['Ex', 'Ey']
        # Specify the hyperparameter space
        parameters = {
            'rf__max_depth': [2, 4, 8],
            'rf__max_features': ['auto', 'sqrt', None],
            'rf__min_samples_leaf': [2, 4, 8],
            'rf__n_estimators': [4, 8, 16, 32]
        }

    if err_metric == 'mae':
        scoring = 'neg_mean_absolute_error'
    else:
        scoring = 'neg_mean_squared_error'

    X_train, X_test, y_train, y_test = train_test_seq(df.drop(flds, axis=1),
                                                      df[fld],
                                                      test_size=tst_sz)

    print("Test,train shapes:", X_test.shape, X_train.shape)

    # GRID SEARCH CV

    # Setup the pipeline steps: steps
    steps = [('rf',
              RandomForestRegressor(criterion=err_metric,
                                    bootstrap=False,
                                    random_state=42))]

    # Create the pipeline: pipeline
    pipeline = Pipeline(steps)

    # Use TimeSeriesSplit instead of the default random splits used by GridSearchCV
    my_cv = TimeSeriesSplit(n_splits=2).split(X_train)
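    # Note: .split(X_train) returns a one-shot generator, so `my_cv` is
    # consumed by a single GridSearchCV.fit call; recreate it (or pass the
    # TimeSeriesSplit object itself as cv) before fitting again.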

    # Create the GridSearchCV object: gm_cv
    gm_cv = GridSearchCV(pipeline,
                         parameters,
                         cv=my_cv,
                         verbose=True,
                         n_jobs=4,
                         scoring=scoring,
                         return_train_score=True)

    # Fit to the training set
    gm_cv.fit(X_train, y_train)

    # Print SCORES with deviations
    means = gm_cv.cv_results_['mean_test_score']
    stds = gm_cv.cv_results_['std_test_score']
    print(f"Means of CV folds: {means}")
    print(f"STDs of CV folds : {stds}")

    # https://github.com/amueller/COMS4995-s19/blob/master/slides/aml-08-trees-forests/aml-10.ipynb
    # Plot error vs various hyperparameters
    scores = pd.DataFrame(gm_cv.cv_results_)
    print(scores.head())

    plt.figure(0)

    scores.plot('param_rf__max_depth', 'mean_train_score')
    scores.plot('param_rf__max_depth', 'mean_test_score', ax=plt.gca())
    plt.fill_between(scores.param_rf__max_depth.astype(float),
                     scores['mean_train_score'] + scores['std_train_score'],
                     scores['mean_train_score'] - scores['std_train_score'],
                     alpha=0.2)
    plt.fill_between(scores.param_rf__max_depth.astype(float),
                     scores['mean_test_score'] + scores['std_test_score'],
                     scores['mean_test_score'] - scores['std_test_score'],
                     alpha=0.2)
    plt.legend()
    plt.savefig("rf_grid_max_depth.pdf", bbox_inches="tight")

    plt.figure(1)
    scores.plot(x='param_rf__max_depth',
                y='mean_train_score',
                yerr='std_train_score',
                ax=plt.gca())
    scores.plot(x='param_rf__max_depth',
                y='mean_test_score',
                yerr='std_test_score',
                ax=plt.gca())
    plt.savefig("rf_grid_max_depth_.pdf", bbox_inches="tight")

    # Plot error vs various hyperparameters
    plt.figure(2)
    scores.plot(x='param_rf__n_estimators',
                y='mean_train_score',
                yerr='std_train_score',
                ax=plt.gca())
    scores.plot(x='param_rf__n_estimators',
                y='mean_test_score',
                yerr='std_test_score',
                ax=plt.gca())
    plt.savefig("rf_grid_n_estimators.pdf", bbox_inches="tight")

    # Predict
    y_pred = gm_cv.predict(X_test)
    print(f"MAE: {mean_absolute_error(y_test,y_pred)}")

    # Compute and print the metrics
    print(f"Tuned RF params: {gm_cv.best_params_}")
    print(f"Tuned RF score:  {gm_cv.score(X_test, y_test)}")

    # FIT model to tuned parameters
    rf_mod = RandomForestRegressor(
        criterion=err_metric,
        bootstrap=False,
        max_depth=gm_cv.best_params_['rf__max_depth'],
        max_features=gm_cv.best_params_['rf__max_features'],
        min_samples_leaf=gm_cv.best_params_['rf__min_samples_leaf'],
        n_estimators=gm_cv.best_params_['rf__n_estimators'],
        random_state=42)  # fix random state for reproducibility
    rf_mod.fit(X_train, y_train)
    y_pred = rf_mod.predict(X_test)

    # BEST FIT model - save
    from joblib import dump, load
    dump(rf_mod, 'bst_rf.joblib')
    # LOAD like this:
    # bst_rf = load('bst_rf.joblib')

    print(f"MAE: {mean_absolute_error(y_pred,y_test)}")
    print(f"R^2: {rf_mod.score(X_test,y_test)}")

    # PLOT importances
    print('Best model important features: SKLEARN')
    print(rf_mod.feature_importances_)
    fi = rf_imp(rf_mod, df.drop(flds, axis=1))
    plot_rf_imp(fi)

    if fi_plts:
        fiplt = fi.plot('Features',
                        'Importance',
                        'barh',
                        figsize=(12, 7),
                        legend=False)
        fig = fiplt.get_figure()
        fig.savefig(pth + str(title) + '_fi_skl.pdf', bbox_inches='tight')

    return y_train, y_pred, y_test  #,rf_mod
Esempio n. 29
0
def get_RandSearchCV(X_train, y_train, X_test, y_test, scoring, type_search,
                     output_file):
    from sklearn.model_selection import TimeSeriesSplit
    from datetime import datetime as dt
    st_t = dt.now()
    # Number of trees to use
    n_estimators = [5, 10, 50, 100, 150, 200, 250, 300]
    #n_estimators = list(np.arange(100,1000,50))
    #n_estimators = [1000]

    # Maximum depth of each tree
    max_depth = [5, 10, 25, 50, 75, 100]

    # Minimum number of samples per leaf
    min_samples_leaf = [1, 2, 4, 8, 10]

    # Minimum number of samples to split a node
    min_samples_split = [2, 4, 6, 8, 10]

    # Maximum number of features to consider when making splits
    max_features = ["auto", "sqrt", "log2", None]

    hyperparameter = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'max_features': max_features
    }

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)
    base_model_rf = RandomForestClassifier(criterion="gini", random_state=42)
    base_model_gb = GradientBoostingClassifier(criterion="friedman_mse",
                                               random_state=42)

    # Run randomized search
    n_iter_search = 30
    if type_search == "RandomSearchCV-RandomForest":
        rsearch_cv = RandomizedSearchCV(estimator=base_model_rf,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)
    elif type_search == "RandomSearchCV-GradientBoosting":
        rsearch_cv = RandomizedSearchCV(estimator=base_model_gb,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)
    else:
        raise ValueError("Unknown type_search: " + type_search)

    rsearch_cv.fit(X_train, y_train)
    #f = open("output.txt", "a")
    print("Best estimator obtained from CV data: \n",
          rsearch_cv.best_estimator_,
          file=output_file)
    print("Best Score: ", rsearch_cv.best_score_, file=output_file)
    return rsearch_cv
Esempio n. 30
0
Y_trainval, Y_test = Y[trainval_idx], Y[test_idx]
X_trainval, X_test = np.array(X.iloc[trainval_idx].tolist()), np.array(
    X.iloc[test_idx].tolist())

# AUC per political party
auc_parties = []
best_C = []
for partyname in partynames:

    y_trainval = [1 if x == partyname else 0 for x in Y_trainval]
    y_test = [1 if x == partyname else 0 for x in Y_test]

    # grid search LR
    param_search = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    model = LogisticRegression(penalty='l2', max_iter=7600)
    my_cv = TimeSeriesSplit(n_splits=5).split(X_trainval)
    gsearch = GridSearchCV(estimator=model,
                           cv=my_cv,
                           param_grid=param_search,
                           scoring='roc_auc')
    gsearch.fit(X_trainval, y_trainval)
    best_C.append(gsearch.best_params_)

    # prediction on test set
    # Call predict_proba on the estimator with the best found parameters.
    pred = gsearch.predict_proba(X_test)
    score = roc_auc_score(y_test, pred[:, 1])

    auc_parties.append(score)
Esempio n. 31
0
def get_splits(X):
    splits = []
    tscv = TimeSeriesSplit(n_splits=3)
    for train_index, test_index in tscv.split(X):
        splits.append((train_index, test_index))
    return splits
Esempio n. 32
0
              'lag_attribute9']

train_val_sample.dropna(inplace=True)
train_val_sample.reset_index(drop=True,inplace=True)
testing_sample.dropna(inplace=True)
X_train = train_val_sample.drop(['y', 'date', 'device'] + removal_list, axis=1)
y_train=train_val_sample['y'].astype(int)
X_test = testing_sample.drop(['y', 'date', 'device'] + removal_list, axis=1)
y_test=testing_sample['y'].astype(int)

y_train.value_counts()

# I create 3 training samples and 3 validation samples.


tscv=TimeSeriesSplit(n_splits=3)
print(tscv)

for train,test in tscv.split(X_train):
    print('%s %s' %(train,test))
    
################################################
#fit the model

#Cross validation and hyper-parameter search


print('running cross validation')

########################################
#XGBoost
Esempio n. 33
0
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

# **Save train targets into a separate vector.**

# In[ ]:

y_train = train_df['target'].astype('int').values

# **We'll be performing time series cross-validation, see `sklearn` [TimeSeriesSplit](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html) and [this discussion](https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection) on StackOverflow.**

# In[ ]:

time_split = TimeSeriesSplit(n_splits=10)

# <img src="https://habrastorage.org/webt/8i/5k/vx/8i5kvxrehatyvf-l3glz_-ymhtw.png" />

# In[ ]:

[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

# **Perform time series cross-validation with logistic regression.**

# In[ ]:

logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

# In[ ]:
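
# A minimal sketch of the announced cross-validation step, assuming the
# standard cross_val_score API with the time-aware splitter defined above
# (the exact scoring metric in the original notebook may differ; ROC AUC is
# the usual choice for this binary task):
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split,
                            scoring='roc_auc', n_jobs=-1)
print(cv_scores, cv_scores.mean())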
Esempio n. 34
0
def ComputePermutationImportance(df_trn, use_columns, target, clf, eval_func):
    n_splits = 3
    n_runs = 5

    data = df_trn[use_columns]
    # 'target' is the label column name passed in; the original snippet
    # referenced an undefined name here
    target = df_trn[target]

    imp_df = pd.DataFrame(np.ones((len(use_columns), n_splits * n_runs)),
                          index=use_columns)
    np.random.seed(9385610)
    idx = np.arange(len(target))
    for run in range(n_runs):
        # Shuffle target
        np.random.shuffle(idx)
        perm_target = target.iloc[idx]
        # Create a new split
        folds = TimeSeriesSplit(n_splits)
        # folds = StratifiedKFold(n_splits, shuffle=True, random_state=None)
        oof = np.empty(len(df_trn))

        for fold_, (trn_idx,
                    val_idx) in enumerate(folds.split(perm_target,
                                                      perm_target)):
            msg = "\rCompute permutation importance - run %d, fold %d ...      " % (
                run + 1, fold_ + 1)
            sys.stdout.write(msg)
            sys.stdout.flush()

            trn_dat, trn_tgt = data.iloc[trn_idx], perm_target.iloc[trn_idx]
            val_dat, val_tgt = data.iloc[val_idx], perm_target.iloc[val_idx]
            # Train classifier
            clf.fit(trn_dat, trn_tgt)
            # Keep feature importances for this fold and run
            fscore = clf.booster().get_score(importance_type='gain')
            fea = fscore.keys()
            imp = fscore.values()
            imp_df.loc[fea, n_splits * run + fold_] = imp
            # Update OOF for gini score display
            oof[val_idx] = clf.predict(val_dat)

        sys.stdout.write("done.\n")
        print("Run %2d OOF score : %.6f" % (run, eval_func(perm_target, oof)))

    bench_imp_df = pd.DataFrame(np.ones((len(use_columns), n_splits * n_runs)),
                                index=use_columns)
    idx = np.arange(len(target))
    n_choice = int(len(idx) * 0.8)
    for run in range(n_runs):
        # Shuffle target
        choice_idx = np.random.choice(idx, n_choice)
        perm_target = target.iloc[choice_idx]
        perm_data = data.iloc[choice_idx]

        # Create a new split
        folds = TimeSeriesSplit(n_splits)
        oof = np.empty(len(df_trn))

        for fold_, (trn_idx,
                    val_idx) in enumerate(folds.split(perm_target,
                                                      perm_target)):
            msg = "\rCompute bench importance - run %d, fold %d ...      " % (
                run + 1, fold_ + 1)
            sys.stdout.write(msg)
            sys.stdout.flush()

            trn_dat, trn_tgt = data.iloc[trn_idx], target.iloc[trn_idx]
            val_dat, val_tgt = data.iloc[val_idx], target.iloc[val_idx]
            # Train classifier
            clf.fit(trn_dat, trn_tgt)
            # Keep feature importances for this fold and run
            fscore = clf.booster().get_score(importance_type='gain')
            fea = fscore.keys()
            imp = fscore.values()
            bench_imp_df.loc[fea, n_splits * run + fold_] = imp
            # Update OOF for gini score display
            oof[val_idx] = clf.predict(val_dat)

        sys.stdout.write('done.\n')
        print("Run %2d OOF score : %.6f" % (run, eval_func(perm_target, oof)))

    bench_mean = bench_imp_df.mean(axis=1)
    perm_mean = imp_df.mean(axis=1)

    pvalues = pd.concat([bench_mean, perm_mean], axis=1).reset_index()
    pvalues.columns = ['feature', 'benchmark', 'permutation']
    pvalues['ratio'] = pvalues.benchmark / pvalues.permutation
    pvalues.sort_values(by='ratio', ascending=False, inplace=True)

    print("%-60s | benchmark | permutation | Ratio" % "Feature")
    for f, b, p, r in pvalues.values:
        print("%-60s |   %7.1f |     %7.1f |   %7.1f" % (f, b, p, r))

    return pvalues
Esempio n. 35
0
def arima_gridsearch_cv(series, cv_splits=2, verbose=True, show_plots=True):
    # prepare train-test split object
    tscv = TimeSeriesSplit(n_splits=cv_splits)
    
    # initialize variables
    splits = []
    best_models = []
    all_models = []
    i = 1
    
    # loop through each CV split
    for train_index, test_index in tscv.split(series):
        print("*"*20)
        print("Iteration {} of {}".format(i,cv_splits))
        i = i + 1
        
        # print train and test indices
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        splits.append({'train':train_index,'test':test_index})
        
        # split train and test sets
        train_series = series.iloc[train_index]
        test_series = series.iloc[test_index]
        
        print("Train shape:{}, Test shape:{}".format(train_series.shape,
              test_series.shape))
        
        # perform auto arima
        _best_model, _all_models = auto_arima(series=train_series)
        best_models.append(_best_model)
        all_models.append(_all_models)
        
        # display summary for best fitting model
        if verbose:
            print(_best_model['model_obj'].summary())
        results = _best_model['model_obj']
        
        if show_plots:
            # show residual plots
            residuals = pd.DataFrame(results.resid)
            residuals.plot()
            plt.title('Residual Plot')
            plt.show()
            residuals.plot(kind='kde')
            plt.title('KDE Plot')
            plt.show()
            print(residuals.describe())
        
            # show forecast plot
            fig, ax = plt.subplots(figsize=(18, 4))
            fig.autofmt_xdate()
            ax = train_series.plot(ax=ax)
            test_series.plot(ax=ax)
            fig = results.plot_predict(test_series.index.min(), 
                                       test_series.index.max(), 
                                       dynamic=True,ax=ax,
                                       plot_insample=False)
            plt.title('Forecast Plot ')
            plt.legend()
            plt.show()

            # show error plot
            insample_fit = list(results.predict(train_series.index.min()+1, 
                                                train_series.index.max(),
                                                typ='levels')) 
            plt.plot((np.exp(train_series.iloc[1:].tolist()) -
                      np.exp(insample_fit)))
            plt.title('Error Plot')
            plt.show()
    return {'cv_split_index':splits,
            'all_models':all_models,
            'best_models':best_models}
Esempio n. 36
0
                    DataX1.append(Data)
                    DataY1.append(dataY[seq][0][pixel_i][pixel_j])

    return DataX1, DataY1, pixInd_X


#########################################################################################
##########################################################################################

seqLengthArr = [12]
monthAhead = [0]
features = [[11], [0], [0, 11], [10, 11], [0, 11, 1, 2], [0, 11, 1, 10]]

# features = [[0, 11]]
scv = TimeSeriesSplit(n_splits=3)
#######################################
param_grid = {
    'n_neighbors': [3, 5, 6, 7, 8, 9, 10],
    'leaf_size': [1, 2, 3, 5],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto'],
    'n_jobs': [-1]
}

##########################################

# scorer = make_scorer(mean_squared_error, greater_is_better=False)
scaler = StandardScaler()

estimator = KNeighborsRegressor()