def fit_predict(self, train, val=None, test=None, **kwa):
    model = QuantReg(train[1], train[0]).fit(q=0.5, max_iter=10000)

    if val is None:
        return model.predict(test[0])
    else:
        return model.predict(val[0]), model.predict(test[0])
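A minimal usage sketch for this wrapper. The enclosing class is not shown in the snippet, so MedianRegressor below is a hypothetical name for it; each data split is passed as an (exog, endog) tuple, matching the train[0]/train[1] access above.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(size=200)

# Splits follow the (exog, endog) tuple convention expected by fit_predict
train, val, test = (X[:100], y[:100]), (X[100:150], y[100:150]), (X[150:], y[150:])

model = MedianRegressor()  # hypothetical class that owns fit_predict
val_pred, test_pred = model.fit_predict(train, val=val, test=test)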
Example #3
def test_fitted_residuals():
    data = sm.datasets.engel.load_pandas().data
    y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
    res = QuantReg(y, X).fit(q=.1)
    # Note: maxabs relative error with fitted is 1.789e-09
    assert_almost_equal(np.array(res.fittedvalues), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.predict()), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.resid), Rquantreg.residuals, 5)
Example #4
def train_LAD(x, y):
    """
    训练LAD线性回归模型,并返回模型预测值
    """
    X = sm.add_constant(x)
    model = QuantReg(y, X)
    model = model.fit(q=0.5)
    re = model.predict(X)
    return re
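A hedged usage sketch for train_LAD on synthetic heavy-tailed data; sm (statsmodels.api) and QuantReg are assumed imported as in the surrounding snippets.

import numpy as np

rng = np.random.default_rng(42)
x = rng.uniform(0, 10, size=300)
y = 2.0 * x + 1.0 + rng.standard_t(df=3, size=300)  # heavy-tailed noise, a setting where LAD is more robust than OLS

fitted_median = train_LAD(x, y)  # in-sample conditional-median predictions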
Example #5
class SkQuantReg:
    def __init__(self, tau):
        self.tau = tau

    def fit(self, X, y):
        self.m = QuantReg(y, X).fit(q=self.tau)
        return self

    def predict(self, X):
        return self.m.predict(X)
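Because SkQuantReg exposes fit/predict, it drops into scikit-learn-style code. One caveat worth illustrating: QuantReg fits no intercept unless the design matrix carries a constant column, so this sketch adds one explicitly with sm.add_constant (statsmodels.api assumed imported as sm).

import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = sm.add_constant(rng.normal(size=(500, 2)))  # prepend an explicit intercept column
y = X @ np.array([0.5, 1.0, -1.0]) + rng.normal(size=500)

q90 = SkQuantReg(tau=0.9).fit(X[:400], y[:400])
upper_band = q90.predict(X[400:])  # conditional 90th-percentile estimates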
Example #6
def train_predict_stacking_linear_regression(df_learning, df_prod,
                                             l_tuple_strategy_normalised):
    for quantile in constants.LIST_QUANTILE:
        to_keep = []
        for strategy, normalize_by in l_tuple_strategy_normalised:
            str_normalized = '_normed_by_' + normalize_by if normalize_by is not None else ''
            to_keep.append('{}{}_quantile_{:.3f}'.format(
                strategy, str_normalized, quantile))

        # Drop columns containing any missing values
        to_keep = df_learning[to_keep].notnull().all()
        to_keep = to_keep[to_keep].index.tolist()

        # We need to remove constant columns from the sampled data
        df_learning_weighted = df_learning.sample(10000,
                                                  weights='weight',
                                                  replace=True,
                                                  random_state=1)

        # Remove constant columns
        cols_constants = df_learning_weighted[to_keep].std() == 0
        cols_constants = cols_constants[cols_constants].index.tolist()
        for col in cols_constants:
            to_keep.remove(col)

        # # Remove correlated features
        # # Create correlation matrix
        # corr_matrix = df_learning[to_keep].corr().abs().fillna(1)

        # # Select upper triangle of correlation matrix
        # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # # Find index of feature columns with correlation greater than 0.95
        # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        # to_keep.remove(to_drop)

        # Drop duplicate columns
        def getDuplicateColumns(df):
            '''
            Get a list of duplicate columns.
            Iterates over all columns in the dataframe and finds those whose contents
            duplicate an earlier column.
            :param df: Dataframe object
            :return: List of columns whose contents are duplicates.
            '''
            duplicateColumnNames = set()
            # Iterate over all the columns in dataframe
            for x in range(df.shape[1]):
                # Select column at xth index.
                col = df.iloc[:, x]
                # Iterate over all the columns in DataFrame from (x+1)th index till end
                for y in range(x + 1, df.shape[1]):
                    # Select column at yth index.
                    otherCol = df.iloc[:, y]
                    # Check if the columns at index x and y are equal
                    if col.equals(otherCol):
                        duplicateColumnNames.add(df.columns.values[y])

            return list(duplicateColumnNames)

        cols_duplicate = getDuplicateColumns(df_learning_weighted[to_keep])
        for cols in cols_duplicate:
            to_keep.remove(cols)

        # to_keep = df_learning_weighted[to_keep].T.drop_duplicates().T.columns  # Not efficient but ok

        X_learning_weighted = df_learning_weighted[to_keep].fillna(0)
        X_learning = df_learning[to_keep].fillna(0)
        X_prod = df_prod[to_keep].fillna(0)

        y_learning_weighted = df_learning_weighted['sales']
        # weight_learning = df_learning['weight']
        if X_learning_weighted.nunique().max() != 1:
            linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
            linear_model = linear_model.fit(q=quantile)
            # print(linear_model.summary())
            df_learning['quantile_{:.3f}'.format(
                quantile)] = linear_model.predict(X_learning)
            df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
                X_prod)
        else:
            df_learning['quantile_{:.3f}'.format(quantile)] = 0
            df_prod['quantile_{:.3f}'.format(quantile)] = 0

    return df_learning, df_prod
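The pairwise scan in getDuplicateColumns is quadratic in the number of columns; the commented-out transpose trick above points at a vectorized equivalent. A sketch of that alternative, assuming the column contents are hashable:

import pandas as pd

def get_duplicate_columns_fast(df):
    # duplicated() on the transpose marks every column identical to an earlier one
    return df.columns[df.T.duplicated()].tolist()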
Example #7
def train_predict_lgb_tweedie(df_learning, df_prod, verbose_eval=75):
    """
    Args :
    - df_learning
    - df_prod

    Returns:
    - df_valid with quantile prediction and pinball loss
    - df_prod with quantile prediction
    """
    (
        df_learning,
        df_train,
        df_valid,
        df_valid_oof,
        X_learning,
        X_train,
        X_valid,
        X_valid_oof,
        X_prod,
        y_learning,
        y_train,
        y_valid,
        y_valid_oof,
        weight_learning,
        weight_train,
        weight_valid,
        weight_valid_oof,
        lgb_learning,
        lgb_train,
        lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='tweedie', dataset_nrows=df_learning.shape[0])
    col_predict = 'pred'

    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    from statsmodels.regression.quantile_regression import QuantReg

    df_learning_weighted = df_learning.sample(100000,
                                              weights='weight',
                                              replace=True)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg has no weight parameter, so we manually reweight our dataset by weighted resampling
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)

    return df_valid_oof, df_prod
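As the in-loop comment notes, QuantReg accepts no per-row weights, so the code emulates them by resampling rows with probability proportional to the 'weight' column. A standalone sketch of that reweighting idea (the function name and arguments are illustrative, not from the original):

import pandas as pd
from statsmodels.regression.quantile_regression import QuantReg

def fit_weighted_quantreg(df, feature_cols, target_col, q, n_draws=100000, seed=1):
    # A weighted bootstrap stands in for sample weights: heavier rows are drawn more often
    boot = df.sample(n_draws, weights='weight', replace=True, random_state=seed)
    return QuantReg(boot[target_col], boot[feature_cols]).fit(q=q)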
Example #8
def train_predict_lgb_point_to_uncertainity(df_learning, df_prod,
                                            verbose_eval):
    """
    Args :
    - df_learning
    - df_prod

    Returns:
    - df_valid with quantile prediction and pinball loss
    - df_prod with quantile prediction
    """
    (
        df_learning,
        df_train,
        df_valid,
        df_valid_oof,
        X_learning,
        X_train,
        X_valid,
        X_valid_oof,
        X_prod,
        y_learning,
        y_train,
        y_valid,
        y_valid_oof,
        weight_learning,
        weight_train,
        weight_valid,
        weight_valid_oof,
        lgb_learning,
        lgb_train,
        lgb_valid,
    ) = prepare_data(df_learning, df_prod)

    param, num_boost_round, early_stopping_rounds = get_lgb_params(
        objective='regression', dataset_nrows=df_learning.shape[0])
    col_predict = 'pred'

    df_learning_pred, df_valid_pred, df_valid_oof, df_prod = train_predict_lgb(
        df_learning, df_valid, X_learning, X_valid, df_valid_oof, df_prod,
        X_valid_oof, X_prod, lgb_train, lgb_valid, lgb_learning, param,
        num_boost_round, early_stopping_rounds, verbose_eval, col_predict)

    df_learning_weighted = pd.concat([df_valid_oof,
                                      df_valid_pred]).sample(100000,
                                                             weights='weight',
                                                             replace=True,
                                                             random_state=1)
    # If we fit QuantReg on overfitted predictions, it underestimates the safety margin needed
    # df_learning_weighted = df_learning.sample(100000, weights='weight', replace=True, random_state=1)

    to_keep = ['pred', 'horizon']
    X_learning_weighted = df_learning_weighted[to_keep]
    X_learning = df_learning[to_keep]
    X_valid_oof = df_valid_oof[to_keep]
    X_prod = df_prod[to_keep]
    # y_learning = df_learning['sales']
    y_learning_weighted = df_learning_weighted['sales']

    for quantile in constants.LIST_QUANTILE:
        # QuantReg has no weight parameter, so we manually reweight our dataset by weighted resampling
        linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
        linear_model = linear_model.fit(q=quantile)
        # print(linear_model.summary())
        df_learning['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_learning)
        df_valid_oof['quantile_{:.3f}'.format(
            quantile)] = linear_model.predict(X_valid_oof)
        df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
            X_prod)

    df_valid_oof = prep.compute_pinball(df_valid_oof)

    return df_valid_oof, df_prod
Example #9

train_y = Dataset.load_part('train', 'loss')
train_x = pd.read_csv('preds/%s-train.csv' % pred_name)['loss'].values

orig_maes = []
corr_maes = []

for fold, (fold_train_idx, fold_eval_idx) in enumerate(
        KFold(len(train_y), n_folds, shuffle=True, random_state=2016)):
    fold_train_x = train_x[fold_train_idx]
    fold_train_y = train_y[fold_train_idx]

    fold_eval_x = train_x[fold_eval_idx]
    fold_eval_y = train_y[fold_eval_idx]

    model = QuantReg(fold_train_y, fold_train_x).fit(q=0.5)

    fold_eval_p = model.predict(fold_eval_x)

    orig_maes.append(mean_absolute_error(fold_eval_y, fold_eval_x))
    corr_maes.append(mean_absolute_error(fold_eval_y, fold_eval_p))

    print("Fold %d, orig MAE = %.5f, corr MAE = %.5f" %
          (fold, orig_maes[-1], corr_maes[-1]))

print()
print("Avg orig MAE = %.5f" % np.mean(orig_maes))
print("Avg corr MAE = %.5f" % np.mean(corr_maes))

print("Done.")
Example #10
class ForecastModelQR(ForecastModelBase):
    """
    QR预报模型
    """

    def constructModel(self):
        """
        QR比较特殊,无需构造模型,或者说它构造模型和训练是同时完成的,所以实现均在fit()方法中
        :return:
        """
        pass

    def fit(self):
        optimizedHyperParameters = self.optimizedHyperParameters
        fixedHyperParameters = self.fixedHyperParameters

        kernelName = optimizedHyperParameters["kernelName"]
        trainX, trainY, validationX, validationY = self.dataset.getDataset(2)
        self.model = QuantReg(trainY, trainX)

    def predict(self, validationX=None, isFlatten=False):
        if validationX is None:
            validationX = self.dataset.validationX
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]
        results = self.model.fit(q=0.5, kernel=kernelName)
        predictions = self.model.predict(params=results.params, exog=validationX)
        if isFlatten:
            predictions = predictions.flatten()
        self.dataset.validationD = predictions
        return predictions

    def getOptimizedHyperParametersRange(self):
        optimizedHyperParametersRange = {
            "kernelName": hp.choice("kernelName", ['epa', 'cos', 'gau', 'par']),
        }
        return optimizedHyperParametersRange

    def getDefaultOptimizedHyperParameters(self):
        optimizedHyperParameters = dict()
        # Kernel function name
        optimizedHyperParameters["kernelName"] = "epa"
        return optimizedHyperParameters

    def getDefaultFixedHyperParameters(self):
        fixedHyperParameters = dict()
        return fixedHyperParameters

    def getProbabilisticResults(self, probabilisticForecastModelParams, validationX=None):
        if validationX is None:
            validationX = self.dataset.validationX
        validSampleNum = validationX.shape[0]
        optimizedHyperParameters = self.optimizedHyperParameters
        kernelName = optimizedHyperParameters["kernelName"]

        # From 0 to 1 in steps of 0.001: exactly 1001 points
        F = np.arange(0, 1.001, 0.001)
        predictions = np.zeros(shape=(validSampleNum, len(F)))
        for i in range(len(F)):
            q = F[i]
            if 0 < q < 1:
                results = self.model.fit(q=q, kernel=kernelName)
                prediction = self.model.predict(params=results.params, exog=validationX)
                predictions[:, i] = prediction.T
        predictions[:, 0] = 2 * predictions[:, 1] - predictions[:, 2]
        predictions[:, -1] = 2 * predictions[:, -2] - predictions[:, -3]
        predictions.sort(axis=1)
        pdfs = []
        cdfs = []
        for i in range(validSampleNum):
            # From 0 to 1 in steps of 0.001: exactly 1001 points
            x = predictions[i, :]
            x = self.dataset.reverseLabel(x)
            c = dict()
            c["x"] = x
            c["F"] = F
            cdfs.append(c)

            # Going from a known PDF to the CDF is a deterministic integration; recovering the
            # PDF from the CDF when the PDF's functional form is unknown depends on the
            # assumptions of the chosen method.
            # Using the area definition: with densely spaced points the area can be approximated
            # by small trapezoids or small rectangles, but different assumptions yield very
            # different PDFs.
            # Alternatively, draw many samples according to the CDF and apply kernel density
            # estimation; either way, the result depends on the assumption made.

            # Method 1: area definition assuming small rectangles; this is the method recommended in this pipeline
            xNew = np.linspace(x.min(), x.max(), len(x))
            y = MathInterpolateUtils.interp1d(x, F, xNew, kind="slinear")
            f = np.zeros(shape=x.shape)
            for j in range(1, len(f)):
                f[j] = (y[j] - y[j - 1]) / (xNew[j] - xNew[j - 1])
            x = xNew

            # Method 2: area definition assuming small trapezoids
            # f = np.zeros(shape=x.shape)
            # for j in range(1, len(F)):
            #     f[j] = 2 * (F[j] - F[j - 1]) / (x[j] - x[j - 1]) - f[j - 1]

            # Method 3: kernel density estimation
            # First generate uniformly distributed samples for the CDF; since the computed
            # quantiles are already uniformly spaced, the corresponding x values can be used
            # directly for the estimate.
            # Method 3 is slow. Apart from displaying the PDF for a few individual periods,
            # this pipeline works almost entirely with the CDF rather than the PDF, so method 3
            # is not recommended here; it is only used in the dedicated PDF display service.
            # paramGrid = {'bandwidth': np.arange(0, 5, 0.5)}
            # kde = KernelDensity(kernel='epanechnikov')
            # kdeGrid = GridSearchCV(estimator=kde, param_grid=paramGrid, cv=3)
            # kde = kdeGrid.fit(x.reshape(-1, 1)).best_estimator_
            # logDens = kde.score_samples(x.reshape(-1, 1))
            # f = np.exp(logDens)

            p = dict()
            p["x"] = x
            p["f"] = f
            pdfs.append(p)
        probabilisticResults = {
            "pdfs": np.array(pdfs),
            "cdfs": np.array(cdfs)
        }
        self.dataset.validationP = probabilisticResults
        return probabilisticResults
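getProbabilisticResults builds the conditional CDF by fitting QuantReg over a dense quantile grid and then differentiating it numerically (method 1 above). A compact standalone sketch of the same idea, independent of ForecastModelBase; the names are illustrative, and distinct quantile estimates are assumed so the finite difference is well defined:

import numpy as np
from statsmodels.regression.quantile_regression import QuantReg

def quantile_grid_cdf_pdf(y, X, x_new, taus=np.linspace(0.01, 0.99, 99)):
    # x_new: a single-row 2-D array of regressors
    model = QuantReg(y, X)
    # Each fitted quantile contributes one point of the conditional CDF at x_new
    x_grid = np.array([model.fit(q=t).predict(x_new)[0] for t in taus])
    order = np.argsort(x_grid)                 # enforce a monotone CDF
    x_grid, f_grid = x_grid[order], taus[order]
    pdf = np.gradient(f_grid, x_grid)          # dF/dx approximates the density
    return x_grid, f_grid, pdf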