def test_zero_resid():
    # smoke and regression tests

    X = np.array([[1, 0], [0, 1], [0, 2.1], [0, 3.1]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    res = QuantReg(y, X).fit(0.5, bandwidth='chamberlain')  # alternative: bandwidth='bofinger'
    res.summary()

    assert_allclose(res.params, np.array([0.0,  0.96774163]), rtol=1e-4, atol=1e-20)
    assert_allclose(res.bse, np.array([0.0447576, 0.01154867]), rtol=1e-4, atol=1e-20)
    assert_allclose(res.resid, np.array([0.0,  3.22583680e-02,  -3.22574272e-02,
         9.40732912e-07]), rtol=1e-4, atol=1e-20)


    X = np.array([[1, 0], [0.1, 1], [0, 2.1], [0, 3.1]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    res = QuantReg(y, X).fit(0.5, bandwidth='chamberlain')
    res.summary()

    assert_allclose(res.params, np.array([9.99982796e-08, 9.67741630e-01]),
                    rtol=1e-4, atol=1e-20)
    assert_allclose(res.bse, np.array([0.04455029, 0.01155251]), rtol=1e-4, atol=1e-20)
    assert_allclose(res.resid, np.array([-9.99982796e-08, 3.22583598e-02,
                            -3.22574234e-02, 9.46361860e-07]), rtol=1e-4, atol=1e-20)
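
A minimal sketch (not from the original tests; synthetic data) of the three
bandwidth rules QuantReg.fit accepts for its sparsity estimate:

import numpy as np
from statsmodels.regression.quantile_regression import QuantReg

rng = np.random.default_rng(0)
x_demo = np.column_stack([np.ones(50), rng.normal(size=50)])
y_demo = x_demo @ np.array([1.0, 2.0]) + rng.normal(size=50)
for bw in ('bofinger', 'chamberlain', 'hsheather'):
    # the point estimates are unchanged; only the standard errors move
    print(bw, QuantReg(y_demo, x_demo).fit(q=0.5, bandwidth=bw).bse)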
Example #2
    def train(self, data, **kwargs):
        if self.indexer is not None and isinstance(data, pd.DataFrame):
            data = self.indexer.get_data(data)

        lagdata, ndata = lagmat(data, maxlag=self.order, trim="both", original='sep')

        mqt = QuantReg(ndata, lagdata).fit(0.5)
        if self.alpha is not None:
            uqt = QuantReg(ndata, lagdata).fit(1 - self.alpha)
            lqt = QuantReg(ndata, lagdata).fit(self.alpha)

        self.mean_qt = list(mqt.params)
        if self.alpha is not None:
            self.upper_qt = list(uqt.params)
            self.lower_qt = list(lqt.params)

        if self.dist:
            self.dist_qt = []
            for alpha in np.arange(0.05, 0.5, 0.05):
                lqt = QuantReg(ndata, lagdata).fit(alpha)
                uqt = QuantReg(ndata, lagdata).fit(1 - alpha)
                self.dist_qt.append([list(lqt.params), list(uqt.params)])

        self.shortname = "QAR(" + str(self.order) + ") - " + str(self.alpha)
Example #3
def forecaster(returns, ff, loss='MSE'):

    output = []
    dates = sorted(list(ff.index))
    dataset = ff.merge(returns, left_index=True, right_index=True)
    columnNames = ['MktPremium', 'HML', 'Mom']
    name = returns.columns.tolist()[0]

    i = dates.index('200201')

    for j in range(i, (len(dates))):
        trainData = dataset.loc['199801':dates[j], :]
        trainX = trainData[columnNames]
        trainY = trainData[[name]]
        if loss == 'MSE':
            model = LinearRegression()
        elif loss == 'Ridge':
            model = Ridge()
        elif loss == 'Lasso':
            model = Lasso()
        elif loss == 'Hub':
            model = HuberRegressor()
        elif loss == 'ElasticNet':
            model = ElasticNet()
        else:
            model = LinearRegression()
        model.fit(trainX, trainY)
        testData = pd.DataFrame(dataset.loc[dates[j], :]).T
        testX = testData[columnNames]
        prediction = model.predict(testX)
        if loss == 'LAD':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.5)
            prediction = model.predict(res.params, exog=testX)
        elif loss == '1Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.25)
            prediction = model.predict(res.params, exog=testX)
        elif loss == '3Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.75)
            prediction = model.predict(res.params, exog=testX)

        if loss in ['Lasso', 'Hub', 'ElasticNet', 'LAD', '1Q', '3Q']:
            output.append(prediction[0])
        else:
            output.append(prediction[0][0])

    return (name, output)
Example #4
    def fit_predict(self, train, val=None, test=None, **kwa):
        model = QuantReg(train[1], train[0]).fit(q=0.5, max_iter=10000)

        if val is None:
            return model.predict(test[0])
        else:
            return model.predict(val[0]), model.predict(test[0])
Example #5
def calcuTD(x, O, Y, SeqDepth, Grid, Tau):
    TauGroup, D = Grid[x]
    D = int(D)

    try:
        polyX, centre, scale, alpha, beta = poly.poly(O, D)
    except Exception:
        polyX = None

    if polyX is not None:
        colVars = ['var_' + str(j) for j in range(D)]
        polydata = pd.concat([pd.DataFrame({'Y':Y}), pd.DataFrame(polyX, columns=colVars)], axis=1)
        try:
            rqfit = smf.quantreg('Y~' + '+'.join(colVars), polydata).fit(q=TauGroup)
            revX = poly.predict_poly(polyX, centre, scale, alpha, beta, SeqDepth)
            revX = pd.DataFrame(revX, columns=colVars)
            pdvalsrq = rqfit.predict(revX)

            # -50 is used as a sentinel slope meaning "fit failed or the
            # predicted values were not strictly positive"
            if min(pdvalsrq) > 0:
                S = QuantReg(pdvalsrq.values, tools.add_constant(SeqDepth)).fit(q=Tau).params[1]
            else:
                S = -50
        except Exception:
            S = -50
    else:
        S = -50
    return S
Example #6
def test_collinear_matrix():
    X = np.array([[1, 0, .5], [1, 0, .8],
                  [1, 0, 1.5], [1, 0, .25]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    res_collinear = QuantReg(y, X).fit(0.5)
    assert len(res_collinear.params) == X.shape[1]
Example #7
    def fit(self):
        optimizedHyperParameters = self.optimizedHyperParameters
        fixedHyperParameters = self.fixedHyperParameters

        kernelName = optimizedHyperParameters["kernelName"]
        trainX, trainY, validationX, validationY = self.dataset.getDataset(2)
        self.model = QuantReg(trainY, trainX)
Example #8
def test_fitted_residuals():
    data = sm.datasets.engel.load_pandas().data
    y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
    res = QuantReg(y, X).fit(q=.1)
    # Note: maxabs relative error with fitted is 1.789e-09
    assert_almost_equal(np.array(res.fittedvalues), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.predict()), Rquantreg.fittedvalues, 5)
    assert_almost_equal(np.array(res.resid), Rquantreg.residuals, 5)
Example #9
    def setup_class(cls):
        data = sm.datasets.engel.load_pandas().data
        y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
        cls.res1 = QuantReg(y, X).fit(q=.75,
                                      vcov='iid',
                                      kernel='epa',
                                      bandwidth='hsheather')
        cls.res2 = epanechnikov_hsheather_q75
Example #10
def test_use_t_summary():
    X = np.array([[1, 0], [0, 1], [0, 2.1], [0, 3.1]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    res = QuantReg(y, X).fit(0.5, bandwidth='chamberlain', use_t=True)
    summ = res.summary()
    assert 'P>|t|' in str(summ)
    assert 'P>|z|' not in str(summ)
Example #11
def test_alpha_summary():
    X = np.array([[1, 0], [0, 1], [0, 2.1], [0, 3.1]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    res = QuantReg(y, X).fit(0.5, bandwidth='chamberlain', use_t=True)
    summ_20 = res.summary(alpha=.2)
    assert '[0.025      0.975]' not in str(summ_20)
    assert '[0.1        0.9]' in str(summ_20)
Example #12
def test_nontrivial_singular_matrix():
    x_one = np.random.random(1000)
    x_two = np.random.random(1000)*10
    x_three = np.random.random(1000)
    intercept = np.ones(1000)

    y = np.random.random(1000)*5
    X = np.column_stack((intercept, x_one, x_two, x_three, x_one))

    assert np.linalg.matrix_rank(X) < X.shape[1]
    # QuantReg relies on a pseudoinverse internally, so a full-length
    # parameter vector is returned despite the rank-deficient design
    res_singular = QuantReg(y, X).fit(0.5)
    assert len(res_singular.params) == X.shape[1]
    assert np.linalg.matrix_rank(res_singular.cov_params()) == X.shape[1] - 1

    # prediction is correct even with singular exog
    res_ns = QuantReg(y, X[:, :-1]).fit(0.5)
    assert_allclose(res_singular.fittedvalues, res_ns.fittedvalues, rtol=0.01)
Example #13
def train_LAD(x, y):
    """
    Train a LAD (least absolute deviations) linear regression model
    and return the model's fitted values.
    """
    X = sm.add_constant(x)
    model = QuantReg(y, X)
    res = model.fit(q=0.5)
    return res.predict(X)
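
A hedged usage sketch (synthetic inputs, not from the original source;
assumes numpy as np and statsmodels.api as sm are imported):

x_demo = np.linspace(0, 1, 100)
y_demo = 2.0 * x_demo + np.random.standard_t(3, size=100)  # heavy-tailed noise
median_fit = train_LAD(x_demo, y_demo)  # fitted values from the median regression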
Example #14
def get_quantreg(_y, what="slope", q=0.5):
    if not np.ma.is_masked(_y):
        _x = sm.add_constant(np.arange(len(_y)))
        res = QuantReg(_y, _x).fit(q=q)  # use the requested quantile, not a hard-coded 0.5
        if what == "slope":
            return res.params[1]
        elif what == "pval":
            return res.pvalues[1]
        elif what == "intercept":
            return res.params[0]
    else:
        return np.nan
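
A hedged usage sketch (synthetic series; assumes numpy as np and
statsmodels.api as sm are imported):

series_demo = np.cumsum(np.random.randn(120))
trend_slope = get_quantreg(series_demo, what="slope", q=0.5)
trend_pval = get_quantreg(series_demo, what="pval", q=0.5)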
Example #15
def calcuSlope(i, LogData, SeqDepth, Genes, Tau):
    if i % round(len(Genes) / 10) == 0:
        print(i / round(len(Genes) / 10) * 10, '%')
    X = Genes[i]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        validIdx = np.logical_not(np.isnan(
            LogData.loc[X].values)) & (SeqDepth.values > 0)
        mod = QuantReg(LogData.loc[X].values[validIdx],
                       tools.add_constant(np.log(SeqDepth.values[validIdx])))
        # mod = smf.quantreg('response ~ variable',
        #                   pd.DataFrame({'response': LogData.loc[X], 'variable': np.log(SeqDepth)}))
        slope = mod.fit(q=Tau).params[1]
    return slope
Example #16
def plot_ar1_coef(series, granularity='30Min'):
    """
    plots ar1 coeff 
    input: pandas series of prices
    output: plots graph
    """
    # Group by granularity (pd.TimeGrouper was removed; use pd.Grouper)
    series = series.groupby(
        pd.Grouper(freq=granularity)).last().ffill()

    # returns and volatility (vol_window is assumed to be defined at module level)
    ret = series.pct_change()
    vol = ret.rolling(vol_window).std()
    dff = pd.concat([ret, vol], axis=1, join='inner')
    dff.columns = ['ret', 'vol']

    # add constant
    dff = sm.add_constant(dff)

    # y-variable
    dff = dff.assign(y=dff.ret.shift(-1))

    # drop infinities and NaNs (replace() returns a copy, so assign it back)
    dff = dff.replace([np.inf, -np.inf], np.nan)
    dff.dropna(inplace=True)

    from statsmodels.regression.quantile_regression import QuantReg
    mod = QuantReg(endog=dff.y, exog=dff.loc[:, ['const', 'ret']])

    #
    quantiles = np.arange(.01, .99, .01)

    def fit_model(q):
        res = mod.fit(q=q)
        return [q, res.params['ret']] + \
                res.conf_int().loc['ret'].tolist()

    models = [fit_model(x) for x in quantiles]
    models = pd.DataFrame(models, columns=['q', 'b', 'lb', 'ub'])

    # plot the quantile regression params
    import matplotlib.pyplot as plt
    plt.title('AR1 Coefficient with {} granularity'.format(granularity))
    plt.plot(models.q, models.b, color='b', label='1st Order AutoRegression')
    plt.plot(models.q, models.ub, linestyle='dotted', color='b')
    plt.plot(models.q, models.lb, linestyle='dotted', color='b')
    #plt.plot(models.q, models.high_vol, color='red', label='High Volatility')
    plt.axhline(y=0, color='black', linestyle='--')
    plt.legend()
    plt.show()
"""Huber"""

reg2 = HuberRegressor(epsilon = 1)

model2 = reg2.fit(x, y)
y_pred2 = model2.predict(x_test)

"""L1"""

dfx = pd.DataFrame(x, columns = ['x'])
dfy = pd.DataFrame(y, columns = ['y'])
exog = sm.add_constant(dfx['x'])
endog = dfy['y']
dft = pd.DataFrame(x_test, columns = ['test'])

qrmodel = QuantReg(endog, exog)
result = qrmodel.fit(q=0.5)

ypred_qr = np.dot(dft, result.params[1]) + result.params[0]  # manual predict: slope * x + intercept

"""Student-t"""

tmodel = TLinearModel(endog, exog)
results = tmodel.fit(df=0.6)

ypred_t = np.dot(dft, results.params[1]) + results.params[0]  # manual predict: slope * x + intercept

"""Plot"""

plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
Example #18
def forecaster(returns, ff, loss='MSE'):

    output = []
    factorLoadings = []
    varianceOfErrors = []
    df = ff.merge(returns, left_index=True, right_index=True)
    name = returns.columns.tolist()[0]
    df[name] = df[name] - df['RF']
    regressors = ['Mkt.Rf', 'HML', 'Mom', 'RMW', 'CMA']

    for j in range(120, len(df.index.tolist())):
        trainData = df.iloc[(j - 120):j, :]
        trainX = trainData[regressors]
        trainY = trainData[[name]]
        if loss == 'MSE':
            model = LinearRegression()
        elif loss == 'Ridge':
            model = Ridge()
        elif loss == 'Lasso':
            model = Lasso()
        elif loss == 'Hub':
            model = HuberRegressor()
        else:
            model = LinearRegression()

        if trainY.isnull().values.any():
            output.append(np.nan)
            factorLoadings.append(np.zeros((1, 5)))
            varianceOfErrors.append(np.nan)
            continue

        model.fit(trainX, trainY)

        res = ''

        if loss == 'LAD':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.5)

        if loss == '1Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.25)

        if loss == '3Q':
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=0.75)

        if loss in ['LAD', '1Q', '3Q']:
            factorLoadings.append(np.array(res.params))
        else:
            factorLoadings.append(model.coef_)

        if loss not in ['Lasso', 'Hub', 'LAD', '1Q', '3Q']:
            varianceOfErrors.append(
                np.var(trainY - model.predict(trainX)).tolist()[0])
        if loss in ['Lasso', 'Hub']:
            varianceOfErrors.append(
                np.var(np.array(trainY) - model.predict(trainX)))
        if loss in ['LAD', '1Q', '3Q']:
            varianceOfErrors.append(
                np.var(
                    model.predict(res.params, exog=trainX) - np.array(trainY)))

        testData = pd.DataFrame(df.iloc[j, :]).T
        testX = testData[regressors]

        if loss in ['LAD', '1Q', '3Q']:
            prediction = model.predict(res.params, exog=testX)
        else:
            prediction = model.predict(testX)

        if loss in ['Lasso', 'Hub', 'LAD', '1Q', '3Q']:
            output.append(prediction[0])
        else:
            output.append(prediction[0][0])

    return (name, output, factorLoadings, varianceOfErrors)
Example #19
    import time
    xmin = [-1., -1.]
    xmax = [2., 3.]
    mu, invSig = ConstructRBF(xmin, xmax, [3, 3])

    t0 = time.time()
    data_x, data_f = GenerateSample(xmin,
                                    xmax,
                                    N_sample=300,
                                    Func=Func,
                                    NoiseFunc=NoiseFunc)
    print('GenerateSample/Computation time:', time.time() - t0)

    t0 = time.time()
    Theta = np.array([FeaturesNG(x, mu, invSig) for x in data_x])
    quant_reg = QuantReg(data_f, Theta)
    fit1 = quant_reg.fit(q=0.1)
    fit5 = quant_reg.fit(q=0.5)
    fit9 = quant_reg.fit(q=0.95)
    w1 = fit1.params
    w5 = fit5.params
    w9 = fit9.params
    print(fit9.summary())
    print('Parameters w1:', w1)
    print('Parameters w5:', w5)
    print('Parameters w9:', w9)
    print('QuantReg/Computation time:', time.time() - t0)

    with open('/tmp/data.dat', 'w') as fp:
        for x, f in zip(data_x, data_f):
            fp.write('%f %f %f\n' % (x[0], x[1], f))
Example #20
    #rankg2 = df['V3.%s' % first_min_second[1]].argsort().values
    #response = np.array((rankg1 - rankg2), dtype='d')# ** 3
    #response = MinMaxScaler().fit_transform(response[:, None])[:, 0]
    response = pd.DataFrame(response,
                            columns=['Rank%s-Rank%s' % first_min_second])

    explanatory = df[features].copy()
    #explanatory = pd.DataFrame(MinMaxScaler().fit_transform(explanatory.copy().values),
    #                      columns=explanatory.columns)
    #explanatory['intercept'] = np.ones(len(explanatory), dtype='d')
    explanatory['is_catole'] = np.array(df['bairro'] == 'catole', dtype='d')
    explanatory['is_centro'] = np.array(df['bairro'] == 'centro', dtype='d')
    explanatory['is_liberdade'] = np.array(df['bairro'] == 'liberdade',
                                           dtype='d')

    model = QuantReg(response, explanatory)
    max_left = 0.0
    max_left_q = 0
    max_right = 0.0
    max_right_q = 0

    rsqs = []
    qs = []
    util = []

    values = {}
    for name in explanatory.columns:
        values[name] = np.zeros(10)
        i = 0
        for q in np.linspace(0.05, 0.95, 5):
            values[name][i] = 0
Example #21
def train_predict_stacking_linear_regression(df_learning, df_prod,
                                             l_tuple_strategy_normalised):
    for quantile in constants.LIST_QUANTILE:
        to_keep = []
        for strategy, normalize_by in l_tuple_strategy_normalised:
            str_normalized = '_normed_by_' + normalize_by if normalize_by is not None else ''
            to_keep.append('{}{}_quantile_{:.3f}'.format(
                strategy, str_normalized, quantile))

        # Keep only columns with no missing values
        to_keep = df_learning[to_keep].notnull().all()
        to_keep = to_keep[to_keep].index.tolist()

        # We need to remove constant columns from the sampled data
        df_learning_weighted = df_learning.sample(10000,
                                                  weights='weight',
                                                  replace=True,
                                                  random_state=1)

        # Remove constant columns
        cols_constants = df_learning_weighted[to_keep].std() == 0
        cols_constants = cols_constants[cols_constants].index.tolist()
        for col in cols_constants:
            to_keep.remove(col)

        # # Remove correlated features
        # # Create correlation matrix
        # corr_matrix = df_learning[to_keep].corr().abs().fillna(1)

        # # Select upper triangle of correlation matrix
        # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        # # Find index of feature columns with correlation greater than 0.95
        # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
        # to_keep.remove(to_drop)

        # Drop duplicates columns
        def getDuplicateColumns(df):
            '''
            Get a list of duplicate columns.
            It will iterate over all the columns in dataframe and find the columns whose contents are duplicate.
            :param df: Dataframe object
            :return: List of columns whose contents are duplicates.
            '''
            duplicateColumnNames = set()
            # Iterate over all the columns in dataframe
            for x in range(df.shape[1]):
                # Select column at xth index.
                col = df.iloc[:, x]
                # Iterate over all the columns in DataFrame from (x+1)th index till end
                for y in range(x + 1, df.shape[1]):
                    # Select column at yth index.
                    otherCol = df.iloc[:, y]
                    # Check if the two columns at index x and y are equal
                    if col.equals(otherCol):
                        duplicateColumnNames.add(df.columns.values[y])

            return list(duplicateColumnNames)

        cols_duplicate = getDuplicateColumns(df_learning_weighted[to_keep])
        for cols in cols_duplicate:
            to_keep.remove(cols)

        # to_keep = df_learning_weighted[to_keep].T.drop_duplicates().T.columns  # Not efficient but ok

        X_learning_weighted = df_learning_weighted[to_keep].fillna(0)
        X_learning = df_learning[to_keep].fillna(0)
        X_prod = df_prod[to_keep].fillna(0)

        y_learning_weighted = df_learning_weighted['sales']
        # weight_learning = df_learning['weight']
        if X_learning_weighted.nunique().max() != 1:
            linear_model = QuantReg(y_learning_weighted, X_learning_weighted)
            linear_model = linear_model.fit(q=quantile)
            # print(linear_model.summary())
            df_learning['quantile_{:.3f}'.format(
                quantile)] = linear_model.predict(X_learning)
            df_prod['quantile_{:.3f}'.format(quantile)] = linear_model.predict(
                X_prod)
        else:
            df_learning['quantile_{:.3f}'.format(quantile)] = 0
            df_prod['quantile_{:.3f}'.format(quantile)] = 0

    return df_learning, df_prod
Example #22
#for f1, f2 in itertools.combinations(orig.columns.copy(), 2):
#    prod = orig[f1].values * orig[f2].values
#    orig[f1 + '_times_' + f2] = prod

orig['is_catole'] = np.array(df['bairro'] == 'catole', dtype='d')
orig['is_centro'] = np.array(df['bairro'] == 'centro', dtype='d')
orig['is_liberdade'] = np.array(df['bairro'] == 'liberdade', dtype='d')

scaled = pd.DataFrame(StandardScaler().fit_transform(orig.copy().values),
                      columns=orig.columns)
print(orig.shape)
assert orig.shape == scaled.shape

# In[5]:

model = QuantReg(response, orig)

# In[6]:

for q in np.linspace(0.05, 0.95, 10):
    print(q)
    print(model.fit(q=q).summary())
    print()
    print()

Example #23
def QuantileRegression(X, Y, quantile):
    mod = QuantReg(Y, X)
    res = mod.fit(q=quantile)
    return res.params
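
A hedged usage sketch (synthetic data; assumes numpy as np and
statsmodels.api as sm are imported):

X_demo = sm.add_constant(np.random.rand(200))
Y_demo = X_demo @ np.array([1.0, 3.0]) + np.random.randn(200)
beta_q90 = QuantileRegression(X_demo, Y_demo, quantile=0.9)  # [intercept, slope] at the 0.9 quantile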
Example #24
    def run(self):
        """
        Build the POD models.

        Notes
        -----
        This method builds the quantile regression model. First the censored
        data are filtered if needed. The Box Cox transformation is performed
        if it is enabled. Then it builds the POD model for the given data and
        computes, using bootstrap, all the defect quantiles needed to build
        the POD model at the confidence level.
        """

        # Run the preliminary run of the POD class
        result = self._run(self._inputSample, self._outputSample, self._detection,
                           self._noiseThres, self._saturationThres, self._boxCox,
                           self._censored)

        # get some results
        self._defects = result['inputSample']
        self._signals = result['signals']
        self._detectionBoxCox = result['detectionBoxCox']

        defectsSize = self._defects.getSize()

        # create the quantile regression object
        X = ot.NumericalSample(defectsSize, [1, 0])
        X[:, 1] = self._defects
        self._algoQuantReg = QuantReg(np.array(self._signals), np.array(X))

        # Compute the defect quantile
        defectMax = self._defects.getMax()[0]
        defectList = []
        for probLevel in self._quantile:
            # fit the quantile regression and return the NMF
            model = self._buildModel(1. - probLevel)
            # Solve the model == detectionBoxCox with defects 
            # boundaries = [0, defectMax]
            defectList.append(ot.Brent().solve(model, self._detectionBoxCox,
                                               0, defectMax))
        # create support of the interpolating function including
        # point (0, 0) and point (defectMax, max(quantile))
        xvalue = np.hstack([0, defectList, defectMax])
        yvalue = np.hstack([0., self._quantile, self._quantile.max()])
        interpModel = interp1d(xvalue, yvalue, kind='linear')
        self._PODmodel = ot.PythonFunction(1, 1, interpModel)


        ############ Confidence interval with bootstrap ########################
        # Compute a NsimulationSize defect sizes for all quantiles
        data = ot.NumericalSample(self._size, 2)
        data[:, 0] = self._inputSample
        data[:, 1] = self._outputSample
        # bootstrap of the data
        bootstrapExp = ot.BootstrapExperiment(data)
        # create a numerical sample which contains for all simulations the 
        # defect quantile value. The goal is to compute the QuantilePerComponent
        # of the simulation for each defect quantile (columns)
        self._defectsPerQuantile = ot.NumericalSample(self._simulationSize, self._quantile.size)
        for i in range(self._simulationSize):
            # generate a sample with replacement within data of the same size
            bootstrapData = bootstrapExp.generate()
            # run the preliminary analysis: censoring check and Box Cox
            result = self._run(bootstrapData[:,0], bootstrapData[:,1], self._detection,
                               self._noiseThres, self._saturationThres,
                               self._boxCox, self._censored)

            # get some results
            defects = result['inputSample']
            signals = result['signals']
            detectionBoxCox = result['detectionBoxCox']
            defectsSize = defects.getSize()

            # new quantile regression algorithm
            X = ot.NumericalSample(defectsSize, [1, 0])
            X[:, 1] = defects
            algoQuantReg = QuantReg(np.array(signals), np.array(X))

            # compute the quantile defects
            defectMax = defects.getMax()[0]
            defectList = []
            for probLevel in self._quantile:
                fit = algoQuantReg.fit(1. - probLevel, max_iter=300, p_tol=1e-2)
                def model(x):
                    X = ot.NumericalPoint([1, x[0]])
                    return ot.NumericalPoint(fit.predict(X))
                model = ot.PythonFunction(1, 1, model)
                # Solve the model == detectionBoxCox with defects 
                # boundaries = [-infinity, defectMax] : it allows negative defects
                # when for small prob level, there is no intersection with
                # the detection threshold for positive defects
                defectList.append(ot.Brent().solve(model, detectionBoxCox,
                                                   -ot.SpecFunc.MaxNumericalScalar,
                                                   defectMax))
            # add the quantile in the numerical sample as the ith simulation
            self._defectsPerQuantile[i, :] = defectList
            if self._verbose:
                updateProgress(i, self._simulationSize, 'Computing defect quantile')
Example #25
    def fit(self, X, y):
        self.model_ = QuantReg(y, smapi.add_constant(X))
        self.model_result_ = self.model_.fit(q=self.q)
        return self
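
A predict counterpart this wrapper would plausibly pair with (an assumption;
the original snippet shows only fit):

    def predict(self, X):
        # mirror the add_constant transformation applied in fit
        return self.model_result_.predict(smapi.add_constant(X))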
Example #26
    def fit(self, X, y):

        X = self.preprocess(X)

        self.regressor = QuantReg(y, X)
        self.regressor_fit = self.regressor.fit(q=self.quantile)
Example #27
def setup_fun(kernel='gau', bandwidth='bofinger'):
    data = sm.datasets.engel.load_pandas().data
    y, X = dmatrices('foodexp ~ income', data, return_type='dataframe')
    statsm = QuantReg(y, X).fit(vcov='iid', kernel=kernel, bandwidth=bandwidth)
    stata = d[(kernel, bandwidth)]
    return statsm, stata
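
A hedged sketch of exercising the kernel/bandwidth grid this helper
parameterizes (assuming the module-level dict `d` of reference results
covers these keys):

for kern in ('epa', 'gau'):
    for bw in ('bofinger', 'chamberlain', 'hsheather'):
        statsm, stata = setup_fun(kernel=kern, bandwidth=bw)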
Example #28
    def fit(self,X,*args,**kwargs):
        
        """
        Fit a projection pursuit dimension reduction model. 
        
        Required input argument: X data as matrix or data frame 
        
        Optinal input arguments: 
            
            arg or kwarg:
            y data as vector or 1D matrix
            
            kwargs: 
            h, int: option to overrule class's n_components parameter in fit. 
                Convenient command line, yet should not be used in automated 
                loops, e.g. cross-validation.
                
            dmetric, str: distance metric used internally. Defaults to 'euclidean'
            
            mixing, bool: to estimate mixing matrix (only relevant for ICA)
            
            Further parameters to the regression methods can be passed on 
            here as well as kwargs, e.g. quantile=0.8 for quantile regression. 
            
            kwargs only relevant if y specified: 
        
        """

        # Collect optional fit arguments
        biascorr = kwargs.pop('biascorr',False)
            
        if 'h' not in kwargs:
            h = self.n_components
        else:
            h = kwargs.pop('h')
            self.n_components = h
            
        if 'dmetric' not in kwargs:
            dmetric = 'euclidean'
        else:
            dmetric = kwargs.get('dmetric')
            
        if 'mixing' not in kwargs:
            mixing = False
        else:
            mixing = kwargs.get('mixing')
            
        if 'y' not in kwargs:
            na = len(args)
            if na > 0: #Use of *args makes it sklearn consistent
                flag = 'two-block'
                y = args[0]
            else:
                flag = 'one-block'
                y = 0 # to allow calls with 'y=y' in spite of no real y argument present
        else:
            flag = 'two-block'
            y = kwargs.get('y')
                            
            if 'quantile' not in kwargs:
                quantile = .5
            else:
                quantile = kwargs.get('quantile')
                
            if self.regopt == 'robust':
            
                if 'fun' not in kwargs:
                    fun = 'Hampel'
                else:
                    fun = kwargs.get('fun')
                
                if 'probp1' not in kwargs:
                    probp1 = 0.95
                else:
                    probp1 = kwargs.get('probp1')
                
                if 'probp2' not in kwargs:
                    probp2 = 0.975
                else:
                    probp2 = kwargs.get('probp2')
                
                if 'probp3' not in kwargs:
                    probp3 = 0.99
                else:
                    probp3 = kwargs.get('probp3')

            
        if self.projection_index == dicomo:
            
            if self.pi_arguments['mode'] in ('M3','cos','c*k'):
            
                if 'option' not in kwargs:
                    option = 1
                else:
                    option = kwargs.get('option')
                
                if option > 3:
                    print('Option value >3 will compute results, but meaning may be questionable')
                
        # Initiate projection index    
        self.most = self.projection_index(**self.pi_arguments)         
        
        # Initiate some parameters and data frames
        if self.copy:
            X0 = copy.deepcopy(X)
            self.X0 = X0
        else:
            X0 = X        
        X = convert_X_input(X0)    
        n,p = X0.shape 
        trimming = self.trimming
        
        # Check dimensions 
        if h > min(n,p):
            raise(MyException('number of components cannot exceed number of samples'))
            
        if (self.projection_index == dicomo and self.pi_arguments['mode'] == 'kurt' and self.whiten_data==False):
            warnings.warn('Whitening step is recommended for ICA')
            
        # Pre-processing adjustment if whitening
        if self.whiten_data:
            self.center_data = True
            self.scale_data = False
            self.compression = False
            print('All results produced are for whitened data')
        
        # Centring and scaling
        if self.scale_data:
            if self.center=='mean':
                scale = 'std'
            elif ((self.center=='median')|(self.center=='l1median')):
                scale = 'mad' 
        else:
            scale = 'None'
            warnings.warn('Without scaling, convergence to optima is not given')
            
         # Data Compression for flat tables if required                
        if ((p>n) and self.compression):
            V,S,U = np.linalg.svd(X.T,full_matrices=False)
            X = np.matmul(U.T,np.diag(S))
            n,p = X.shape
            
            if (srs.mad(X)==0).any(): 
                warnings.warn('Due to low scales in data, compression would induce zero scales.' 
                              + '\n' + 'Proceeding without compression.')
                dimensions = False
                if copy:
                    X = copy.deepcopy(X0)
                else:
                    X = X0
            else:
                dimensions = True
        else:
            dimensions = False
        
        # Initiate centring object and scale X data 
        centring = VersatileScaler(center=self.center,scale=scale,trimming=trimming)      
  
        if self.center_data:
            Xs = centring.fit_transform(X)
            mX = centring.col_loc_
            sX = centring.col_sca_
        else:
            Xs = X
            mX = np.zeros((1,p))
            sX = np.ones((1,p))

        fit_arguments = {}
            
        # Data whitening (best practice for ICA)
        if self.whiten_data:
            V,S,U = np.linalg.svd(Xs.T,full_matrices=False)
            del U
            K = (V/S)[:,:p]
            del V,S
            Xs = np.matmul(Xs, K)
            Xs *= np.sqrt(p)
        
        # Presently, X and y need to be matrices 
        # Will be changed to use regular np.ndarray
        Xs = np.matrix(Xs)

        # Pre-process y data when available 
        if flag != 'one-block':
            
            ny = y.shape[0]
            y = convert_y_input(y)
            if len(y.shape) < 2:
                y = np.matrix(y).reshape((ny,1))
            # py = y.shape[1]
            if ny != n:
                raise(MyException('X and y number of rows must agree'))
            if self.copy:
                y0 = copy.deepcopy(y)
                self.y0 = y0
                
            if self.center_data:
                ys = centring.fit_transform(y)
                my = centring.col_loc_
                sy = centring.col_sca_ 
            else:
                ys = y
                my = 0
                sy = 1
            ys = np.matrix(ys).astype('float64')
        
        else:
            ys = None
                

        # Initializing output matrices
        W = np.zeros((p,h))
        T = np.zeros((n,h))
        P = np.zeros((p,h))
        B = np.zeros((p,h))
        R = np.zeros((p,h))
        B_scaled = np.zeros((p,h))
        C = np.zeros((h,1))
        Xev = np.zeros((h,1))
        assovec = np.zeros((h,1))
        Maxobjf = np.zeros((h,1))

        # Initialize deflation matrices 
        E = copy.deepcopy(Xs)
        f = ys

        bi = np.zeros((p,1))
        
        opt_args = { 
                    'alpha': self.alpha,
                    'trimming': self.trimming,
                    'biascorr': biascorr, 
                    'dmetric' : 'euclidean',
                    }
        
        if self.optimizer=='grid':
            # Define grid optimization ranges
            if 'ndir' not in self.optimizer_options:
                self.optimizer_options['ndir'] = 1000
            optrange = np.sign(self.optrange)
            optmax = self.optrange[1]
            stop0s = np.arcsin(optrange[0])
            stop1s = np.arcsin(optrange[1])
            stop1c = np.arccos(optrange[0])
            stop0c = np.arccos(optrange[1])
            anglestart = max(stop0c,stop0s)
            anglestop = max(stop1c,stop1s)
            nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=False)            
            alphamat = np.matrix([np.cos(nangle), np.sin(nangle)])
            opt_args['_stop0c'] = stop0c
            opt_args['_stop0s'] = stop0s
            opt_args['_stop1c'] = stop1c
            opt_args['_stop1s'] = stop1s
            opt_args['optmax'] = optmax
            opt_args['optrange'] = self.optrange
            opt_args['square_pi'] = self.square_pi
            if optmax != 1:
                alphamat *= optmax
        
            if p>2:
                anglestart = min(opt_args['_stop0c'],opt_args['_stop0s'])
                anglestop = min(opt_args['_stop1c'],opt_args['_stop1s'])
                nangle = np.linspace(anglestart,anglestop,self.optimizer_options['ndir'],endpoint=True)
                alphamat2 = np.matrix([np.cos(nangle), np.sin(nangle)])
                if optmax != 1:
                    alphamat2 *= opt_args['optmax']
                
            # Arguments for grid plane
            opt_args['alphamat'] = alphamat
            opt_args['ndir'] = self.optimizer_options['ndir']
            opt_args['maxiter'] = self.optimizer_options['maxiter']
            
            # Arguments for grid plane #2
            grid_args_2 = { 
                     'alpha': self.alpha,
                     'alphamat': alphamat2,
                     'ndir': self.optimizer_options['ndir'],
                     'trimming': self.trimming,
                     'biascorr': biascorr, 
                     'dmetric' : 'euclidean',
                     '_stop0c' : stop0c,
                     '_stop0s' : stop0s,
                     '_stop1c' : stop1c,
                     '_stop1s' : stop1s,
                     'optmax' : optmax,
                     'optrange' : self.optrange,
                     'square_pi' : self.square_pi
                     }
            if flag=='two-block':
                grid_args_2['y'] = f
        
        if flag=='two-block':
            opt_args['y'] = f
            

        # Iterative coefficient estimation
        for i in range(0,h):

            if self.optimizer=='grid':
                if p==2:
                    wi,maximo = gridplane(E,self.most,
                                          pi_arguments=opt_args
                                          )
           
                elif p>2:
                
                    afin = np.zeros((p,1)) # final parameters for linear combinations
                    Z = copy.deepcopy(E)
                    # sort variables according to criterion
                    meas = [self.most.fit(E[:,k],
                            **opt_args) 
                            for k in np.arange(0,p)]
                    if self.square_pi:
                        meas = np.square(meas)
                    wi,maximo = gridplane(Z[:,0:2],self.most,opt_args)
                    Zopt = Z[:,0:2]*wi 
                    afin[0:2]=wi
                    for j in np.arange(2,p):
                        projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1),
                                         np.array(Z[:,j]).reshape(-1)]).T
                        wi,maximo = gridplane(projmat,self.most,
                                              opt_args
                                              )
                        Zopt = Zopt*float(wi[0]) + Z[:,j]*float(wi[1])
                        afin[0:(j+1)] = afin[0:(j+1)]*float(wi[0])
                        afin[j] = float(wi[1])

                    tj = Z*afin
                    objf = self.most.fit(tj,
                                     **{**fit_arguments,**opt_args}
                                    )
                    if self.square_pi:
                        objf *= objf
    

                    # outer loop to run until convergence
                    objfold = copy.deepcopy(objf)
                    objf = -1000
                    afinbest = afin
                    ii = 0
                    maxiter_2j = 2**round(np.log2(self.optimizer_options['maxiter'])) 
                
                    while ((ii < self.optimizer_options['maxiter'] + 1) and (abs(objfold - objf)/abs(objf) > 1e-4)):
                        for j in np.arange(0,p):
                            projmat = np.matrix([np.array(Zopt[:,0]).reshape(-1),
                                         np.array(Z[:,j]).reshape(-1)]).T
                            if j > 16:
                                divv = maxiter_2j
                            else:
                                divv = min(2**j,maxiter_2j)
                        
                            wi,maximo = gridplane_2(projmat,
                                                    self.most,
                                                    q=afin[j],
                                                    div=divv,
                                                    pi_arguments=grid_args_2
                                                    )
                            Zopt = Zopt*float(wi[0,0]) + Z[:,j]*float(wi[1,0])
                            afin *= float(wi[0,0])
                            afin[j] += float(wi[1,0])
                        
                        # % evaluate the objective function:
                        tj = Z*afin
                    
                        objfold = copy.deepcopy(objf)
                        objf = self.most.fit(tj,
                                         q=afin,
                                         **opt_args
                                         )
                        if self.square_pi:
                            objf *= objf
                    
                        if  objf!=objfold:
                            if self.constraint == 'norm':
                                afinbest = afin/np.sqrt(np.sum(np.square(afin)))
                            else:
                                afinbest = afin
                            
                        ii +=1
                        if self.verbose:
                            print(str(ii))
                    #endwhile
                
                    afinbest = afin
                    wi = np.zeros((p,1))
                    wi = afinbest
                    Maxobjf[i] = objf
                # endif;%if p>2;
            else: # do not optimize by the grid algorithm
                if self.trimming > 0: 
                    warnings.warn('Optimization that involves a trimmed objective is not a quadratic program. The scipy-optimize result will be off!!')
                if 'center' in self.pi_arguments:
                    if (self.pi_arguments['center']=='median'): 
                        warnings.warn('Optimization that involves a median in the objective is not a quadratic program. The scipy-optimize result will be off!!')   
                constraint = {'type':'eq',
                              'fun': lambda x: np.linalg.norm(x) -1,
                              }
                if len(self.optimizer_constraints)>0: 
                    constraint = [constraint,self.optimizer_constraints]
                wi = minimize(pp_objective,
                              E[0,:].transpose(),
                              args=(self.most,E,opt_args),
                              method=self.optimizer,
                              constraints=constraint,
                              options=self.optimizer_options).x
                wi = np.matrix(wi).reshape((p,1))
                wi /= np.sqrt(np.sum(np.square(wi)))
                
                
            # Computing projection weights and scores
            ti = E*wi
            if self.optimizer != 'grid':
                Maxobjf[i] = self.most.fit(E*wi,**opt_args)
            nti = np.linalg.norm(ti)
            pi = E.T*ti / (nti**2)
            if self.whiten_data:
                wi /= np.sqrt((wi**2).sum())
                wi = K*wi
            wi0 = wi
            wi = np.array(wi)
            if len(W[:,i].shape) == 1:
                wi = wi.reshape(-1)
            W[:,i] = wi
            T[:,i] = np.array(ti).reshape(-1)
            P[:,i] = np.array(pi).reshape(-1)
            
            if flag != 'one-block':
                criteval = self.most.fit(E*wi0,
                                         **opt_args
                                         )
                if self.square_pi:
                    criteval *= criteval
                    
                assovec[i] = criteval
                

            # Deflation of the datamatrix guaranteeing orthogonality restrictions
            E -= ti*pi.T
 
            # Calculate R-Weights
            R = np.dot(W[:,0:(i+1)],pinv2(np.dot(P[:,0:(i+1)].T,W[:,0:(i+1)]),check_finite=False))
        
            # Execute regression y~T if y is present. Generate regression estimates.
            if flag != 'one-block':
                if self.regopt=='OLS':
                    ci = np.dot(ti.T,ys)/(nti**2)
                elif self.regopt == 'robust':
                    linfit = rm(fun=fun,probp1=probp1,probp2=probp2,probp3=probp3,
                                centre=self.center,scale=scale,
                                start_cutoff_mode='specific',verbose=self.verbose)
                    linfit.fit(ti,ys)
                    ci = linfit.coef_
                elif self.regopt == 'quantile':
                    linfit = QuantReg(y,ti)
                    model = linfit.fit(q=quantile)
                    ci = model.params
                # end regression if
                
                C[i] = ci
                bi = np.dot(R,C[0:(i+1)])
                bi_scaled = bi
                bi = np.multiply(np.reshape(sy/sX,(p,1)),bi)
                B[:,i] = bi[:,0]
                B_scaled[:,i] = bi_scaled[:,0]

        # endfor; Loop for latent dimensions

        # Re-adjust estimates to original dimensions if data have been compressed 
        if dimensions:
            B = np.matmul(V[:,0:p],B)
            B_scaled = np.matmul(V[:,0:p],B_scaled)
            R = np.matmul(V[:,0:p],R)
            W = np.matmul(V[:,0:p],W)
            P = np.matmul(V[:,0:p],P)
            bi = B[:,h-1]
            if self.center_data:
                Xs = centring.fit_transform(X0)
                mX = centring.col_loc_
                sX = centring.col_sca_
            else:
                Xs = X0
                mX = np.zeros((1,p))
                sX = np.ones((1,p))
        
        bi = bi.astype("float64")
        if flag != 'one-block':            
            # Calculate scaled and unscaled intercepts
            if dimensions:
                X = convert_X_input(X0)
            if(self.center == "mean"):
                intercept = sps.trim_mean(y - np.matmul(X,bi),trimming)
            else:
                intercept = np.median(np.reshape(y - np.matmul(X,bi),(-1)))
            yfit = np.matmul(X,bi) + intercept
            if not(scale == 'None'):
                if (self.center == "mean"):
                    b0 = np.mean(ys - np.matmul(Xs.astype("float64"),bi))
                else:
                    b0 = np.median(np.array(ys.astype("float64") - np.matmul(Xs.astype("float64"),bi)))
            else:
                b0 = intercept
            
            # Calculate fit values and residuals 
            yfit = yfit    
            r = y - yfit
            setattr(self,"coef_",B)
            setattr(self,"intercept_",intercept)
            setattr(self,"coef_scaled_",B_scaled)
            setattr(self,"intercept_scaled_",b0)
            setattr(self,"residuals_",r)
            setattr(self,"fitted_",yfit)
            setattr(self,"y_loadings_",C)
            setattr(self,"y_loc_",my)
            setattr(self,"y_sca_",sy)
                
        setattr(self,"x_weights_",W)
        setattr(self,"x_loadings_",P)
        setattr(self,"x_rotations_",R)
        setattr(self,"x_scores_",T)
        setattr(self,"x_ev_",Xev)
        setattr(self,"crit_values_",assovec)
        setattr(self,"Maxobjf_",Maxobjf)
        
        if self.whiten_data:
            setattr(self,"whitening_",K)

        
        if mixing:
            setattr(self,"mixing_",np.linalg.pinv(W))
        
        
        setattr(self,"x_loc_",mX)
        setattr(self,"x_sca_",sX)

        setattr(self,'scaling',scale)
        if self.return_scaling_object:
            setattr(self,'scaling_object_',centring)
        
        return(self)   
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm

from statsmodels.regression.quantile_regression import QuantReg

sige = 0.1
nobs, k_vars = 500, 3
x = np.random.uniform(-1, 1, size=nobs)
x.sort()
exog = np.vander(x, k_vars + 1)[:, ::-1]
mix = 0.1 * stats.norm.pdf(
    x[:, None], loc=np.linspace(-0.5, 0.75, 4), scale=0.01).sum(1)
y = exog.sum(1) + mix + sige * (np.random.randn(nobs) / 2 + 1)**3

p = 0.5
res_qr = QuantReg(y, exog).fit(p)
res_qr2 = QuantReg(y, exog).fit(0.1)
res_qr3 = QuantReg(y, exog).fit(0.75)
res_ols = sm.OLS(y, exog).fit()

params = [res_ols.params, res_qr2.params, res_qr.params, res_qr3.params]
labels = ['ols', 'qr 0.1', 'qr 0.5', 'qr 0.75']

plt.figure()
plt.plot(x, y, '.', alpha=0.5)
for lab, beta in zip(labels, params):
    print('%-8s' % lab, np.round(beta, 4))
    fitted = np.dot(exog, beta)
    lw = 2
    plt.plot(x, fitted, lw=lw, label=lab)
plt.legend()
evals = [(dtrain, 'train'), (dvalid_xy, 'eval')]
model = xgb.train(xgb_params,
                  dtrain,
                  num_boost_round=num_boost_rounds,
                  evals=evals,
                  early_stopping_rounds=early_stopping_rounds,
                  verbose_eval=10)
valid_pred = model.predict(dvalid_x, ntree_limit=model.best_ntree_limit)
print("XGBoost validation set predictions:")
print(pd.DataFrame(valid_pred).head())
print("\nMean absolute validation error:")
print(mean_absolute_error(y_valid, valid_pred))

if OPTIMIZE_FUDGE_FACTOR:
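    # Median (q=0.5) regression of actuals on predictions, with a single
    # regressor and no intercept, yields the multiplier that minimizes the
    # sum of absolute errors, i.e. an MAE-optimal scale ("fudge") factor.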
    mod = QuantReg(y_valid, valid_pred)
    res = mod.fit(q=.5)
    print("\nLAD Fit for Fudge Factor:")
    print(res.summary())

    fudge = res.params[0]
    print("Optimized fudge factor:", fudge)
    print("\nMean absolute validation error with optimized fudge factor: ")
    print(mean_absolute_error(y_valid, fudge * valid_pred))

    fudge **= FUDGE_FACTOR_SCALEDOWN
    print("Scaled down fudge factor:", fudge)
    print("\nMean absolute validation error with scaled down fudge factor: ")
    print(mean_absolute_error(y_valid, fudge * valid_pred))
else:
    fudge = 1.0