Example #1
    def _prepare_data_from_formula(
            formula: str, data: DataFrame,
            portfolios: DataFrame) -> Tuple[DataFrame, DataFrame, str]:
        na_action = NAAction(on_NA="raise", NA_types=[])
        orig_formula = formula
        if portfolios is not None:
            factors = dmatrix(formula + " + 0",
                              data,
                              return_type="dataframe",
                              NA_action=na_action)
        else:
            formula_components = formula.split("~")
            portfolios = dmatrix(
                formula_components[0].strip() + " + 0",
                data,
                return_type="dataframe",
                NA_action=na_action,
            )
            factors = dmatrix(
                formula_components[1].strip() + " + 0",
                data,
                return_type="dataframe",
                NA_action=na_action,
            )

        return factors, portfolios, orig_formula
Example #2
    def from_formula(cls, formula, data, *, portfolios=None):
        """
        Parameters
        ----------
        formula : str
            Patsy formula modified for the syntax described in the notes
        data : DataFrame
            DataFrame containing the variables used in the formula
        portfolios : array-like, optional
            Portfolios to be used in the model

        Returns
        -------
        model : TradedFactorModel
            Model instance

        Notes
        -----
        The formula can be used in one of two ways.  The first specifies only the
        factors and uses the data provided in ``portfolios`` as the test portfolios.
        The second specifies both in a single formula, using ``+`` to separate the
        test portfolios and ``~`` to separate the test portfolios from the factors.

        Examples
        --------
        >>> from linearmodels.datasets import french
        >>> from linearmodels.asset_pricing import TradedFactorModel
        >>> data = french.load()
        >>> formula = 'S1M1 + S1M5 + S3M3 + S5M1 + S5M5 ~ MktRF + SMB + HML'
        >>> mod = TradedFactorModel.from_formula(formula, data)

        Using only factors

        >>> portfolios = data[['S1M1', 'S1M5', 'S3M1', 'S3M5', 'S5M1', 'S5M5']]
        >>> formula = 'MktRF + SMB + HML'
        >>> mod = TradedFactorModel.from_formula(formula, data, portfolios=portfolios)
        """
        na_action = NAAction(on_NA='raise', NA_types=[])
        orig_formula = formula
        if portfolios is not None:
            factors = dmatrix(formula + ' + 0',
                              data,
                              return_type='dataframe',
                              NA_action=na_action)
        else:
            formula = formula.split('~')
            portfolios = dmatrix(formula[0].strip() + ' + 0',
                                 data,
                                 return_type='dataframe',
                                 NA_action=na_action)
            factors = dmatrix(formula[1].strip() + ' + 0',
                              data,
                              return_type='dataframe',
                              NA_action=na_action)
        mod = cls(portfolios, factors)
        mod.formula = orig_formula
        return mod
Example #3
    def _prepare_data_from_formula(formula, data, portfolios):
        na_action = NAAction(on_NA='raise', NA_types=[])
        orig_formula = formula
        if portfolios is not None:
            factors = dmatrix(formula + ' + 0', data, return_type='dataframe', NA_action=na_action)
        else:
            formula = formula.split('~')
            portfolios = dmatrix(formula[0].strip() + ' + 0', data,
                                 return_type='dataframe', NA_action=na_action)
            factors = dmatrix(formula[1].strip() + ' + 0', data,
                              return_type='dataframe', NA_action=na_action)

        return factors, portfolios, orig_formula
Example #4
    def instruments(self) -> OptionalDataFrame:
        """Instruments"""
        instr = self.components['instruments']
        instr = dmatrix('0 + ' + instr, self._data, eval_env=self._eval_env,
                        return_type='dataframe', NA_action=self._na_action)

        return self._empty_check(instr)
Example #5
    def backward_difference_coding(X_in, cols=None):
        """
        Backward-difference (Diff) contrast coding of the given columns via patsy.
        """

        X = X_in.copy(deep=True)

        X.columns = ['col_' + str(x) for x in X.columns.values]

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            cols = ['col_' + str(x) for x in cols]
            pass_thru = [col for col in X.columns.values if col not in cols]

        bin_cols = []
        for col in cols:
            mod = dmatrix("C(%s, Diff)" % (col, ), X)
            for dig in range(len(mod[0])):
                X[str(col) + '_%d' % (dig, )] = mod[:, dig]
                bin_cols.append(str(col) + '_%d' % (dig, ))

        X = X.reindex(columns=bin_cols + pass_thru)
        X = X.fillna(0.0)  # fillna returns a new frame, so assign the result

        return X
Example #7
    def polynomial_coding(X_in, cols=None):
        """
        Orthogonal polynomial (Poly) contrast coding of the given columns via patsy.
        """

        X = X_in.copy(deep=True)

        X.columns = ['col_' + str(x) for x in X.columns.values]

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            cols = ['col_' + str(x) for x in cols]
            pass_thru = [col for col in X.columns.values if col not in cols]

        X.fillna(-1, inplace=True)

        bin_cols = []
        for col in cols:
            mod = dmatrix("C(%s, Poly)" % (col, ), X)
            for dig in range(len(mod[0])):
                X[str(col) + '_%d' % (dig, )] = mod[:, dig]
                bin_cols.append(str(col) + '_%d' % (dig, ))

        X = X.reindex(columns=bin_cols + pass_thru)

        return X
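The coding helpers in this listing all reduce to the same patsy call: wrap a column in C() with a contrast such as Diff, Sum, Helmert, or Poly, and dmatrix expands it into numeric contrast columns. A minimal sketch of that call on a small hypothetical DataFrame (the data and names below are illustrative, not from any of the projects above):

import pandas as pd
from patsy import dmatrix

# Hypothetical input: one categorical column with three levels.
df = pd.DataFrame({'grade': ['low', 'mid', 'high', 'mid', 'low']})

# Each 3-level factor expands to 2 contrast columns, plus the intercept that
# dmatrix adds by default.
diff = dmatrix('C(grade, Diff)', df, return_type='dataframe')   # backward-difference coding
summ = dmatrix('C(grade, Sum)', df, return_type='dataframe')    # sum (deviation) coding
print(diff.columns.tolist())
print(summ.columns.tolist())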
Example #8
def test_crs_with_specific_constraint():
    from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix
    x = (-1.5)**np.arange(20)
    # Hard coded R values for smooth: s(x, bs="cr", k=5)
    # R> knots <- smooth$xp
    knots_R = np.array([-2216.837820053100585937,
                        -50.456909179687500000,
                        -0.250000000000000000,
                        33.637939453125000000,
                        1477.891880035400390625])
    # R> centering.constraint <- t(qr.X(attr(smooth, "qrc")))
    centering_constraint_R = np.array([[0.064910676323168478574,
                                        1.4519875239407085132,
                                        -2.1947446912471946234,
                                        1.6129783104357671153,
                                        0.064868180547550072235]])
    # values for which we want a prediction
    new_x = np.array([-3000., -200., 300., 2000.])
    result1 = dmatrix("cr(new_x, knots=knots_R[1:-1], "
                      "lower_bound=knots_R[0], upper_bound=knots_R[-1], "
                      "constraints=centering_constraint_R)")

    data_chunked = [{"x": x[:10]}, {"x": x[10:]}]
    new_data = {"x": new_x}
    builder = incr_dbuilder("cr(x, df=4, constraints='center')",
                            lambda: iter(data_chunked))
    result2 = build_design_matrices([builder], new_data)[0]

    assert np.allclose(result1, result2, rtol=1e-12, atol=0.)
Example #9
def test_crs_with_specific_constraint():
    from patsy.highlevel import incr_dbuilder, build_design_matrices, dmatrix
    x = (-1.5)**np.arange(20)
    # Hard coded R values for smooth: s(x, bs="cr", k=5)
    # R> knots <- smooth$xp
    knots_R = np.array([
        -2216.837820053100585937, -50.456909179687500000,
        -0.250000000000000000, 33.637939453125000000, 1477.891880035400390625
    ])
    # R> centering.constraint <- t(qr.X(attr(smooth, "qrc")))
    centering_constraint_R = np.array([[
        0.064910676323168478574, 1.4519875239407085132, -2.1947446912471946234,
        1.6129783104357671153, 0.064868180547550072235
    ]])
    # values for which we want a prediction
    new_x = np.array([-3000., -200., 300., 2000.])
    result1 = dmatrix("cr(new_x, knots=knots_R[1:-1], "
                      "lower_bound=knots_R[0], upper_bound=knots_R[-1], "
                      "constraints=centering_constraint_R)")

    data_chunked = [{"x": x[:10]}, {"x": x[10:]}]
    new_data = {"x": new_x}
    builder = incr_dbuilder("cr(x, df=4, constraints='center')",
                            lambda: iter(data_chunked))
    result2 = build_design_matrices([builder], new_data)[0]

    assert np.allclose(result1, result2, rtol=1e-12, atol=0.)
Example #10
    def polynomial_coding(X_in, cols=None):
        """
        Orthogonal polynomial (Poly) contrast coding of the given columns via patsy.
        """

        X = X_in.copy(deep=True)

        X.columns = ['col_' + str(x) for x in X.columns.values]

        if cols is None:
            cols = X.columns.values
            pass_thru = []
        else:
            cols = ['col_' + str(x) for x in cols]
            pass_thru = [col for col in X.columns.values if col not in cols]

        X.fillna(-1, inplace=True)

        bin_cols = []
        for col in cols:
            mod = dmatrix("C(Q(\"%s\"), Poly)" % (col, ), X)
            for dig in range(len(mod[0])):
                X[str(col) + '_%d' % (dig, )] = mod[:, dig]
                bin_cols.append(str(col) + '_%d' % (dig, ))

        X = X.reindex(columns=bin_cols + pass_thru)

        return X
Example #11
def backward_difference_coding(X_in, cols=None):
    """
    Backward-difference (Diff) contrast coding of the given columns via patsy.

    :param X_in: DataFrame to encode.
    :param cols: columns to encode; all columns if None.
    :return: DataFrame with the contrast-coded columns.
    """

    X = copy.deepcopy(X_in)

    if cols is None:
        cols = X.columns.values
        pass_thru = []
    else:
        pass_thru = [col for col in X.columns.values if col not in cols]

    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Diff)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols + pass_thru)
    X = X.fillna(0.0)  # fillna returns a new frame, so assign the result
    return X
Example #12
def sum_coding(X_in, cols=None):
    """
    Sum (deviation) contrast coding of the given columns via patsy.

    :param X_in: DataFrame to encode.
    :param cols: columns to encode; all columns if None.
    :return: DataFrame with the contrast-coded columns.
    """

    X = X_in.copy(deep=True)

    if cols is None:
        cols = X.columns.values
        pass_thru = []
    else:
        pass_thru = [col for col in X.columns.values if col not in cols]

    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Sum)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols + pass_thru)

    return X
Example #15
    def get_X(self, states: List[MdpState], actions: List[Action],
              refit_scaler: bool) -> np.ndarray:
        """
        Extract features for state-action pairs.

        :param states: States.
        :param actions: Actions.
        :param refit_scaler: Whether or not to refit the feature scaler before scaling the extracted features.
        :return: State-feature numpy.ndarray.
        """

        X = self.feature_extractor.extract(states, actions, refit_scaler)

        # if no formula, then the feature extraction result must be a numpy.ndarray to be used directly.
        if self.formula is None:
            if not isinstance(X, np.ndarray):  # pragma no cover
                raise ValueError(
                    'Expected feature extractor to return a numpy.ndarray if not a pandas.DataFrame'
                )

        # formulas only work with dataframes
        elif isinstance(X, pd.DataFrame):
            X = dmatrix(self.formula, X)

        # invalid otherwise
        else:
            raise ValueError(
                f'Invalid combination of formula {self.formula} and feature extractor result {type(X)}'
            )

        return X
Example #16
    def transform(self, data):
        df_full = self.template_data
        df_new = data.copy()
        df_patsy = pd.concat([df_full, df_new])
        df_transformed = dmatrix(formula_like=self.formula, data=df_patsy, return_type='dataframe', NA_action='raise')
        df_return_data = df_transformed[-len(df_new):]

        return df_return_data
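The concat-then-slice pattern above keeps the encoded columns stable across calls: dmatrix only creates columns for the factor levels it actually sees, so stacking a template frame holding every level in front of the new rows guarantees the same columns every time. A minimal sketch with hypothetical data (`template` and `new` are illustrative names):

import pandas as pd
from patsy import dmatrix

template = pd.DataFrame({'color': ['red', 'green', 'blue']})   # every level appears once
new = pd.DataFrame({'color': ['red', 'red']})                  # only one level present

stacked = dmatrix('C(color)', pd.concat([template, new]), return_type='dataframe')
# Keep only the rows for the new data; the columns do not depend on which levels
# happen to appear in `new`.
encoded_new = stacked[-len(new):]
print(encoded_new.columns.tolist())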
Example #17
 def endog(self) -> OptionalDataFrame:
     """Endogenous variables"""
     endog = self.components['endog']
     endog = dmatrix('0 + ' + endog,
                     self._data,
                     eval_env=self._eval_env,
                     return_type='dataframe',
                     NA_action=self._na_action)
     return self._empty_check(endog)
Example #18
 def endog(self):
     """Endogenous variables"""
     endog = self.components['endog']
     endog = dmatrix('0 + ' + endog,
                     self._data,
                     eval_env=self._eval_env,
                     return_type='dataframe',
                     NA_action=self._na_action)
     return endog
Example #19
 def instruments(self):
     """Instruments"""
     instr = self.components['instruments']
     instr = dmatrix('0 + ' + instr,
                     self._data,
                     eval_env=self._eval_env,
                     return_type='dataframe',
                     NA_action=self._na_action)
     return instr
Example #20
 def dependent(self):
     """Dependent variable"""
     dep = self.components['dependent']
     dep = dmatrix('0 + ' + dep,
                   self._data,
                   eval_env=self._eval_env,
                   return_type='dataframe',
                   NA_action=self._na_action)
     return dep
Example #21
 def predict(self, input_data: pd.DataFrame,
             issue_times: pd.DatetimeIndex) -> pd.DataFrame:
     resampled_data, unique_inverse = self.unique_data(
         input_data, issue_times)
     X = dmatrix(self.exog, resampled_data)
     return PredictionDataFrameBuilder(self, issue_times).build(
         np.array([
             np.maximum(model.predict(X), 0)[unique_inverse]
             for model in self.models
         ]).T, )
Example #22
 def endog(self) -> OptionalDataFrame:
     """Endogenous variables"""
     endog = self.components["endog"]
     endog = dmatrix(
         "0 + " + endog,
         self._data,
         eval_env=self._eval_env,
         return_type="dataframe",
         NA_action=self._na_action,
     )
     return self._empty_check(endog)
Example #23
    def transform(self, data):
        '''
        First use: reduced-rank transformer. Second and later uses: full-rank transformer.
        The dataframe union that contains this transformer will automatically merge down
        to the same reduced rank.

        :param data: DataFrame to transform.
        '''

        return_data = dmatrix(formula_like=self.formula, data=data, return_type='dataframe', NA_action='raise')
        return return_data
Example #24
 def dependent(self) -> DataFrame:
     """Dependent variable"""
     dep = self.components["dependent"]
     dep = dmatrix(
         "0 + " + dep,
         self._data,
         eval_env=self._eval_env,
         return_type="dataframe",
         NA_action=self._na_action,
     )
     return dep
Example #25
 def __get_model_fit(
     self, serie: Optional[int] = None
 ) -> sm.RegressionResultsWrapper:
     if serie is None:
         calibration_data: pd.DataFrame = self.data.calibration_data
     else:
         calibration_data: pd.DataFrame = self.data.get_serie(serie, "calibration")
     return smf.wls(
         formula=self.formula,
         weights=dmatrix(self.weight, calibration_data),
         data=calibration_data,
     ).fit()
Example #26
    def instruments(self) -> OptionalDataFrame:
        """Instruments"""
        instr = self.components["instruments"]
        instr = dmatrix(
            "0 + " + instr,
            self._data,
            eval_env=self._eval_env,
            return_type="dataframe",
            NA_action=self._na_action,
        )

        return self._empty_check(instr)
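The dependent, endog, and instruments properties above all repeat the same idiom: prefixing the formula with '0 + ' suppresses the intercept, so the returned DataFrame holds only the named variables. A minimal sketch with hypothetical data:

import pandas as pd
from patsy import dmatrix

data = pd.DataFrame({'y': [1.0, 2.0, 3.0], 'x': [0.5, 1.5, 2.5]})

with_intercept = dmatrix('x', data, return_type='dataframe')     # columns: Intercept, x
no_intercept = dmatrix('0 + x', data, return_type='dataframe')   # column: x only
print(with_intercept.columns.tolist())
print(no_intercept.columns.tolist())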
Example #27
 def estimate_trend(self, time_series_x: np.ndarray,
                    time_series_y: np.ndarray):
     # Cubic spline generation (4 knots)
     # Durrleman and Simon (1989) recommends (0.05,0.50,0.95) for natural splines
     knots_array = np.quantile(time_series_x, self.quantile)
     knots = tuple(knots_array)
     reshaped_x = dmatrix(
         f"bs(time_series, knots = {knots}, degree = {self.degree}, include_intercept=False)",
         {"time_series": time_series_x},
         return_type='dataframe')
     # Fitting Generalised linear model on transformed dataset
     reg_fitting = sm.GLM(time_series_y, reshaped_x).fit()
     # Prediction on splines
     trend = reg_fitting.predict(reshaped_x)
     return trend.to_numpy()
Example #28
    def transform(self, data):
        '''
        First use: reduced-rank transformer. Second and later uses: full-rank transformer.
        The dataframe union that contains this transformer will automatically merge down
        to the same reduced rank.

        :param data: DataFrame to transform.
        '''

        return_data = dmatrix(formula_like=self.formula, data=data, return_type='dataframe', NA_action='raise')

        if self.reference_column is None:
            self.reference_column = return_data.columns[0]
        try:
            return_data.drop(self.reference_column, axis=1, inplace=True)
        except ValueError:
            pass
        return return_data
Example #29
def sum_coding(X_in):
    """
    Sum (deviation) contrast coding of every column via patsy.

    :param X_in: DataFrame to encode.
    :return: DataFrame with the contrast-coded columns.
    """

    X = copy.deepcopy(X_in)

    bin_cols = []
    for col in X.columns.values:
        mod = dmatrix("C(%s, Sum)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols)

    return X
Example #30
def helmert_coding(X_in):
    """
    Helmert contrast coding of every column via patsy.

    :param X_in: DataFrame to encode.
    :return: DataFrame with the contrast-coded columns.
    """

    X = copy.deepcopy(X_in)

    bin_cols = []
    for col in X.columns.values:
        mod = dmatrix("C(%s, Helmert)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols)

    return X
Example #31
def backward_difference_coding(X_in):
    """
    Backward-difference (Diff) contrast coding of every column via patsy.

    :param X_in: DataFrame to encode.
    :return: DataFrame with the contrast-coded columns.
    """

    X = copy.deepcopy(X_in)

    bin_cols = []
    for col in X.columns.values:
        mod = dmatrix("C(%s, Diff)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols)
    X = X.fillna(0.0)  # fillna returns a new frame, so assign the result
    return X
Example #33
def polynomial_coding(X_in, cols=None):
    """
    Orthogonal polynomial (Poly) contrast coding of the given columns via patsy.

    :param X_in: DataFrame to encode.
    :param cols: columns to encode; all columns if None.
    :return: DataFrame with the contrast-coded columns.
    """

    X = copy.deepcopy(X_in)
    if cols is None:
        cols = X.columns.values

    bin_cols = []
    for col in cols:
        mod = dmatrix("C(%s, Poly)" % (col, ), X)
        for dig in range(len(mod[0])):
            X[col + '_%d' % (dig, )] = mod[:, dig]
            bin_cols.append(col + '_%d' % (dig, ))

    X = X.reindex(columns=bin_cols)

    return X
Example #35
 def dmatrix_lambda(x_parameter):
     return dmatrix(
         'bs(x, knots=({str_knots}), degree=3, include_intercept=False)'.
         format(str_knots=str_knots), {'x': x_parameter},
         return_type='dataframe')
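The closure above simply wraps a patsy B-spline basis; calling dmatrix the same way at new x values yields one column per basis function. A minimal sketch with hypothetical knots and data:

import numpy as np
from patsy import dmatrix

x = np.linspace(0.0, 10.0, 50)
str_knots = '2.5, 5.0, 7.5'   # hypothetical interior knots

basis = dmatrix(
    'bs(x, knots=({str_knots}), degree=3, include_intercept=False)'.format(str_knots=str_knots),
    {'x': x},
    return_type='dataframe')
print(basis.shape)   # 50 rows; one column per spline term plus the intercept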
Example #36
y_train_yes, X_train_yes = dmatrices(expr, df_las, return_type='dataframe')
poisson_training_results_yes = sm.GLM(y_train_yes,
                                      X_train_yes,
                                      family=sm.families.Poisson()).fit()

y_train_no, X_train_no = dmatrices(no_expr, df_las, return_type='dataframe')
poisson_training_results_no = sm.GLM(y_train_no,
                                     X_train_no,
                                     family=sm.families.Poisson()).fit()

# Evaluate the regression

print(poisson_training_results_yes.summary())
print(poisson_training_results_no.summary())

# Then use the model to predict results for Intersections

X_test_yes = dmatrix(expr, df_intersection, return_type='dataframe')
poisson_predictions_yes = poisson_training_results_yes.predict(X_test_yes)

X_test_no = dmatrix(no_expr, df_intersection, return_type='dataframe')
poisson_predictions_no = poisson_training_results_no.predict(X_test_no)

# And read those results into the intersection dataframe
df_intersection['predicted_yes'] = poisson_predictions_yes
df_intersection['predicted_no'] = poisson_predictions_no

# Create two new columns in the intersection dataframe, showing the code for la and constituency

intersection_index = index_table.drop_duplicates(
    subset=['Intersection']).set_index('Intersection')
df_intersection = df_intersection.join(
    intersection_index.loc[:, ('CouncilArea2011Code',
Example #37
import pandas as pd
from patsy.highlevel import dmatrix
"""
https://towardsdatascience.com/the-dummys-guide-to-creating-dummy-variables-f21faddb1d40
https://www.youtube.com/watch?v=WRxHfnl-Pcs
"""

url = 'http://data.princeton.edu/wws509/datasets/salary.dat'
df = pd.read_table(url, delim_whitespace=True)
print(df.head())

# use pandas
dummy = pd.get_dummies(df['sx'])
print(dummy.head())

df = pd.concat([df, dummy], axis=1)
print(df.head())

# use patsy
dummy = dmatrix("sx", df, return_type='dataframe')
df = pd.concat([df, dummy], axis=1)
print(df.head())
Example #38
 def transform(self, data):
     return dmatrix(formula_like=str(data.name),
                    data=pd.DataFrame(data.apply(str)),
                    return_type='dataframe',
                    NA_action='raise').drop('Intercept', axis=1)