Example No. 1
from functools import reduce

import numpy
import pandas
from statsmodels.api import Logit


def get_trained_logit_model():
    """
    In 'data/training_data/' specific ETFs were visually inspected and 
    white noise (0) and not white noise (1) were assigned. This data is 
    loaded here to train the logistic parameters, but you can use this 
    functionality as a template to train your own

    :ARGS:

        :class:`NoneType`

    :RETURNS:

        a fitted :class:`statsmodels.Logit` Logistic regression that 
        has been fit to the trained data
    """
    f = pandas.ExcelFile('../data/training_data/Trained Data.xlsx')
    data = reduce(
        lambda a, b: numpy.vstack([a, b]),
        map(lambda x: f.parse(x, index_col=0)[['ln_chg', 'Y']], f.sheet_names))
    data = pandas.DataFrame(data, columns=['ln_chg', 'Y'])

    # add an intercept for the model (required by statsmodels.api.Logit)
    data['intercept'] = 1.0

    #fit the model
    logit_model = Logit(endog=data['Y'], exog=data[['intercept', 'ln_chg']])
    return logit_model.fit()
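A minimal usage sketch (assuming the imports above and the same exog layout used in training; the ln_chg values below are made up for illustration): the fitted result is a regular statsmodels results object, so new rows just need an intercept column next to ln_chg.

fitted = get_trained_logit_model()
new_obs = pandas.DataFrame({'intercept': 1.0, 'ln_chg': [0.001, -0.02, 0.015]})
probs = fitted.predict(new_obs[['intercept', 'ln_chg']])  # P(Y = 1), i.e. "not white noise"
print(fitted.params)
print(probs)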
Example No. 2
def prs_betaci(q, prs, df):
    # assumes globals from the surrounding script: is_bin, models, covariates,
    # phe_code, np, expit, Logit, OLS and PerfectSeparationError
    (q0, q1) = q
    we_print = (q0 == 2)
    q0 = df[prs].quantile((100 - q0) / 100.0)  # our q counts from the top (1 = top percentile), pandas counts from the bottom
    q1 = df[prs].quantile((100 - q1) / 100.0)
    q40 = df[prs].quantile(0.4)
    q60 = df[prs].quantile(0.6)
    iids = df.index[((q0 <= df[prs]) & (df[prs] <= q1)) | ((q40 <= df[prs]) & (df[prs] <= q60))]
    if is_bin:
        data = np.vstack((expit(models['PRS']['COVAR']['train'].predict(df.loc[iids, covariates])),
                          (q0 <= df.loc[iids, prs]) & (df.loc[iids, prs] <= q1))).T
        try:
            m = Logit(df.loc[iids, phe_code], data).fit(disp=0)
        except PerfectSeparationError:
            return None, (None, None), None
        b = np.exp(m.params[1])
        ci = np.abs(np.exp(m.conf_int().iloc[1, :].values) - b)
    else:
        data = np.vstack((models['PRS']['COVAR']['train'].predict(df.loc[iids, covariates]),
                          (q0 <= df.loc[iids, prs]) & (df.loc[iids, prs] <= q1))).T
        m = OLS(df.loc[iids, phe_code], data).fit()
        b = m.params[1]
        ci = np.abs(m.conf_int().iloc[1, :].values - b)
    if we_print:
        print(b, [b - ci[0], b + ci[1]])
    return b, ci, df.loc[(q0 <= df[prs]) & (df[prs] <= q1), phe_code].mean()
Example No. 4
    def best_in_group(self, newvars, basevars=None):
        ''' Get the best variable for score among a set of new variables '''

        if not basevars and self.add_cons:
            basevars = ['_cons']
        elif basevars and self.add_cons:
            basevars = basevars + ['_cons']
        elif not basevars and not self.add_cons:
            raise ValueError(
                'Must specify at least one covariate for baseline model')

        origmod = Logit(self.data[self.outcome],
                        self.data[basevars],
                        missing='drop').fit(disp=False)
        list_llf = []
        for cc in newvars:
            try:
                newmod = Logit(self.data[self.outcome],
                               self.data[basevars + [cc]],
                               missing='drop').fit(disp=False)
                if newmod.nobs / origmod.nobs < .95:
                    warnings.warn('Using {} causes more than 5% '\
                                  'of the sample to be dropped'.format(cc))
                list_llf.append(newmod.llf)
            except Exception:
                if cc not in self.dropped_vars:
                    self.dropped_vars.append(cc)
                list_llf.append(origmod.llf)
        idx = list_llf.index(max(list_llf))

        return newvars[idx], 2 * (list_llf[idx] - origmod.llf)
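The second return value is a likelihood-ratio statistic for adding a single covariate, so a p-value can be read off a chi-square distribution with one degree of freedom; a hedged sketch (the instance and column names below are hypothetical):

from scipy.stats import chi2

best_var, lr_stat = selector.best_in_group(['age', 'income'])  # 'selector' is a hypothetical instance of this class
p_value = chi2.sf(lr_stat, df=1)
print(best_var, lr_stat, p_value)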
Example No. 5
def model_fit(store_path,
              X_df_path,
              y_df_path,
              feature_key="Gender",
              X_cols=[],
              testing=False,
              include_prc=False):
    if testing:
        # If testing, just print the X and y columns
        print(store_path, X_df_path, y_df_path, feature_key, X_cols)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    ## Not testing. Fit the models and return the measures
    print(store_path, X_df_path, y_df_path, feature_key, X_cols)
    X = pd.read_hdf(store_path, key=X_df_path, columns=X_cols)
    y = pd.read_hdf(store_path, key=y_df_path)
    print("Created dataframes, feature_key=%s" % feature_key)
    print("X.shape = %s, y.shape = %s" % (X.shape, y.shape))
    model = Logit(y, X)
    res = model.fit()
    predict = res.predict()
    measures = get_all_eval_measures(predict,
                                     model.endog,
                                     include_prc=include_prc)
    measures["llf"] = res.llf
    measures["aic"] = res.aic
    measures["bic"] = res.bic
    measures["prsquared"] = res.prsquared
    measures["df_model"] = res.df_model
    return feature_key, (measures, res.summary2())
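get_all_eval_measures is project-specific and not shown here; a rough, hypothetical stand-in built on sklearn (not the project's actual API) could look like this:

# Illustrative stand-in for get_all_eval_measures; names and signature are assumptions.
from sklearn.metrics import accuracy_score, roc_auc_score

def basic_eval_measures(predicted_probs, y_true, threshold=0.5):
    return {
        "auc": roc_auc_score(y_true, predicted_probs),
        "accuracy": accuracy_score(y_true, predicted_probs >= threshold),
    }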
Example No. 6
def run_LR(model_dir, trainSet, testSet, timestep):
    # get shape
    H, W, C = trainSet.shape[1], trainSet.shape[2], trainSet.shape[3]
    train_len, test_len = trainSet.shape[0], testSet.shape[0]

    # get XY features
    trainX, trainY = getXSYS(trainSet, timestep)
    testX, testY = getXSYS(testSet, timestep)

    print('Train set shape: X/Y', trainX.shape, trainY.shape)
    print('Test set shape: X/Y', testX.shape, testY.shape)

    # check data imbalance
    neg, pos = np.bincount(trainX.flatten())
    weight_ratio = neg / pos
    print('Weight ratio:', round(weight_ratio, 5))

    # logit
    logit_model = Logit(trainY, trainX)
    result = logit_model.fit()
    print(result.summary2())

    # LR
    logreg = LogisticRegression(
        class_weight={1: weight_ratio})  # balance pos/neg in training set
    logreg.fit(trainX, trainY)
    predY = logreg.predict(testX)

    y_true = testY.reshape((-1, H, W, C))
    y_pred = predY.reshape((-1, H, W, C))
    print('#Positive predictions: ', y_pred[y_pred != 0].shape[0], '\n')

    return y_true, y_pred
Example No. 7
def _fit_logit(X, y):
    metadata = {}
    lm = Logit(y, X)
    try:
        flm = lm.fit(method='bfgs')
        logging.info(flm.summary())
        output = format_output(flm)
        metadata = {
            'summary': str(flm.summary()),
            'summary2': str(flm.summary2())
        }
    except (np.linalg.LinAlgError, PerfectSeparationError,
            ValueError) as e:
        # Perfect separation or singular matrix - use NaN
        logging.warning(e)
        output = {
            col: {
                "coef": None,
                "std_err": None,
                "t_values": None,
                "p_values": None,
            }
            for col in X.columns
        }

    return output, metadata
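The except clause above exists because Logit can fail outright on separable data; a self-contained sketch of the failure mode (exact behaviour varies by statsmodels version, which may raise PerfectSeparationError or only emit a warning):

import numpy as np
from statsmodels.api import Logit, add_constant

x = np.arange(6, dtype=float)
y = (x > 2).astype(float)  # y is 1 exactly when x > 2, so the MLE does not exist
try:
    Logit(y, add_constant(x)).fit(disp=0)
except Exception as exc:  # PerfectSeparationError on older statsmodels versions
    print(type(exc).__name__, exc)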
Example No. 8
def model_fit(y, X, X_cols, y_col, feature_key="Gender", testing=False, include_prc=False):
    if testing:
        # If testing, just print the X and y columns
        print(X_cols, y_col)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    ## Not testing. Fit the models and return the measures
    print(feature_key, X.shape, X_cols, y.shape, y_col)
    X = pd.DataFrame(X, columns=X_cols)
    y = pd.Series(y, name=y_col)
    print("Created dataframes.")
    model = Logit(y, X)
    res = model.fit()
    measures = get_all_eval_measures(res, model.endog, include_prc=include_prc)
    return feature_key, (measures, res.summary2())
Example No. 9
def validate_data_predictors(data,
                             outcome,
                             predictors,
                             probabilities,
                             survival_time=False):
    """Validates that for each predictor column, all values are within the range 0-1

    Notes
    -----
    If a predictor has probability `True`, checks that the column `data[predictor]` has all values in the appropriate range.
    If a predictor has probability `False`, converts all values in that column with logistic regression

    Parameters
    ----------
    data : pd.DataFrame
        the data set
    outcome : str
        the column to use as 'outcome'
    predictors : list(str)
        the list of predictors for the analysis
    probabilities: list(bool)
        list marking whether a predictor is a probability
    survival_time : bool
        if the analysis is a survival time analysis
    """
    for i in range(0, len(predictors)):
        if probabilities[i]:
            #validate that any predictors with probability TRUE are b/t 0 and 1
            if (max(data[predictors[i]]) > 1) or (min(data[predictors[i]]) <
                                                  0):
                raise ValueError("{val} must be between 0 and 1".format(
                    val=repr(predictors[i])))
        else:
            if survival_time:
                from statsmodels.sandbox.cox import CoxPH
                #TODO
            else:
                from statsmodels.api import Logit
                # predictor is not a probability, convert to fitted probabilities
                # from a univariate logistic regression
                model = Logit(data[outcome], data[predictors[i]])
                data[predictors[i]] = model.fit().predict()
    return data
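For reference, the conversion above relies on the fitted results' predict() returning in-sample probabilities in [0, 1]; a small self-contained check (synthetic data, illustrative column names):

import numpy as np
import pandas as pd
from statsmodels.api import Logit

rng = np.random.default_rng(0)
frame = pd.DataFrame({'marker': rng.normal(size=200)})
frame['outcome'] = (frame['marker'] + rng.normal(size=200) > 0).astype(int)
probs = Logit(frame['outcome'], frame[['marker']]).fit(disp=0).predict()
assert probs.min() >= 0 and probs.max() <= 1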
Example No. 10
    def _fit_backward(self):

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = Logit(y_train, X_train, missing='drop')

        results = model.fit(**self._model_params)

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = Logit(y_train, X_train, missing='drop')
            results = model.fit(**self._model_params)
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return
Example No. 11
    def _model(x: np.array, y: np.array, model: str) -> OLS or Logit:
        """
        :param x: n-D array
        :param y: 1-D array
        :param model: {'linear' or 'logistic'}
        :return: regression model
        """
        if model in ('linear', 'regression'):
            model_ = OLS(y, x).fit()
        else:
            model_ = Logit(y, x).fit()

        return model_
Example No. 12
    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print(
                    'Warning: specified variable to scale, %s, is not included in model covariates'
                    % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index,
                             columns=['const']), model_mat
            ],
                                  axis=1)

        self.endog_matrix = model_mat[self.endog_name]
        self.exog_matrix = model_mat[[
            c for c in model_mat.columns if c != self.endog_name
        ]]

        self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)
Example No. 13
def logit_fit(x_data, y, name='train'):
    """拟合逻辑回归,并绘制 gini,ks 曲线  \n
    参数:
    ----------
    x_data: dataframe, 已清洗好的训练数据的特征变量,函数会自动补上常数项  \n
    y: series or 1darray, 目标变量   \n
    name: 训练模型的名字  \n
    返回值:
    ----------
    result: statsmodel.api.Logit.fit() 返回结果对象  \n
    model_eval: ModelEval, 模型评估对象"""
    model_data = add_constant(x_data)
    logit_reg = Logit(y, model_data)
    result = logit_reg.fit(disp=False)

    prob = result.predict(model_data)
    model_eval = ModelEval(-prob, y, name, plot=False)

    a = "************************************"
    print(a + "  " + name + "  " + a)
    print(result.summary2())
    model_eval.giniks_plot()
    return result, model_eval
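ModelEval is a project-specific helper, so only the statsmodels side is sketched here; x_train and y_train are hypothetical, already-cleaned inputs:

result, evaluation = logit_fit(x_train, y_train, name='train')
print(result.params)                                      # coefficients, including the added constant
in_sample_probs = result.predict(add_constant(x_train))   # probabilities on the training data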
Example No. 14
def validate_data_predictors(data, outcome, predictors, probabilities, survival_time=False):
    """Validates that for each predictor column, all values are within the range 0-1

    Notes
    -----
    If a predictor has probability `True`, checks that the column `data[predictor]` has all values in the appropriate range.
    If a predictor has probability `False`, converts all values in that column with logistic regression

    Parameters
    ----------
    data : pd.DataFrame
        the data set
    outcome : str
        the column to use as 'outcome'
    predictors : list(str)
        the list of predictors for the analysis
    probabilities: list(bool)
        list marking whether a predictor is a probability
    survival_time : bool
        if the analysis is a survival time analysis
    """
    for i in range(0, len(predictors)):
        if probabilities[i]:
            #validate that any predictors with probability TRUE are b/t 0 and 1
            if (max(data[predictors[i]]) > 1) or (min(data[predictors[i]]) < 0):
                raise ValueError("{val} must be between 0 and 1"
                                 .format(val=repr(predictors[i])))
        else:
            if survival_time:
                from statsmodels.sandbox.cox import CoxPH
                #TODO
            else:
                from statsmodels.api import Logit
                # predictor is not a probability, convert to fitted probabilities
                # from a univariate logistic regression
                model = Logit(data[outcome], data[predictors[i]])
                data[predictors[i]] = model.fit().predict()
    return data
Example No. 15
    def fit(self):
        """
        Fit the model; save and report results. This currently uses the Statsmodels
        Logit class with default estimation settings. (It will shift to ChoiceModels
        once more infrastructure is in place.)
        
        The `fit()` method can be run as many times as desired. Results will not be saved 
        with Orca or ModelManager until the `register()` method is run. 
        
        Parameters
        ----------
        None
        
        Returns
        -------
        None
        
        """
        # TO DO - verify that params are in place for estimation

        # Workaround for a temporary statsmodels bug:
        # https://github.com/statsmodels/statsmodels/issues/3931
        from scipy import stats
        stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

        df = get_data(tables=self.tables,
                      filters=self.filters,
                      model_expression=self.model_expression)

        m = Logit.from_formula(data=df, formula=self.model_expression)
        results = m.fit()

        self.name = self._generate_name()
        self.summary_table = str(results.summary())
        print(self.summary_table)

        # For now, we can just save the summary table and the fitted parameters. Later on
        # we will probably want programmatic access to more details about the fit (e.g.
        # for autospec), but we can add that when it's needed.

        self.fitted_parameters = results.params.tolist()  # params is a pd.Series
Example No. 16
import numpy as np
import pandas as pd

# 'Tip' (the tips dataset) and 'dummyDf' (dummy-encoded categorical columns) are
# assumed to have been created earlier in the script
FullRaw = Tip.drop(['sex', 'day', 'time'], axis=1)
FullRaw = pd.concat([FullRaw, dummyDf], axis=1)
FullRaw['smoker'] = np.where(FullRaw['smoker'] == 'No', 1, 0)

from sklearn.model_selection import train_test_split

Train, Test = train_test_split(FullRaw, test_size=0.3, random_state=123)

Train_X = Train.drop(['smoker'], axis=1)
Train_Y = Train['smoker'].copy()
Test_X = Test.drop(['smoker'], axis=1)
Test_Y = Test['smoker'].copy()

from statsmodels.api import Logit

M1_Model = Logit(Train_Y, Train_X).fit()
print(M1_Model.summary())

Test_pred = M1_Model.predict(Test_X)

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

Test['Test_prob'] = Test_pred
Test['Test_Class'] = np.where(Test['Test_prob'] > 0.5, 1, 0)

Con_Mat = confusion_matrix(Test_Y, Test['Test_Class'])  # rows: actual, columns: predicted
Accuracy = sum(np.diag(Con_Mat)) / Test_Y.shape[0] * 100

from sklearn.metrics import roc_auc_score, roc_curve

ROC = roc_auc_score(Test_Y, Test['Test_prob'])  # AUC computed from predicted probabilities
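roc_curve is imported above but never used; a short sketch of plotting the full curve from the predicted probabilities (matplotlib assumed available):

import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(Test_Y, Test['Test_prob'])
plt.plot(fpr, tpr, label='AUC = %.3f' % ROC)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()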
Example No. 17
    def __init__(self,
                 outcome,
                 test_vars,
                 df,
                 init_vars=None,
                 add_cons=True,
                 disp=True,
                 cutoff_ord1=1,
                 cutoff_ord2=2.71,
                 t_strata=1,
                 n_min='auto'):

        # double checking some inputs
        if type(outcome) != str:
            raise ValueError(
                'outcome must be a string variable name in the DataFrame.')
        if type(test_vars) != list:
            raise ValueError('test_vars must be a list of covariates to test.')

        self.outcome = outcome
        self.test_vars = test_vars
        self.add_cons = add_cons
        self.init_vars = init_vars

        if init_vars and type(init_vars) == str:
            covs = [init_vars] + test_vars
        elif init_vars and type(init_vars) == list:
            covs = init_vars + test_vars
        else:
            covs = test_vars

        if n_min == 'auto':
            n_min_strata = len(covs) + 2
            n_min_tc = 3
        else:
            if type(n_min) != dict:
                raise ValueError('n_min must be "auto" or a dictionary')
            elif ('n_min_tc' not in n_min) or ('n_min_strata' not in n_min):
                raise ValueError('Must specify both n_min_strata (ex. K+2) '\
                                    'and n_min_tc (ex. 3)')
            n_min_strata = n_min['n_min_strata']
            n_min_tc = n_min['n_min_tc']

        if 'propscore' in covs + [outcome] or 'logodds' in covs + [outcome]:
            raise ValueError(
                'You cannot have variables labeled "propscore" or "logodds"')

        data = df[[outcome] + covs].copy()

        ord2_vars = []
        dropped_vars = []
        # looping through covariates
        for idx, cc in enumerate(covs):
            # first a gut check to make sure all the variables aren't singular
            if len(data[cc].dropna().unique()) == 1:
                raise ValueError('{} only takes on one value'.format(cc))

            # for all variables generate the interaction terms
            if idx < len(covs):
                for jj in covs[idx + 1:]:
                    testvar = data[cc] * data[jj]
                    if (not testvar.equals(data[cc])
                            and not testvar.equals(data[jj])
                            and len(testvar.dropna().unique()) > 1):
                        data.loc[:, 'X'.join([cc, jj])] = testvar
                        ord2_vars.append('X'.join([cc, jj]))
                    else:
                        dropped_vars.append('X'.join([cc, jj]))

            # for continuous variables, generate squared term
            if not data[cc].equals(data[cc]**2):
                data.loc[:, '{}_sq'.format(cc)] = data[cc]**2
                ord2_vars.append('{}_sq'.format(cc))
            else:
                dropped_vars.append('{}_sq'.format(cc))

        if add_cons:
            data.loc[:, '_cons'] = 1

        self.data = data
        self.dropped_vars = dropped_vars
        self.test_vars_ord2 = ord2_vars

        # =====================================================================
        # Actually calculating propensity score
        # =====================================================================
        linear = self.model_from_group(self.test_vars,
                                       cutoff=cutoff_ord1,
                                       init_vars=self.init_vars)

        squared = self.model_from_group(ord2_vars,
                                        cutoff=cutoff_ord2,
                                        init_vars=linear)

        if add_cons:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared + ['_cons']],
                               missing='drop').fit(disp=False)
        else:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared],
                               missing='drop').fit(disp=False)

        self.logodds = self.model.fittedvalues.rename('logodds')
        self.propscore = Series(self.model.predict(),
                                index=self.logodds.index,
                                name='propscore')
        self.trim_range = self.calc_trim(self.propscore)
        self.in_trim = (
            self.propscore.ge(self.trim_range[0])
            & self.propscore.le(self.trim_range[1])).rename('in_trim')
        self.strata = self.stratify(self.data[self.outcome],
                                    self.logodds,
                                    t_max=t_strata,
                                    n_min_strata=n_min_strata,
                                    n_min_tc=n_min_tc)

        if disp:
            print(self.model.summary())
            print('The following vars were infeasible: {}'.format(', '.join(
                self.dropped_vars)))
            print('Stratification produced {} strata'.format(
                len(self.strata.dropna().unique())))
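A small consistency check on the two attributes computed above: for statsmodels Logit results, fittedvalues is the linear predictor (log-odds) and predict() applies the logistic link, so the propensity score is just the logistic transform of the log-odds. A self-contained sketch on synthetic data:

import numpy as np
from scipy.special import expit
from statsmodels.api import Logit, add_constant

rng = np.random.default_rng(1)
X = add_constant(rng.normal(size=(100, 2)))
y = (X[:, 1] + rng.normal(size=100) > 0).astype(int)
res = Logit(y, X).fit(disp=0)
assert np.allclose(expit(res.fittedvalues), res.predict())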
Example No. 18
    def fit(self,
            data,
            formula,
            categorical_variables=None,
            max_iterations=100,
            show_results=True,
            confidence_intervals=True,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit model to the given data using formula.

        Parameters
        ----------
        data : pd.DataFrame 
            Data to fit a model  
        formula : str 
            Formula of a model specification, e.g. 'y ~ x1 + x2'; 
            should be passed either in Patsy (statsmodels) notation
            or using the following rules: 
            '*' for interaction of the variables,
            ':' for interaction & main effects, 
            i.e., 'y ~ x:z' is equivalent to 'y ~ x + z + x*z' (unlike in Patsy notation).
            If you use Patsy notation, please specify the parameter use_patsy_notation=True.
        categorical_variables : list 
            List of names of the variables that should be considered categorical.
            These variables would be automatically converted into sets of dummy variables.
            If you want to use this option, please make sure that you don't have nested names of variables
            (e.g. 'imdb' and 'imdb_rate' at the same time), otherwise this option results in an incorrect procedure.
        max_iterations : int 
            Maximum iterations for convergence
        show_results : bool 
            Whether to show results of analysis
        confidence_intervals : bool 
            Whether to include coefficients' confidence intervals in the summary table
        use_patsy_notation : bool 
            Turn this on if you use strictly Patsy's rules to define a formula.
            See more: https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int 
            Number of digits to round results when showing them

        Returns
        -------
        self
            The current instance of the BinaryLogisticRegression class
        """

        self._data = data.copy()

        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self.max_iterations = max_iterations

        if '=' in formula:
            formula = formula.replace('=', '~')

        if not use_patsy_notation:
            formula = formula.replace('*', '^').replace(':',
                                                        '*').replace('^', ':')

        self.formula = formula

        self.dependent_variable = self.formula.split('~')[0].strip()

        dep_cats = get_categories(self._data[self.dependent_variable])
        self._dep_cats = dep_cats

        if len(dep_cats) != 2:
            raise ValueError(
                f"""A dependent variable should have exactly 2 unique categories.
            The provided variable has {len(dep_cats)}.""")

        self._mapper = {dep_cats[0]: 0, dep_cats[1]: 1}
        self._inv_mapper = {0: dep_cats[0], 1: dep_cats[1]}

        if not is_numeric_dtype(self._data[self.dependent_variable]):
            self._data[self.dependent_variable] = self._data[
                self.dependent_variable].map(self._mapper).astype(int)

        #won't work correctly if some variables have nested names (e.g. kinopoisk_rate and kinopoisk_rate_count)
        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    f"""Categorical variables should be passed as list.
                Type {type(categorical_variables)} was passed instead.""")
            else:
                for variable in categorical_variables:
                    formula = formula.replace(variable, f'C({variable})')
        self._optimizer = 'newton'
        try:
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)
        except np.linalg.LinAlgError:
            self._optimizer = 'bfgs'
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)

        self._model_params = {
            'maxiter': self.max_iterations,
            'warn_convergence': False,
            'disp': False,
            'method': self._optimizer,
            'full_output': True
        }

        self._observations_idx = list(self._model.fittedvalues.index)
        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = Logit(y, X, missing='drop').fit(**self._model_params)
            self.variables_excluded = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()

        self.predictions = self.predict()
        self.classification_table = self.get_classification_table()
        self.precision_and_recall = self.get_precision_and_recall()

        if show_results:
            self.show_results(n_decimals)

        if len(self.variables_excluded) > 0:
            print('------------------\n')
            print(
                f"Following variables were excluded due to zero variance: {'; '.join(self.variables_excluded)}"
            )

        return self
Example No. 19
class BinaryLogisticRegression:
    """
    Class for binary logistic regression models based on the excellent statsmodels package.
    
    Parameters
    ----------
    method : 'enter' or 'backward' 
        Method for predictors selection 
    include_constant : bool 
        (CURRENTLY UNAVAILABLE) Whether to include a constant in the model
    classification_cutoff : float 
        Minimum probability to assign a prediction value 1
    sig_level_entry : float 
        (CURRENTLY UNAVAILABLE) Max significance level to include a predictor in the model
    sig_level_removal : float 
        Min significance level to exclude predictor from the model

    Attributes
    ----------
    predictions : pd.Series
        Predicted values
    classification_table : pd.DataFrame
        A classification table
    precision_and_recall : pd.DataFrame
        Table with precision, recall, and F1-score of the model
    variables_excluded : list
        Variables excluded because of zero variance
    variables_included : list
        Variables included in a model
    N : int
        Number of observations included in a model
    r2_pseudo_macfadden : float
        McFadden's pseudo coefficient of determination
    r2_pseudo_cox_snell : float
        Cox&Snell's pseudo coefficient of determination
    r2_pseudo_nagelkerke : float
        Nagelkerke's pseudo coefficient of determination
    loglikelihood : float
        -2LL
    coefficients : pd.Series
        Regression coefficients
    coefficients_sterrors : pd.Series
        Standard errors of regression coefficients
    coefficients_wald_statistics : pd.Series
        Wald statistic of regression coefficients
    coefficients_zvalues : pd.Series
        z-statistic of regression coefficients
    coefficients_pvalues : pd.Series
        P-values of regression coefficients
    coefficients_exp : pd.Series
        e ** regression coefficients
    """
    def __init__(
        self,
        method='enter',
        include_constant=True,
        classification_cutoff=0.5,
        sig_level_entry=0.05,
        sig_level_removal=0.05,
    ):
        self.method = method.lower().strip()
        self.include_constant = include_constant
        self.classification_cutoff = classification_cutoff
        self.sig_level_entry = sig_level_entry
        self.sig_level_removal = sig_level_removal

    def fit(self,
            data,
            formula,
            categorical_variables=None,
            max_iterations=100,
            show_results=True,
            confidence_intervals=True,
            use_patsy_notation=False,
            n_decimals=3):
        """
        Fit model to the given data using formula.

        Parameters
        ----------
        data : pd.DataFrame 
            Data to fit a model  
        formula : str 
            Formula of a model specification, e.g. 'y ~ x1 + x2'; 
            should be passed either in Patsy (statsmodels) notation
            or using the following rules: 
            '*' for interaction of the variables,
            ':' for interaction & main effects, 
            i.e., 'y ~ x:z' is equivalent to 'y ~ x + z + x*z' (unlike in Patsy notation).
            If you use Patsy notation, please specify the parameter use_patsy_notation=True.
        categorical_variables : list 
            List of names of the variables that should be considered categorical.
            These variables would be automatically converted into sets of dummy variables.
            If you want to use this option, please make sure that you don't have nested names of variables
            (e.g. 'imdb' and 'imdb_rate' at the same time), otherwise this option results in an incorrect procedure.
        max_iterations : int 
            Maximum iterations for convergence
        show_results : bool 
            Whether to show results of analysis
        confidence_intervals : bool 
            Whether to include coefficients' confidence intervals in the summary table
        use_patsy_notation : bool 
            Turn this on if you use strictly Patsy's rules to define a formula.
            See more: https://patsy.readthedocs.io/en/latest/quickstart.html
        n_decimals : int 
            Number of digits to round results when showing them

        Returns
        -------
        self
            The current instance of the BinaryLogisticRegression class
        """

        self._data = data.copy()

        self.categorical_variables = categorical_variables
        self._show_ci = confidence_intervals
        self.max_iterations = max_iterations

        if '=' in formula:
            formula = formula.replace('=', '~')

        if not use_patsy_notation:
            formula = formula.replace('*', '^').replace(':',
                                                        '*').replace('^', ':')

        self.formula = formula

        self.dependent_variable = self.formula.split('~')[0].strip()

        dep_cats = get_categories(self._data[self.dependent_variable])
        self._dep_cats = dep_cats

        if len(dep_cats) != 2:
            raise ValueError(
                f"""A dependent variable should have exactly 2 unique categories.
            The provided variable has {len(dep_cats)}.""")

        self._mapper = {dep_cats[0]: 0, dep_cats[1]: 1}
        self._inv_mapper = {0: dep_cats[0], 1: dep_cats[1]}

        if not is_numeric_dtype(self._data[self.dependent_variable]):
            self._data[self.dependent_variable] = self._data[
                self.dependent_variable].map(self._mapper).astype(int)

        #won't work correctly if some variables have nested names (e.g. kinopoisk_rate and kinopoisk_rate_count)
        if categorical_variables is not None:
            if not isinstance(categorical_variables, list):
                raise ValueError(
                    f"""Categorical variables should be passed as list.
                Type {type(categorical_variables)} was passed instead.""")
            else:
                for variable in categorical_variables:
                    formula = formula.replace(variable, f'C({variable})')
        self._optimizer = 'newton'
        try:
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)
        except np.linalg.LinAlgError:
            self._optimizer = 'bfgs'
            self._model = logit(formula=formula, data=self._data).fit(
                maxiter=self.max_iterations,
                warn_convergence=False,
                disp=False,
                method=self._optimizer,
                full_output=True)

        self._model_params = {
            'maxiter': self.max_iterations,
            'warn_convergence': False,
            'disp': False,
            'method': self._optimizer,
            'full_output': True
        }

        self._observations_idx = list(self._model.fittedvalues.index)
        self.variables_excluded = self._identify_variables_without_variation()

        if len(self.variables_excluded) > 0:
            y = pd.Series(self._model.model.endog.copy(),
                          index=self._observations_idx,
                          name=self.dependent_variable)
            X = self._remove_variables_without_variation()
            self._model = Logit(y, X, missing='drop').fit(**self._model_params)
            self.variables_excluded = [
                BinaryLogisticRegression._translate_from_patsy_notation(x)
                for x in self.variables_excluded
            ]

        if self.method == 'backward':
            self._fit_backward()

        self._get_statistics_from_model()

        self.predictions = self.predict()
        self.classification_table = self.get_classification_table()
        self.precision_and_recall = self.get_precision_and_recall()

        if show_results:
            self.show_results(n_decimals)

        if len(self.variables_excluded) > 0:
            print('------------------\n')
            print(
                f"Following variables were excluded due to zero variance: {'; '.join(self.variables_excluded)}"
            )

        return self

    def _fit_backward(self):

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = Logit(y_train, X_train, missing='drop')

        results = model.fit(**self._model_params)

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = Logit(y_train, X_train, missing='drop')
            results = model.fit(**self._model_params)
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return

    def _identify_variables_without_variation(self):
        if self.include_constant:
            mask = self._model.model.exog.var(axis=0)[1:] == 0
        else:
            mask = self._model.model.exog.var(axis=0) == 0

        variables_included = [
            x for x in list(self._model.params.index) if x != 'Intercept'
        ]

        return list(np.array(variables_included)[mask])

    def _remove_variables_without_variation(self):
        X = pd.DataFrame(self._model.model.exog,
                         columns=self._model.model.exog_names,
                         index=self._observations_idx)
        X = X.drop(self.variables_excluded, axis=1)
        return X

    @staticmethod
    def _translate_from_patsy_notation(effect):
        effect = effect\
        .replace(':', ' * ')\
        .replace('C(', '')\
        .replace('T.', '')\
        .replace('[', ' = "')\
        .replace(']', '"')\
        .replace(')', '')

        return effect

    def _get_statistics_from_model(self):

        self.N = self._model.nobs
        self.r2_pseudo_macfadden = self._model.prsquared
        self.r2_pseudo_cox_snell = 1 - np.exp(-self._model.llr / self.N)
        self.r2_pseudo_nagelkerke = self.r2_pseudo_cox_snell / (
            1 - np.exp(-(-2 * self._model.llnull) / self.N))
        self.loglikelihood = -2 * self._model.llf

        self.coefficients = self._model.params.copy()
        self.coefficients_sterrors = self._model.bse.copy()
        self.coefficients_wald_statistics = self._model.tvalues.copy()**2
        self.coefficients_zvalues = self._model.tvalues.copy()
        self.coefficients_pvalues = self._model.pvalues.copy()
        self.coefficients_exp = self.coefficients.apply(np.exp)

        variables_included = [
            x for x in list(self.coefficients.index) if x != 'Intercept'
        ]
        self._variables_included_patsy = variables_included.copy()

        variables_included = [
            BinaryLogisticRegression._translate_from_patsy_notation(x)
            for x in variables_included
        ]

        self.variables_included = variables_included

        if self.include_constant:
            self._params_idx = ['Constant'] + variables_included
        else:
            self._params_idx = variables_included.copy()

        for stats in [
                self.coefficients, self.coefficients_pvalues,
                self.coefficients_sterrors, self.coefficients_zvalues,
                self.coefficients_wald_statistics, self.coefficients_exp
        ]:
            stats.index = self._params_idx

        return

    def summary(self):
        """
        Summary table with requested information related to regression coefficients.
        
        Returns
        -------
        pd.DataFrame
            A summary table
        """

        statistics = [
            self.coefficients, self.coefficients_sterrors,
            self.coefficients_wald_statistics, self.coefficients_pvalues,
            self.coefficients_exp
        ]

        columns = ['B', 'Std. Error', 'Wald', 'p-value', 'Exp(B)']

        if self._show_ci:
            statistics.append(self.coefficients_confidence_interval)
            columns.extend(list(self.coefficients_confidence_interval.columns))

        statistics = pd.concat(statistics, axis=1)

        statistics.columns = columns

        statistics.index = self._params_idx

        return statistics

    @property
    def coefficients_confidence_interval(self):

        ci = self._model.conf_int()
        ci.index = self._params_idx

        ci.columns = ['LB CI (95%)', 'UB CI (95%)']
        return ci

    def show_results(self, n_decimals):
        """
        Show results of the analysis in a readable form.
        
        Parameters
        ----------
        n_decimals : int 
            Number of digits to round results when showing them
        """
        phrase = 'method {}'

        print('\nLOGISTIC REGRESSION SUMMARY\n')
        if self._model.mle_retvals['converged']:
            print('Estimation converged successfully.')
        else:
            print('Estimation did NOT converge.')
            print('Please increase the number of iterations.')
        print('------------------\n')
        print('Dependent variable encoding')
        display(self.get_dependent_variable_codes().style\
                    .set_caption(phrase.format('.get_dependent_variable_codes()')))
        print('------------------\n')
        print('Model summary')
        display(self.summary_r2().style\
                    .set_caption(phrase.format('.summary_r2()'))\
                    .set_precision(n_decimals))
        print('------------------\n')
        print('Classification table')
        display(self.get_classification_table().style\
                    .set_caption(phrase.format('.get_classification_table()'))\
                    .set_precision(n_decimals))
        print('------------------\n')
        print('Precision and recall')
        display(self.get_precision_and_recall().style\
                    .set_caption(phrase.format('.get_precision_and_recall()'))\
                    .set_precision(n_decimals))
        print('------------------\n')
        print('Coefficients')
        display(self.summary().style\
                    .format(None, na_rep="")\
                    .set_caption(phrase.format('.summary()'))\
                    .set_precision(n_decimals))

    def summary_r2(self):
        """
        Summary table with information related to pseudo coefficients of determination.

        Returns
        -------
        pd.DataFrame
            A summary table
        """
        ll = self.loglikelihood
        mf = self.r2_pseudo_macfadden
        cs = self.r2_pseudo_cox_snell
        nk = self.r2_pseudo_nagelkerke

        statistics = [[ll, mf, cs, nk]]
        columns = [
            '-2 Log likelihood',
            "MacFadden's Pseudo R2",
            "Cox&Snell's Pseudo R2",
            "Nagelkerke's Pseudo R2",
        ]

        statistics = pd.DataFrame(statistics, columns=columns, index=[''])

        return statistics

    def get_dependent_variable_codes(self):
        """
        Get information on how categories of a dependent variable were encoded.

        Returns
        -------
        pd.DataFrame
            A table explaining encodings
        """
        mapper = self._mapper
        result = pd.DataFrame(
            [list(mapper.items())[0],
             list(mapper.items())[1]],
            columns=['Original value', 'Model value'],
            index=['', ' '])
        return result

    def get_classification_table(self):
        """
        Get a classification table.

        Returns
        -------
        pd.DataFrame
            A classification table
        """
        all_categories = self._dep_cats

        classification = pd.DataFrame(self._model.pred_table(),
                                      columns=self._dep_cats,
                                      index=self._dep_cats)

        classification.index.name = 'Observed'
        classification.columns.name = 'Predicted'
        classification['All'] = classification.sum(axis=1)
        classification.loc['All'] = classification.sum()

        n = classification.loc['All', 'All']
        for category in all_categories:
            classification.loc[category, 'All'] = classification.loc[
                category, category] / classification.loc[category, 'All'] * 100
            classification.loc[
                'All',
                category] = classification.loc['All', category] / n * 100

        classification.loc['All', 'All'] = np.diagonal(
            classification.loc[all_categories, all_categories]).sum() / n * 100
        classification.index = all_categories + ['Percent predicted']
        classification.index.name = 'Observed'
        classification.columns = all_categories + ['Percent correct']
        classification.columns.name = 'Predicted'
        return classification

    def get_precision_and_recall(self):
        """
        Estimate precision, recall, and F-score for all the categories.

        Returns
        -------
        pd.DataFrame
            A table with estimated metrics
        """

        preds = self.classification_table.iloc[:-1, :-1]
        results = []
        categories = list(preds.index)
        for current_category in categories:
            idx = [cat for cat in categories if cat != current_category]
            tp = preds.loc[current_category, current_category]
            fp = preds.loc[idx, current_category].sum()
            fn = preds.loc[current_category, idx].sum()
            if fp == 0:
                precision = 0
            else:
                precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            if precision + recall != 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0
            results.append([precision, recall, f1])
        results = pd.DataFrame(results,
                               index=categories,
                               columns=['Precision', 'Recall', 'F score'])
        results.loc['Mean'] = results.mean()
        return results

    def predict(
        self,
        data=None,
        group_membership=True,
        probability=False,
        logit=False,
        add_to_data=False,
    ):
        """
        Predict values of a dependent variable using the fitted model.
        
        Parameters
        ----------
        data : pd.DataFrame 
            Data for prediction; 
            may be not specified if you want to predict values for the same data that were used to fit a model
        group_membership : bool 
            Whether to predict observation's membership 
            to categories of a dependent variable
        probability : bool 
            Whether to predict exact probability
        logit : bool 
            Whether to predict a logit value 
        add_to_data : bool 
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name_memb = f'{self.dependent_variable} (predicted)'
        name_prob = f'{self.dependent_variable} (predicted prob.)'
        name_logit = f'{self.dependent_variable} (predicted logit)'

        all_columns = [name_memb, name_prob, name_logit]

        columns_to_show = []
        if group_membership:
            columns_to_show.append(name_memb)
        if probability:
            columns_to_show.append(name_prob)
        if logit:
            columns_to_show.append(name_logit)

        cutoff = self.classification_cutoff

        if data is None:
            data_init = self._data.copy()
            logit = self._model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=self._observations_idx,
                                  columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data_init, result], axis=1)
            else:
                return result

        else:
            # the boolean 'logit' argument shadows the module-level formula function here,
            # so re-import it under another name (assumes statsmodels.formula.api.logit,
            # matching how the model is built in fit())
            from statsmodels.formula.api import logit as logit_formula
            aux_model = logit_formula(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [BinaryLogisticRegression._translate_from_patsy_notation(x)\
                              for x in aux_data_cols]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            aux_X = add_constant(aux_data[self.variables_included].copy())
            aux_y = aux_model.model.endog.copy()

            aux_model = Logit(aux_y, aux_X,
                              missing='drop').fit(**self._model_params)

            logit = aux_model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=aux_data_idx, columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data, result], axis=1)
            else:
                return result

    def save_independent_variables(self, data=None, add_to_data=False):
        """
        Produce values of independent variable remained in a fitted model.
        This option is useful if you don't create dummy variables or interaction effects manually
        but want to use them in a further analysis. Only variables remained in a model are returned
        (those that are shown in a summary table).
        
        Parameters
        ----------
        data : pd.DataFrame 
            Data for which independent variables are requested; 
            may be not specified if you want to save values for the same data that were used to fit a model
        add_to_data : bool 
            Whether to merge new values with the given data.
            Currently, this option returns data with a sorted index
        
        Returns
        -------
        pd.DataFrame
            Values of independent variables
        """

        if data is None:
            data = self._data.copy()
            if self.include_constant:
                result = self._model.model.exog[:, 1:].copy()
            else:
                result = self._model.model.exog.copy()
            columns = [x for x in self.variables_included if x != 'Constant']
            result = pd.DataFrame(result,
                                  columns=columns,
                                  index=self._observations_idx)

        else:
            aux_model = logit(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [BinaryLogisticRegression._translate_from_patsy_notation(x)\
                              for x in aux_data_cols]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            result = aux_data[self.variables_included]

        if add_to_data:
            result = pd.concat([data, result], axis=1)

        return result

    def save_residuals(self,
                       unstandardized=True,
                       standardized=False,
                       logit=False,
                       deviance=False,
                       add_to_data=False):
        """
        Produce values of various residuals. Residuals are returned only for data used to fit a model.
        
        Parameters
        ----------
        unstandardized : bool 
            Whether to save unstandardized (raw) residuals
        standardized : bool 
            Whether to save standardized (z-scores) residuals
        logit : bool 
            Whether to save logit residuals
        deviance : bool 
            Whether to save deviance residuals
        add_to_data : bool 
            Whether to merge new values with data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Requested residuals
        """

        columns_to_show = [f'{k.capitalize().replace("ized", ".").replace("eted", ".").replace("_", " ")} res.' \
                           for k, v in vars().items() if v==True and k!='add_to_data']

        result = []

        res_unstand = self._model.resid_response
        res_unstand.name = 'Unstandard. res.'

        res_stand = self._model.resid_pearson
        res_stand.name = 'Standard. res.'

        res_deviance = self._model.resid_dev
        res_deviance.name = 'Deviance res.'

        preds_prob = self.predict(group_membership=False, probability=True)

        res_logit = res_unstand / (preds_prob * (1 - preds_prob)).iloc[:, 0]
        res_logit.name = 'Logit res.'

        result.extend([res_unstand, res_stand, res_deviance, res_logit])

        result = pd.concat(result, axis=1)

        result = result[columns_to_show].copy()

        if add_to_data:
            result = pd.concat([self._data, result], axis=1)

        return result
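A minimal usage sketch of the class above (the data file and column names are made up; get_categories, logit, add_constant and the other module-level helpers are assumed to be imported as in the original project):

df = pd.read_csv('customers.csv')  # hypothetical data with a binary 'churn' column
model = BinaryLogisticRegression(method='backward', sig_level_removal=0.1)
model.fit(df, 'churn ~ age + plan', categorical_variables=['plan'], show_results=False)
print(model.summary())                                    # coefficient table
predictions = model.predict(probability=True, add_to_data=False)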
Example No. 20
    def predict(
        self,
        data=None,
        group_membership=True,
        probability=False,
        logit=False,
        add_to_data=False,
    ):
        """
        Predict values of a dependent variable using the fitted model.
        
        Parameters
        ----------
        data : pd.DataFrame 
            Data for prediction; 
            may be not specified if you want to predict values for the same data that were used to fit a model
        group_membership : bool 
            Whether to predict observation's membership 
            to categories of a dependent variable
        probability : bool 
            Whether to predict exact probability
        logit : bool 
            Whether to predict a logit value 
        add_to_data : bool 
            Whether to merge predictions with the given data.
            Currently, this option returns data with a sorted index

        Returns
        -------
        pd.DataFrame
            Predictions
        """
        name_memb = f'{self.dependent_variable} (predicted)'
        name_prob = f'{self.dependent_variable} (predicted prob.)'
        name_logit = f'{self.dependent_variable} (predicted logit)'

        all_columns = [name_memb, name_prob, name_logit]

        columns_to_show = []
        if group_membership:
            columns_to_show.append(name_memb)
        if probability:
            columns_to_show.append(name_prob)
        if logit:
            columns_to_show.append(name_logit)

        cutoff = self.classification_cutoff

        if data is None:
            data_init = self._data.copy()
            logit = self._model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=self._observations_idx,
                                  columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data_init, result], axis=1)
            else:
                return result

        else:
            # note: the `logit` argument above shadows the statsmodels formula-API function,
            # so import it under a different name for the refit on the new data
            from statsmodels.formula.api import logit as smf_logit
            aux_model = smf_logit(self.formula, data).fit(**self._model_params)
            aux_data_idx = aux_model.fittedvalues.index
            aux_data_cols = aux_model.model.exog_names
            aux_data_cols = [BinaryLogisticRegression._translate_from_patsy_notation(x)\
                              for x in aux_data_cols]
            aux_data = pd.DataFrame(aux_model.model.exog,
                                    index=aux_data_idx,
                                    columns=aux_data_cols)
            aux_X = add_constant(aux_data[self.variables_included].copy())
            aux_y = aux_model.model.endog.copy()

            aux_model = Logit(aux_y, aux_X,
                              missing='drop').fit(**self._model_params)

            logit = aux_model.fittedvalues
            prob = logit.apply(lambda x: np.exp(x) / (1 + np.exp(x)))
            memb = prob.apply(lambda x: 1 if x >= cutoff else 0).map(
                self._inv_mapper)
            result = pd.DataFrame(index=aux_data_idx, columns=all_columns)
            result[name_memb] = memb
            result[name_prob] = prob
            result[name_logit] = logit
            result = result[columns_to_show]
            if add_to_data:
                return pd.concat([data, result], axis=1)
            else:
                return result
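# Hedged sketch (illustrative names, not part of the class above) of the mapping that
# predict() relies on: for a fitted statsmodels Logit result, fittedvalues live on the
# logit scale, expit() turns them into probabilities, and a cutoff turns the
# probabilities into predicted class membership.
from scipy.special import expit

def predictions_from_fit(fitted_logit_result, cutoff=0.5):
    logit_scores = fitted_logit_result.fittedvalues      # linear predictor X @ beta
    probs = expit(logit_scores)                          # exp(z) / (1 + exp(z)), numerically stable
    membership = (probs >= cutoff).astype(int)           # cutoff plays the role of classification_cutoff
    return logit_scores, probs, membership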
Ejemplo n.º 21
0
    def fit(self, X, y, print_detail=False):
        """Stepwise logistic regression. Use Score test for entry, Wald test for remove.
        参数:
        ----------
        X: array-like, n_sample * p_features. 特征变量数据集,程序会自动添加常数项
        y: array-like, 目标变量
		print_detail: bool, 是否打印出逐步回归选择变量的细节
		返回值:
		-----------
		result: 类型同 statsmodels.api.Logit 对象 fit 方法的返回值, 逐步回归选出的模型。"""

        def score_test(Xtest, y_true, y_predict):
            """对step forward进入的变量进行Score检验。函数假设新进入的变量放在最后.
            Xtest包括vars_old(似合模型并给出预测值y_predict的),和var_new(一个待检验的新变量)。
            Score检验假设待检验变量的系数为0,所以Xtest虽然包括了它的数据,但拟合参数是按没有此变量计算出来的。"""
            u = np.dot(Xtest.T, y_true - y_predict)  # 一阶导数
            h = np.dot(Xtest.T * (y_predict * (1 - y_predict)).values.reshape(len(y_predict)), Xtest)  # 二阶导数
            score = np.dot(np.dot(u.T, np.linalg.inv(h)), u)  # score 是 1*1 数组
            p_value = chi2.sf(score, 1)  # Score统计量服从自由度为1的卡方分布
            return score, p_value

        def print_wrap(*obj):
            if print_detail:
                print(*obj)

        X = add_constant(X)
        xenter = ['const']
        xwait = list(X.columns.drop('const'))
        logit_mod = Logit(y, X[xenter])
        logit_res = logit_mod.fit(disp=0)
        y_predict = logit_res.predict(X[xenter])
        step = 0
        while xwait:  # stopping condition 1: every variable has entered the model
            # entry test
            score = pd.Series(name='Score', dtype=float)
            pvalue = pd.Series(name='P>chi2', dtype=float)
            for xname in xwait:
                tmpX = X[xenter + [xname]]
                score[xname], pvalue[xname] = score_test(tmpX, y, y_predict)

            step += 1
            print_wrap("step {}: Variables Entry test:\n".format(step),
                       pd.concat([score, pvalue], axis=1))  # print progress information

            if pvalue.min() <= self.entry:  # bring the most significant variable in
                xin = pvalue.idxmin()
                xenter.append(xin)
                xwait.remove(xin)
                print_wrap("step {0}: {1} entered.\n".format(step, xin))
            else:  # stopping condition 2: no variable meets the entry criterion
                print_wrap("Stopped 2: No vars can get entered any more.\n")
                break

            # remove test
            while True:  # reaching this point means a new variable has just entered
                logit_mod = Logit(y, X[xenter])
                logit_res = logit_mod.fit(disp=0)
                y_predict = logit_res.predict(X[xenter])
                test = logit_res.wald_test_terms().dframe  # Wald test
                pvalue = test['P>chi2'].iloc[1:]  # the constant term is not included in the test

                step += 1
                print_wrap("step {}: Variables remove test:\n".format(step), test)

                if pvalue.max() < self.stay:
                    xout = None
                    print_wrap("step {}: No Variables removed:\n".format(step))
                    break  # all variables are significant; remove none
                else:
                    xout = pvalue.idxmax()
                    xenter.remove(xout)
                    xwait.append(xout)
                    print_wrap("step {0}: {1} removed.\n".format(step, xout))

            # stopping condition 3: the variable that just entered was removed again
            if xin == xout:
                print_wrap("Stopped 3: last var entered also got removed.\n")
                break
        else:
            print_wrap("Stopped 1: all var available got entered.\n")
        return Logit(y, X[xenter]).fit(disp=0)
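# Self-contained sketch of the entry criterion used in fit() above: the Rao score test
# for one candidate column. The parameters are fitted WITHOUT the candidate, which only
# enters through the score vector and information matrix. Data and names below are
# synthetic and purely illustrative.
import numpy as np
import pandas as pd
from scipy.stats import chi2
from statsmodels.api import Logit, add_constant

rng = np.random.default_rng(1)
X_demo = pd.DataFrame({'x1': rng.normal(size=300), 'x2': rng.normal(size=300)})
y_demo = (rng.uniform(size=300) <
          1 / (1 + np.exp(-(0.5 * X_demo['x1'] + 1.0 * X_demo['x2'])))).astype(int)

X_demo = add_constant(X_demo)
base = Logit(y_demo, X_demo[['const', 'x1']]).fit(disp=0)   # model without the candidate 'x2'
p_hat = base.predict(X_demo[['const', 'x1']])

X_test = X_demo[['const', 'x1', 'x2']].to_numpy()
u = X_test.T @ np.asarray(y_demo - p_hat)                   # score vector at beta_x2 = 0
w = np.asarray(p_hat * (1 - p_hat))
H = (X_test * w[:, None]).T @ X_test                        # expected information matrix
score_stat = float(u @ np.linalg.solve(H, u))
print(score_stat, chi2.sf(score_stat, 1))                   # 1 df: a single candidate column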
Ejemplo n.º 22
0
class PropensityScore:
    """
    Parameters
    ----------
    outcome : str
        This should be the name of the binary variable to predict.
    test_vars : list
        A list of the variables to test.
    df : DataFrame
        The pandas DataFrame that contains all of the data.
    init_vars : str or list, optional
        Variables to always have included in the propensity score. The default is None.
    add_cons : Boolean, optional
        Select this to add a constant to model. The default is True.
    disp : Boolean, optional
        Display the final model including dropped variables. The default is True.
    cutoff_ord1 : Numeric, optional
        The log gain cutoff for first order covariates. The default is 1.
    cutoff_ord2 : Numeric, optional
        The log gain cutoff for second order covariates. The default is 2.71.
    t_strata : Numeric, optional
        The cutoff for the t-statistic for the calculated strata. The default is 1.
    n_min : {'n_min_strata':int1,'n_min_tc':int2} or 'auto'
        The minimum number of units in each stratum and of treated/control individuals per stratum.
        The default is 'auto', in which case the minimum per stratum is the number of covariates
        tested in the propensity score (linear terms only) plus 2 (i.e. K+2),
        and the minimum number of treated and control individuals per stratum is 3.
        If not 'auto', the input must be a dictionary that explicitly specifies:
        {'n_min_strata':int1,'n_min_tc':int2}

    Raises
    ------
    ValueError
        If variables are improperly defined, this prints out warnings.

    Returns
    -------
    self.data : DataFrame
        This includes a new frame of just the outcome and potential covariates.
    self.dropped_vars : list
        The variables that did not make the cut for singularity reasons.
    self.model : sm.Logit.fit() model
        This is the raw model on the final set of variables from Statsmodels
    self.propscore : Series
        This is the propensity score as calculated by self.model.fittedvalues.
        This may not match dimension of data due to dropped missing values,
        but index will align properly.
    self.strata : Series
        The calculated strata. Missing propensity scores and values outside of
        min of treated group or max of control group are coded as NaN.
    self.logodds : Series
        The linearized propensity score. Will be the same dimension as propscore.
    self.test_vars_ord2: list
        The full list of tested second order variables for reference.
    self.trim_range : tuple
        The result of calculating the optimal trim min and max propensity score values.
    self.in_trim : Series (True/False)
        An array where True means that the propensity score falls within the
        trim min/max range.
    """
    def __init__(self,
                 outcome,
                 test_vars,
                 df,
                 init_vars=None,
                 add_cons=True,
                 disp=True,
                 cutoff_ord1=1,
                 cutoff_ord2=2.71,
                 t_strata=1,
                 n_min='auto'):

        # double checking some inputs
        if type(outcome) != str:
            raise ValueError(
                'y must be a string variable name in the DataFrame.')
        if type(test_vars) != list:
            raise ValueError('X must be a list of covariates to test.')

        self.outcome = outcome
        self.test_vars = test_vars
        self.add_cons = add_cons
        self.init_vars = init_vars

        if init_vars and type(init_vars) == str:
            covs = [init_vars] + test_vars
        elif init_vars and type(init_vars) == list:
            covs = init_vars + test_vars
        else:
            covs = test_vars

        if n_min == 'auto':
            n_min_strata = len(covs) + 2
            n_min_tc = 3
        else:
            if type(n_min) != dict:
                raise ValueError('n_min must be "auto" or a dictionary')
            elif ('n_min_tc' not in n_min) or ('n_min_strata' not in n_min):
                raise ValueError('Must specify both n_min_strata (ex. K+2) '\
                                    'and n_min_tc (ex. 3)')
            n_min_strata = n_min['n_min_strata']
            n_min_tc = n_min['n_min_tc']

        if 'propscore' in covs + [outcome] or 'logodds' in covs + [outcome]:
            raise ValueError(
                'You cannot have variables labeled "propscore" or "logodds"')

        data = df[[outcome] + covs].copy()

        ord2_vars = []
        dropped_vars = []
        # looping through covariates
        for idx, cc in enumerate(covs):
            # first a gut check to make sure all the variables aren't singular
            if len(data[cc].dropna().unique()) == 1:
                raise ValueError('{} only takes on one value'.format(cc))

            # for all variables generate the interaction terms
            if idx < len(covs):
                for jj in covs[idx + 1:]:
                    testvar = data[cc] * data[jj]
                    if (not testvar.equals(data[cc])
                            and not testvar.equals(data[jj])
                            and len(testvar.dropna().unique()) > 1):
                        data.loc[:, 'X'.join([cc, jj])] = testvar
                        ord2_vars.append('X'.join([cc, jj]))
                    else:
                        dropped_vars.append('X'.join([cc, jj]))

            # for continuous variables, generate squared term
            if not data[cc].equals(data[cc]**2):
                data.loc[:, '{}_sq'.format(cc)] = data[cc]**2
                ord2_vars.append('{}_sq'.format(cc))
            else:
                dropped_vars.append('{}_sq'.format(cc))

        if add_cons:
            data.loc[:, '_cons'] = 1

        self.data = data
        self.dropped_vars = dropped_vars
        self.test_vars_ord2 = ord2_vars

        # =====================================================================
        # Actually calculating propensity score
        # =====================================================================
        linear = self.model_from_group(self.test_vars,
                                       cutoff=cutoff_ord1,
                                       init_vars=self.init_vars)

        squared = self.model_from_group(ord2_vars,
                                        cutoff=cutoff_ord2,
                                        init_vars=linear)

        if add_cons:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared + ['_cons']],
                               missing='drop').fit(disp=False)
        else:
            self.model = Logit(self.data[self.outcome],
                               self.data[squared],
                               missing='drop').fit(disp=False)

        self.logodds = self.model.fittedvalues.rename('logodds')
        self.propscore = Series(self.model.predict(),
                                index=self.logodds.index,
                                name='propscore')
        self.trim_range = self.calc_trim(self.propscore)
        self.in_trim = (
            self.propscore.ge(self.trim_range[0])
            & self.propscore.le(self.trim_range[1])).rename('in_trim')
        self.strata = self.stratify(self.data[self.outcome],
                                    self.logodds,
                                    t_max=t_strata,
                                    n_min_strata=n_min_strata,
                                    n_min_tc=n_min_tc)

        if disp:
            print(self.model.summary())
            print('The following vars were infeasible: {}'.format(', '.join(
                self.dropped_vars)))
            print('Stratification produced {} strata'.format(
                len(self.strata.dropna().unique())))

    def best_in_group(self, newvars, basevars=None):
        ''' Get the best variable for score among a set of new variables '''

        if not basevars and self.add_cons:
            basevars = ['_cons']
        elif basevars and self.add_cons:
            basevars = basevars + ['_cons']
        elif not basevars and not self.add_cons:
            raise ValueError(
                'Must specify at least one covariate for baseline model')

        origmod = Logit(self.data[self.outcome],
                        self.data[basevars],
                        missing='drop').fit(disp=False)
        list_llf = []
        for cc in newvars:
            try:
                newmod = Logit(self.data[self.outcome],
                               self.data[basevars + [cc]],
                               missing='drop').fit(disp=False)
                if newmod.nobs / origmod.nobs < .95:
                    warnings.warn('Using {} causes more than 5% '\
                                  'of the sample to be dropped'.format(cc))
                list_llf.append(newmod.llf)
            except:
                if cc not in self.dropped_vars:
                    self.dropped_vars.append(cc)
                list_llf.append(origmod.llf)
        idx = list_llf.index(max(list_llf))

        return newvars[idx], 2 * (list_llf[idx] - origmod.llf)

    def model_from_group(self, test_vars, cutoff, init_vars=None):
        ''' Iterate through a list over and over until no more contribution '''
        remaining = test_vars.copy()

        if init_vars and type(init_vars) == str:
            final = [init_vars].copy()
            init_vars = [init_vars]
        elif init_vars and type(init_vars) == list:
            final = init_vars.copy()
        else:
            final = []

        while len(remaining) > 0:
            temp, gain_add = self.best_in_group(remaining, basevars=final)
            if gain_add > cutoff:
                final.append(temp)
                remaining.remove(temp)
            else:
                break

        return final

    # we will define a static method so that we can call this on any generic series
    @staticmethod
    def stratify(outcome, logodds, n_min_strata, n_min_tc=3, t_max=1):
        """
    Calculate strata from a given outcome variable and log-odds. Specify the cutoff
    for the t-statistic in t_max, the minimum number of observations per
    stratum in n_min_strata, and the minimum number of treated or control observations
    per stratum in n_min_tc.
    Parameters
    ----------
    outcome : Series
        Binary variable denoting treatment outcome
    logodds : Series
        The calculated log-odds for that (transformation of propensity score).
    n_min_strata : Int
        The minimum number of observations per stratum.
    n_min_tc : Int
        The minimum number of treated or control observations per stratum.
        Default is 3.
    t_max : Float
        The maximum t-statistic value acceptable within a stratum before splitting.
        Default is 1.

    Returns
    -------
    strata : Series
        The calculated strata. Missing propensity scores and values outside of
        min of treated group or max of control group are coded as NaN.
        """

        if type(outcome) != Series or type(logodds) != Series:
            raise ValueError('Expecting pandas series as inputs')

        # helper function to facilitate indexing
        def above_med(x):
            return (x >= x.median()).astype(int)

        outcome = outcome.rename('outcome').to_frame()
        df = outcome.join(logodds)
        minmax = df.groupby('outcome')['logodds'].agg(['max', 'min'])
        df = df.loc[df.logodds.ge(minmax.loc[1, 'min'])
                    & df.logodds.le(minmax.loc[0, 'max'])
                    & df.logodds.notnull()]

        # initialize the strata, potential blocks, and the change while loop
        df.loc[:, 'strata'] = 0
        df.loc[:, 'block'] = 0
        change = True

        while change == True:
            # get the medians of the strata
            df.loc[:,
                   'medgrp'] = df.groupby('strata')['logodds'].apply(above_med)
            for ii in df.strata.unique():
                # simplify the notation
                sub = df.loc[df.strata.eq(ii), :].copy()

                # calculate t-stat and a grouper with number of groups
                t_test = ttest(sub.loc[sub.outcome.eq(1), 'logodds'],
                               sub.loc[sub.outcome.eq(0), 'logodds'],
                               nan_policy='omit').statistic
                n = sub.groupby(['medgrp', 'outcome'])['logodds'].count()

                # make new blocks
                if (t_test > t_max and min(n) >= n_min_tc
                        and min(n.groupby('medgrp').sum()) >= n_min_strata):
                    df.loc[df.strata.eq(ii),
                           'block'] = df.loc[df.strata.eq(ii), 'medgrp']

            if df.block.sum() == 0:
                change = False
            else:
                # getting ready for next loop
                df.strata = df.groupby(['strata', 'block']).ngroup()
                df.block = 0

        return outcome.join(df.strata).strata

    # we will define a static method so that we can call this on any generic series
    @staticmethod
    def calc_trim(propscore):
        y = 1 / (propscore * (1 - propscore))

        if y.max() <= (2 / y.count()) * (y.sum()):
            return 0, 1

        for gamma in linspace(y.max(), 0, 10000):
            lhs_estimand = (gamma / y.count()) * (y.le(gamma).sum())
            rhs_estimand = (2 / y.count()) * ((y.le(gamma) * y).sum())
            if lhs_estimand < rhs_estimand:
                break

        alpha = .5 - ((.25 - (1 / gamma))**.5)

        return alpha, 1 - alpha
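# Hedged usage sketch for the static calc_trim() above (a Crump-style cutoff on
# 1 / (e(x) * (1 - e(x)))): pass any Series of propensity scores and it returns a
# symmetric [alpha, 1 - alpha] trimming range. The scores below are synthetic, and the
# call assumes the module-level imports used by the snippet (pandas Series, numpy
# linspace).
import numpy as np
from pandas import Series

rng = np.random.default_rng(2)
scores = Series(rng.beta(2, 5, size=1000), name='propscore')   # skewed toward small scores

low, high = PropensityScore.calc_trim(scores)
kept = scores.between(low, high)
print(low, high, kept.mean())   # share of observations inside the trim range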
    Temp_Column_Name = VIF_Df.loc[VIF_Df['VIF'] == Max_VIF, 'Column_Name']
    print(Temp_Column_Name, ": ", Max_VIF)
    
    if (Max_VIF >= 10): # drop the column only when its VIF is 10 or higher; columns with VIF below 10 are kept
        print(Temp_Column_Name, Max_VIF)
        Train_X_Copy = Train_X_Copy.drop(Temp_Column_Name, axis = 1)    
        High_VIF_Column_Names.extend(Temp_Column_Name)

Train_x.drop(['Loan_Amount_Term','Self_Employed','Gender'], axis = 1, inplace=True)
Test_x.drop(['Loan_Amount_Term','Self_Employed','Gender'], axis = 1, inplace=True)

Train_x.shape
Test_x.shape

from statsmodels.api import Logit
Model1 = Logit(Train_y, Train_x).fit()
Model1.summary()

col_names = ['ApplicantIncome','Dependents']
Model2 = Logit(Train_y,Train_x.drop( col_names, axis= 1)).fit()
Model2.summary()

Test_x.drop(['ApplicantIncome','Dependents'], axis = 1, inplace=True)

Test_x['Predict'] = Model2.predict(Test_x)
Test_x.columns
Test_x['Predict'][0:6]

import numpy as np
Test_x['Test_class'] = np.where(Test_x['Predict'] >= 0.5, 1, 0)
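# Hedged reconstruction of the kind of VIF loop the truncated fragment above runs:
# repeatedly compute VIFs, drop the worst column while its VIF is >= 10, and keep track
# of what was dropped. Names mirror the fragment, but the helper itself is illustrative,
# not the original code.
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def drop_high_vif(Train_X_Copy, threshold=10):
    High_VIF_Column_Names = []
    while True:
        vifs = pd.Series(
            [variance_inflation_factor(Train_X_Copy.values, i)
             for i in range(Train_X_Copy.shape[1])],
            index=Train_X_Copy.columns, name='VIF')
        worst = vifs.idxmax()
        if vifs[worst] < threshold:
            break
        High_VIF_Column_Names.append(worst)
        Train_X_Copy = Train_X_Copy.drop(columns=[worst])
    return Train_X_Copy, High_VIF_Column_Names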
Ejemplo n.º 24
0
class LogisticRegression:
    def __init__(self,
                 endog_name_f=None,
                 exog_name_f=None,
                 data_f=None,
                 add_constant_f=True,
                 scale_vars_list_f=list(),
                 interaction_name_f=list(),
                 convert_bool_dict_f=dict(),
                 convert_ord_list_f=list(),
                 cat_col_omit_dict_f=dict(),
                 hier_model_vars_dict_f=dict(),
                 hier_exog_var_names_f=list(),
                 classification_threshold_f=0.5,
                 **kwds):
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f
        self.convert_bool_dict = convert_bool_dict_f  # convert_bool_dict_f
        self.convert_ord_list = convert_ord_list_f  # convert_ord_list_f
        self.hier_model_vars_dict = hier_model_vars_dict_f
        self.hier_exog_var_names = hier_exog_var_names_f
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f
        self.classification_threshold = classification_threshold_f
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None
        self.est_coef = dict()
        self.exog_matrix = None
        self.endog_matrix = None
        self.fitted_values = None

        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_hier_exog = set(self.exog_name).intersection(
            set(self.hier_exog_var_names))

        if len(t_bool_ord) > 0:
            print(
                'WARNING appearing in both boolean and ordinal variable lists: %s'
                % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print(
                'WARNING appearing in both categorical and ordinal variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print(
                'WARNING appearing in both categorical and boolean variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_bool))
        if len(t_hier_exog) > 0:
            print(
                'WARNING appearing in both exogenous and hierarchical exogenous variable lists: %s'
                % ', '.join(t_hier_exog))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in self.convert_ord_list) and (self.data[x].dtype == 'O')
                )
        ]
        prefix_sep = '_'
        [
            self.cat_col_omit_dict.update(
                {x: self.data[x].mode(dropna=True).values[0]})
            for x in self.cat_col_names
            if x not in list(self.cat_col_omit_dict.keys())
        ]
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]

        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names,
                                  dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_hier_vars(self):
        t_df = pd.DataFrame()
        for c in self.hier_model_vars_dict.keys():
            t_model = LogisticRegression(
                endog_name_f=self.hier_model_vars_dict[c]
                ['external_model'].endog_name,
                exog_name_f=self.hier_model_vars_dict[c]
                ['external_model'].exog_name,
                data_f=self.hier_model_vars_dict[c]['external_model'].data,
                add_constant_f=self.hier_model_vars_dict[c]
                ['external_model'].add_constant,
                scale_vars_list_f=self.hier_model_vars_dict[c]
                ['external_model'].scale_vars_list,
                convert_ord_list_f=self.hier_model_vars_dict[c]
                ['external_model'].convert_ord_list,
                convert_bool_dict_f=self.hier_model_vars_dict[c]
                ['external_model'].convert_bool_dict,
                cat_col_omit_dict_f=self.hier_model_vars_dict[c]
                ['external_model'].cat_col_omit_dict,
                interaction_name_f=self.hier_model_vars_dict[c]
                ['external_model'].interaction_name,
                classification_threshold_f=self.hier_model_vars_dict[c]
                ['classification_threshold'])  #######
            t_model.create_model_object()
            t_pred_prob, t_pred_class = self.hier_model_vars_dict[c][
                'external_model'].make_predictions(
                    pred_data=t_model.exog_matrix,
                    select_coef=self.hier_model_vars_dict[c]['select_coef'])
            t_col_names = list(t_df.columns) + [c, c + '_TF']
            t_df = pd.concat([t_df, t_pred_prob, t_pred_class], axis=1)
            t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'

            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both bool - create interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both cat
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v1_omit = data_f[v1].mode(
                    dropna=True).values[0] if v1 not in list(
                        drop_list_f.keys()) else drop_list_f[v1]

                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v2_omit = data_f[v2].mode(
                    dropna=True).values[0] if v2 not in list(
                        drop_list_f.keys()) else drop_list_f[v2]
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ],
                                         axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # one bool
                if data_f[v1].dtype == bool:
                    vb = v1
                    vd = v2
                else:
                    vb = v2
                    vd = v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                vd_omit = data_f[vd].mode(
                    dropna=True).values[0] if vd not in list(
                        drop_list_f.keys()) else drop_list_f[vd]
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ],
                                     axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_all_data = pd.concat([
            self.data, self.model_data[np.setdiff1d(self.model_data.columns,
                                                    self.data.columns)]
        ],
                               axis=1)
        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(
                data_f=t_all_data,
                v1=int_act_col1,
                v2=int_act_col2,
                drop_list_f=self.cat_col_omit_dict)  #####
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        del int_act_col1, int_act_col2
        self.dummy_col_omit_list = t_dummy_col_omit_list

        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None

        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None

        df_cat_f = self.convert_cat_to_dummies()

        return df_bool_f, df_ord_f, df_cat_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f = self.code_variables()

        self.check_for_exog_conflict()

        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in list(self.convert_ord_list)) and (
                    x not in self.cat_col_names))
        ]

        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None

        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit
        ],
                                    axis=1)

        # -------------
        # add predictions for fold based on estimation of lower model
        if len(self.hier_model_vars_dict) > 0:
            df_hier_f = self.create_hier_vars()
            self.data[df_hier_f.columns] = df_hier_f
            self.model_data[self.hier_exog_var_names] = df_hier_f[
                self.hier_exog_var_names]

        # add interaction variables
        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
            self.model_data[[x for x in df_interaction_f.columns
                             ]] = df_interaction_f

        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print(
                    'Warning: specified variable to scale, %s, is not included in model covariates'
                    % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index,
                             columns=['const']), model_mat
            ],
                                  axis=1)

        self.endog_matrix = model_mat[self.endog_name]
        self.exog_matrix = model_mat[[
            c for c in model_mat.columns if c != self.endog_name
        ]]

        self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        self.est_coef.update(
            dict(
                zip(list(self.exog_matrix.columns),
                    self.model_result._results.params)))
        self.make_predictions()  # predict values of training data
        print(self.model_result.summary())

    def make_predictions(self, pred_data=None, select_coef=None):
        def utility_calc(coef_fff, data_fff):
            return np.matmul(np.array(data_fff),
                             np.array(coef_fff).reshape(len(coef_fff),
                                                        1)).flatten()

        def matrix_pred_calc(coef_ff, data_ff):
            return np.exp(utility_calc(coef_ff, data_ff)) / (
                1 + np.exp(utility_calc(coef_ff, data_ff))).flatten()

        def classify_pred(prob_ff, threshold_ff):
            return prob_ff > threshold_ff

        if pred_data is None:
            if select_coef is None:
                self.fitted_values = self.model_result.predict(
                    self.exog_matrix)
                return self.fitted_values, classify_pred(
                    self.fitted_values, self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=self.exog_matrix[select_coef]),
                                   index=self.exog_matrix.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
        else:
            if select_coef is None:
                t_pred = self.model_result.predict(pred_data.loc[:, [
                    x for x in pred_data.columns
                    if x in list(self.est_coef.keys())
                ]])
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=pred_data[select_coef]),
                                   index=pred_data.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
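# Side note on matrix_pred_calc above: exp(u) / (1 + exp(u)) is the logistic function,
# so the same prediction can be written with scipy's expit, which is numerically stabler
# for large positive or negative utilities. Names below are illustrative only.
import numpy as np
from scipy.special import expit

def matrix_pred_calc_stable(coef_ff, data_ff):
    utilities = np.asarray(data_ff) @ np.asarray(coef_ff).reshape(-1, 1)
    return expit(utilities).flatten()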
Ejemplo n.º 25
0
def fit_model(X, y, co=0.1):
    sm = Logit((y.clip(0, 1) > co).astype(float), X.clip(0, 1), missing='drop')
    return sm.fit(disp=False)
Ejemplo n.º 26
0
def fit_model(X, y, co=0.1):
    sm = Logit((y.clip(0, 1)>co).astype(float), X.clip(0, 1), missing='drop')
    return sm.fit(disp=False)
Ejemplo n.º 27
0
# Matrix of predictor variables: height and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male,
                  sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend=True))
logit_model2.fit().summary()

# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM': logit_pars, 'Logit': logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.

# Get the intercept and slope of the line based on the logit coefficients
intercept = -logit_pars['const'] / logit_pars['x2']
slope = -logit_pars['x1'] / logit_pars['x2']
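# Why those two formulas give the separating line: the boundary is where the predicted
# probability equals 0.5, i.e. where const + x1 * Height + x2 * Weight = 0, so
# Weight = -const / x2 - (x1 / x2) * Height, which is exactly the intercept and slope
# computed above. A minimal plotting sketch (matplotlib assumed; the height range is
# illustrative):
import numpy as np
import matplotlib.pyplot as plt

heights = np.linspace(55, 80, 100)
plt.plot(heights, intercept + slope * heights, 'k--', label='P(male) = 0.5 boundary')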
Ejemplo n.º 28
0
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print("Done ...")

print("\n*** Recreate Train Data ***")
dfX_train = dfTrain[allCols]
y_train = dfTrain[clsVars].values
print("Done ...")

# model object
print("\n*** Model ***")
# add intercept manually
dfX_train_const = add_constant(dfX_train)
# build model and fit training data
model = Logit(y_train, dfX_train_const).fit()
# print the model summary
print(model.summary())
print("Done ...")

################################
# Classification  - Predict Train
# evaluate : Accuracy & Confusion Metrics
###############################

# Probability Distribution for train data
prob_train = model.predict(dfX_train_const)
# sort the prob dist for visualization
sorted_train = sorted(prob_train.values)
index_train = np.arange(len(sorted_train))
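# Hedged continuation sketch: the sklearn metrics imported at the top of this snippet
# are presumably meant to be applied to these probabilities after thresholding, along
# these lines (0.5 is an assumed cutoff):
pred_train = (prob_train >= 0.5).astype(int)
print(confusion_matrix(y_train, pred_train))
print(accuracy_score(y_train, pred_train))
print(classification_report(y_train, pred_train))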
Ejemplo n.º 29
0

# ## Model

# ### Logistic Regression

# In[ ]:


titanic_ = add_constant(titanic)


# In[ ]:


model_ = Logit(titanic_['Survived'], titanic_.drop(['Survived'], axis=1))
result = model_.fit(); result.summary()


# In[ ]:


odd_ratio = np.exp(result.params); odd_ratio


# ### Target variable extraction
# Create a dataframe with X as the inputs and y as the target (Survived)

# In[ ]:
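# Hedged follow-up to the odds ratios above: exponentiating the coefficient confidence
# intervals gives the corresponding intervals on the odds-ratio scale (same fitted
# `result` object as above; column labels are added for readability).
odd_ratio_ci = np.exp(result.conf_int())
odd_ratio_ci.columns = ['2.5%', '97.5%']
odd_ratio_ci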

Ejemplo n.º 30
0
    for poly in polys[target_gene]:

        in_central = poly.contains_points(
            atlas_coords.ix[:, ['X', 'Z'], time_point].T
        )
        not_expr = atlas_expr.ix[:, target_gene, time_point] < co
        in_central |= not_expr
        print(sum(in_central))
        #in_central =  (x_coord < 45)
        #in_central = x_coord_scale < 0.6

        #fitter = logistic.LogisticRegression(fit_intercept=False)
        #fitter.fit(X.ix[in_central, :], y.ix[in_central] > co)

        sm_fitter = Logit( y.ix[in_central].clip(0, 1), X.ix[in_central].clip(0, 1))
        sm_fit = sm_fitter.fit()

        Y_tmp = atlas_expr.ix[in_central, target_gene,time_point].copy()
        Y_tmp /= Y_tmp.max()
        Y_tmp = 1.0 * (Y_tmp > .5)

        all_regs = atlas_expr.ix[:, all_regs, time_point].count(axis=1) > 0
        all_regs = all_regs.index[all_regs]


        #if True:
        #if (poly == poly1) or (poly == poly2) or (poly == poly12):
        if target_gene == 'hb':
            #best_tfs = ['bcdP', 'hkb', 'hkb2', 'KrP', 'bcdP2', 'const']
            #best_tfs = ['bcdP', 'bcdP2', 'gtP', 'kni', 'hkb',  'KrP', 'const']
Ejemplo n.º 31
0
def fit_model(df,
              formula,
              title="Full",
              fp=None,
              filename="Model",
              save=False):
    """
  Function to fit model, collect stats and save predictions and model.
  df: dataframe
  formula: formula
  title: title of model (Default: "Full")
  fp: File pointer (Default: None)
  filename: Model and data file prefix ("Model")
  save: Whether to save predictions, the model, both, or neither ["Both", "Data", "Model", False] (Default: False)
  """
    if df.shape[0] < 10:
        print "Too less instances. Skipping. Make sure you have atleast 10 instances."
        return None, None
    print "Modelling Model[%s] with instances %s" % (title, df.shape[0])
    print "Using formula:\n %s" % (formula)
    print "Generating patsy matrices"
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    print "Initializing model"
    model = Logit(y, X)
    print "Fitting model"
    res = model.fit()
    print title, "\n", res.summary2()
    print "Confusion Matrix:", res.pred_table()
    precision = ems.precision(res.pred_table())
    recall = ems.recall(res.pred_table())
    accuracy = ems.accuracy(res.pred_table())
    f_score = ems.fscore_measure(res.pred_table())
    rmse = ems.rmse(res.predict(), model.endog)
    mae = ems.mae(res.predict(), model.endog)
    auc = ems.auc(res.predict(), model.endog)
    prc = ems.prc(res.predict(), model.endog)
    prc_filename = "%s.pdf" % filename
    plot_prc(prc, prc_filename)
    evaluation_metrics = "[Model Measures]: Confusion Matrix: %s\nRMSE: %s\tMAE: %s\tAUC: %s\nPrecision: %s\tRecall: %s\tAccuracy: %s\tF1-Score: %s\nPRC:\n%s" % (
        res.pred_table(), rmse, mae, auc, precision, recall, accuracy, f_score,
        prc_filename)
    print evaluation_metrics
    print "[save=%s]" % save, "" if save else "Not", "Saving Model to %s" % filename
    if fp is not None:
        print >> fp, "Modelling Model[%s] with instances %s" % (title,
                                                                df.shape[0])
        print >> fp, "Using formula:\n %s" % (formula)
        print >> fp, title, "\n", res.summary2()
        print >> fp, evaluation_metrics
        print >> fp, "[save=%s]" % save, "" if save else "Not", "Saving Model to %s" % filename
    model_save, data_save = False, False
    if save == "Both":
        model_save, data_save = True, True
    if save == "Model" or model_save:
        model_file = "%s.pkl" % filename
        res.save(model_file, remove_data=True)  # Save model
    if save == "Data" or data_save:
        data_file = "%s.data.txt" % filename  # Include predictions
        print "df.index", df.index
        save_data(df[["from_id", "is_self_cite"]],
                  res.predict(),
                  filename=data_file)
    print "Done Saving"
    return model, res
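# Hedged sketch of what the ems.* helpers above presumably compute from statsmodels'
# pred_table(): pred_table() returns a 2x2 array with observed classes in rows and
# predicted classes in columns, so the usual metrics fall out of its cells. (ems is the
# snippet author's own module; these are the standard definitions, not its actual code.)
def metrics_from_pred_table(table):
    tn, fp = table[0]
    fn, tp = table[1]
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    accuracy = (tp + tn) / table.sum()
    f_score = 2 * precision * recall / (precision + recall)
    return precision, recall, accuracy, f_score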
Ejemplo n.º 32
0
    })

    for poly in polys[target_gene]:

        in_central = poly.contains_points(atlas_coords.ix[:, ['X', 'Z'],
                                                          time_point].T)
        not_expr = atlas_expr.ix[:, target_gene, time_point] < co
        in_central |= not_expr
        print(sum(in_central))
        #in_central =  (x_coord < 45)
        #in_central = x_coord_scale < 0.6

        #fitter = logistic.LogisticRegression(fit_intercept=False)
        #fitter.fit(X.ix[in_central, :], y.ix[in_central] > co)

        sm_fitter = Logit(y.ix[in_central].clip(0, 1),
                          X.ix[in_central].clip(0, 1))
        sm_fit = sm_fitter.fit()

        Y_tmp = atlas_expr.ix[in_central, target_gene, time_point].copy()
        Y_tmp /= Y_tmp.max()
        Y_tmp = 1.0 * (Y_tmp > .5)

        all_regs = atlas_expr.ix[:, all_regs, time_point].count(axis=1) > 0
        all_regs = all_regs.index[all_regs]

        #if True:
        #if (poly == poly1) or (poly == poly2) or (poly == poly12):
        if target_gene == 'hb':
            #best_tfs = ['bcdP', 'hkb', 'hkb2', 'KrP', 'bcdP2', 'const']
            #best_tfs = ['bcdP', 'bcdP2', 'gtP', 'kni', 'hkb',  'KrP', 'const']
            #best_tfs = atlas_expr.major_axis
Ejemplo n.º 33
0
def logregress(df, X, y):
    dfX = _items(df, X)
    dfy = _value(df, y)
    model = Logit(dfy, dfX)
    result = model.fit_regularized()
    return result.summary()
Ejemplo n.º 34
0
def logregress_loose(X, y, *args, **kwargs):
    X = list(zip(*(_series(x) for x in X)))
    y = _series(y)
    model = Logit(y, X)
    result = model.fit(*args, **kwargs)
    return result.summary()
Ejemplo n.º 35
0
# Matrix of predictor variables: height and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male, sm.add_constant(hw_exog, prepend = True), family = sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params


# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True))
logit_model2.fit().summary()

# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.

# Get the intercept and slope of the line based on the logit coefficients 
intercept = -logit_pars['const'] / logit_pars['x2']
slope =  -logit_pars['x1'] / logit_pars['x2']