Example 1
    def _fit_backward(self):

        y_train = pd.Series(self._model.model.endog.copy(),
                            name=self.dependent_variable,
                            index=self._observations_idx)
        X_train = pd.DataFrame(self._model.model.exog,
                               columns=self._model.model.exog_names,
                               index=self._observations_idx)

        model = Logit(y_train, X_train, missing='drop')

        results = model.fit(**self._model_params)

        max_pvalue = results.pvalues.drop('Intercept').max()

        while max_pvalue > self.sig_level_removal:
            x_to_drop = results.pvalues.drop('Intercept').idxmax()
            X_train = X_train.drop(x_to_drop, axis=1)
            model = Logit(y_train, X_train, missing='drop')
            results = model.fit(**self._model_params)
            max_pvalue = results.pvalues.drop('Intercept').max()

        self._model = results

        return
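A minimal standalone sketch of the same p-value-driven backward elimination, assuming a formula-built design matrix whose constant column is named 'Intercept' (the function and argument names here are hypothetical):

from statsmodels.api import Logit

def backward_eliminate(y_train, X_train, sig_level_removal=0.05):
    # Refit after dropping the least significant non-intercept regressor
    # until every remaining p-value clears the threshold.
    results = Logit(y_train, X_train, missing='drop').fit(disp=0)
    while results.pvalues.drop('Intercept').max() > sig_level_removal:
        x_to_drop = results.pvalues.drop('Intercept').idxmax()
        X_train = X_train.drop(x_to_drop, axis=1)
        results = Logit(y_train, X_train, missing='drop').fit(disp=0)
    return results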
Example 2
def _fit_logit(X, y):
    metadata = {}
    lm = Logit(y, X)
    try:
        flm = lm.fit(method='bfgs')
        logging.info(flm.summary())
        output = format_output(flm)
        metadata = {
            'summary': str(flm.summary()),
            'summary2': str(flm.summary2())
        }
    except (np.linalg.LinAlgError, PerfectSeparationError,
            ValueError) as e:
        # Perfect separation or singular matrix - fall back to None values
        logging.warning(e)
        output = {
            col: {
                "coef": None,
                "std_err": None,
                "t_values": None,
                "p_values": None,
            }
            for col in X.columns
        }

    return output, metadata
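The except clause above matters because Logit fails loudly on degenerate inputs. A small hedged demo of the perfect-separation case (whether this raises or only warns depends on the statsmodels version):

import numpy as np
import pandas as pd
from statsmodels.api import Logit
from statsmodels.tools.sm_exceptions import PerfectSeparationError

X = pd.DataFrame({'const': 1.0, 'x': [-2.0, -1.0, 1.0, 2.0]})
y = pd.Series([0, 0, 1, 1])  # y == (x > 0): perfectly separated
try:
    Logit(y, X).fit()
except (PerfectSeparationError, np.linalg.LinAlgError, ValueError) as e:
    print('fit failed:', e)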
Example 3
def get_trained_logit_model():
    """
    In 'data/training_data/' specific ETFs were visually inspected and
    labeled white noise (0) or not white noise (1). That data is loaded
    here to train the logistic parameters, but you can use this
    function as a template to train your own model

    :ARGS:

        :class:`NoneType`

    :RETURNS:

        a fitted :class:`statsmodels.Logit` logistic regression,
        trained on the labeled data
    """
    f = pandas.ExcelFile('../data/training_data/Trained Data.xlsx')
    data = reduce(
        lambda a, b: numpy.vstack([a, b]),
        map(lambda x: f.parse(x, index_col=0)[['ln_chg', 'Y']], f.sheet_names))
    data = pandas.DataFrame(data, columns=['ln_chg', 'Y'])

    #add an intercept for the model (required by statsmodels.api.Logit)
    data['intercept'] = 1.0

    #fit the model
    logit_model = Logit(endog=data['Y'], exog=data[['intercept', 'ln_chg']])
    return logit_model.fit()
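A hedged usage sketch: scoring one new observation with the fitted model. The ln_chg value is made up, and the column order must match the training exog ['intercept', 'ln_chg']:

fit = get_trained_logit_model()
print(fit.predict([[1.0, 0.0123]]))  # P(not white noise) for a hypothetical ln_chg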
Example 4
def model_fit(store_path,
              X_df_path,
              y_df_path,
              feature_key="Gender",
              X_cols=[],
              testing=False,
              include_prc=False):
    if testing:
        # If testing, just print the X and y columns
        print(store_path, X_df_path, y_df_path, feature_key, X_cols)
        return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
    ## Not testing. Fit the models and return the measures
    print(store_path, X_df_path, y_df_path, feature_key, X_cols)
    X = pd.read_hdf(store_path, key=X_df_path, columns=X_cols)
    y = pd.read_hdf(store_path, key=y_df_path)
    print("Created dataframes, feature_key=%s" % feature_key)
    print("X.shape = %s, y.shape = %s" % (X.shape, y.shape))
    model = Logit(y, X)
    res = model.fit()
    predict = res.predict()
    measures = get_all_eval_measures(predict,
                                     model.endog,
                                     include_prc=include_prc)
    measures["llf"] = res.llf
    measures["aic"] = res.aic
    measures["bic"] = res.bic
    measures["prsquared"] = res.prsquared
    measures["df_model"] = res.df_model
    return feature_key, (measures, res.summary2())
Example 5
def run_LR(model_dir, trainSet, testSet, timestep):
    # get shape
    H, W, C = trainSet.shape[1], trainSet.shape[2], trainSet.shape[3]
    train_len, test_len = trainSet.shape[0], testSet.shape[0]

    # get XY features
    trainX, trainY = getXSYS(trainSet, timestep)
    testX, testY = getXSYS(testSet, timestep)

    print('Train set shape: X/Y', trainX.shape, trainY.shape)
    print('Test set shape: X/Y', testX.shape, testY.shape)

    # check data imbalance
    neg, pos = np.bincount(trainX.flatten())
    weight_ratio = neg / pos
    print('Weight ratio:', round(weight_ratio, 5))

    # logit
    logit_model = Logit(trainY, trainX)
    result = logit_model.fit()
    print(result.summary2())

    # LR
    logreg = LogisticRegression(
        class_weight={1: weight_ratio})  # balance pos/neg in training set
    logreg.fit(trainX, trainY)
    predY = logreg.predict(testX)

    y_true = testY.reshape((-1, H, W, C))
    y_pred = predY.reshape((-1, H, W, C))
    print('#Positive predictions: ', y_pred[y_pred != 0].shape[0], '\n')

    return y_true, y_pred
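Note that the imbalance check above runs np.bincount over trainX; unless the features are the same binary event grid as the labels, the weight ratio is normally derived from the labels instead, e.g.:

neg, pos = np.bincount(trainY.astype(int).flatten())
weight_ratio = neg / pos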
Example 7
def model_fit(y,X, X_cols, y_col, feature_key="Gender", testing=False, include_prc=False):
  if testing:
    # If testing, just print the X and y columns
    print(X_cols, y_col)
    return feature_key, ({"llf": 0.1}, "TEMP SUMMARY")
  ## Not testing. Fit the models and return the measures
  print(feature_key, X.shape, X_cols, y.shape, y_col)
  X = pd.DataFrame(X, columns=X_cols)
  y = pd.Series(y, name=y_col)
  print("Created dataframes.")
  model = Logit(y,X)
  res = model.fit()
  measures = get_all_eval_measures(res, model.endog, include_prc=include_prc)
  return feature_key, (measures, res.summary2())
Example 8
def validate_data_predictors(data,
                             outcome,
                             predictors,
                             probabilities,
                             survival_time=False):
    """Validates that for each predictor column, all values are within the range 0-1

    Notes
    -----
    If a predictor has probability `True`, checks that the column `data[predictor]` has all values in the appropriate range.
    If a predictor has probability `False`, converts all values in that column with logistic regression

    Parameters
    ----------
    data : pd.DataFrame
        the data set
    outcome : str
        the column to use as 'outcome'
    predictors : list(str)
        the list of predictors for the analysis
    probabilities: list(bool)
        list marking whether a predictor is a probability
    survival_time : bool
        if the analysis is a survival time analysis
    """
    for i in range(0, len(predictors)):
        if probabilities[i]:
            #validate that any predictors with probability TRUE are b/t 0 and 1
            if (max(data[predictors[i]]) > 1) or (min(data[predictors[i]]) < 0):
                raise ValueError("{val} must be between 0 and 1".format(
                    val=repr(predictors[i])))
        else:
            if survival_time:
                from statsmodels.sandbox.cox import CoxPH
                #TODO
            else:
                from statsmodels.api import Logit
                #predictor is not a probability, convert to one with logistic regression
                model = Logit(data[outcome], data[predictors[i]])
                data[predictors[i]] = model.fit().predict()
    return data
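A hedged usage sketch with hypothetical column names: predictors flagged as probabilities are only range-checked, while the rest are replaced by fitted Logit probabilities:

import pandas as pd

df = pd.DataFrame({'outcome':    [0, 1, 0, 1],
                   'risk_score': [0.2, 0.7, 0.1, 0.9],  # already a probability
                   'age':        [35, 62, 41, 58]})     # converted via Logit
df = validate_data_predictors(df, outcome='outcome',
                              predictors=['risk_score', 'age'],
                              probabilities=[True, False])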
Example 9
def logit_fit(x_data, y, name='train'):
    """拟合逻辑回归,并绘制 gini,ks 曲线  \n
    参数:
    ----------
    x_data: dataframe, 已清洗好的训练数据的特征变量,函数会自动补上常数项  \n
    y: series or 1darray, 目标变量   \n
    name: 训练模型的名字  \n
    返回值:
    ----------
    result: statsmodel.api.Logit.fit() 返回结果对象  \n
    model_eval: ModelEval, 模型评估对象"""
    model_data = add_constant(x_data)
    logit_reg = Logit(y, model_data)
    result = logit_reg.fit(disp=False)

    prob = result.predict(model_data)
    model_eval = ModelEval(-prob, y, name, plot=False)

    a = "************************************"
    print(a + "  " + name + "  " + a)
    print(result.summary2())
    model_eval.giniks_plot()
    return result, model_eval
Example 11
    def fit(self, X, y, print_detail=False):
        """Stepwise logistic regression. Use Score test for entry, Wald test for remove.
        参数:
        ----------
        X: array-like, n_sample * p_features. 特征变量数据集,程序会自动添加常数项
        y: array-like, 目标变量
		print_detail: bool, 是否打印出逐步回归选择变量的细节
		返回值:
		-----------
		result: 类型同 statsmodels.api.Logit 对象 fit 方法的返回值, 逐步回归选出的模型。"""

        def score_test(Xtest, y_true, y_predict):
            """Score test for a variable entering in a forward step. Assumes the new variable is the last column.
            Xtest contains vars_old (which produced the fitted predictions y_predict) plus var_new (one candidate).
            The score test assumes the candidate's coefficient is zero, so although Xtest includes its data,
            the fitted parameters were estimated without it."""
            u = np.dot(Xtest.T, y_true - y_predict)  # first derivative (score vector)
            h = np.dot(Xtest.T * (y_predict * (1 - y_predict)).values.reshape(len(y_predict)), Xtest)  # second derivative (information matrix)
            score = np.dot(np.dot(u.T, np.linalg.inv(h)), u)  # score is a 1x1 array
            p_value = chi2.sf(score, 1)  # the score statistic follows a chi-squared distribution with 1 df
            return score, p_value

        def print_wrap(*obj):
            if print_detail:
                print(*obj)

        X = add_constant(X)
        xenter = ['const']
        xwait = list(X.columns.drop('const'))
        logit_mod = Logit(y, X[xenter])
        logit_res = logit_mod.fit(disp=0)
        y_predict = logit_res.predict(X[xenter])
        step = 0
        while xwait:  # stop condition 1: all variables have entered the model
            # entry test
            score = pd.Series(name='Score', dtype=float)
            pvalue = pd.Series(name='P>chi2', dtype=float)
            for xname in xwait:
                tmpX = X[xenter + [xname]]
                score[xname], pvalue[xname] = score_test(tmpX, y, y_predict)

            step += 1
            print_wrap("step {}: Variables Entry test:\n".format(step),
                       pd.concat([score, pvalue], axis=1))  # print progress info

            if pvalue.min() <= self.entry:  # admit the most significant variable
                xin = pvalue.idxmin()
                xenter.append(xin)
                xwait.remove(xin)
                print_wrap("step {0}: {1} entered.\n".format(step, xin))
            else:  # stop condition 2: no variable meets the entry criterion
                print_wrap("Stopped 2: No vars can get entered any more.\n")
                break
                break

            # removal test
            while True:  # reaching here means a new variable has just entered
                logit_mod = Logit(y, X[xenter])
                logit_res = logit_mod.fit(disp=0)
                y_predict = logit_res.predict(X[xenter])
                test = logit_res.wald_test_terms().dframe  # Wald test
                pvalue = test['P>chi2'].iloc[1:]  # the constant term is not tested

                step += 1
                print_wrap("step {}: Variables remove test:\n".format(step), test)

                if pvalue.max() < self.stay:
                    xout = None
                    print_wrap("step {}: No Variables removed:\n".format(step))
                    break  # all variables are significant; remove none
                else:
                    xout = pvalue.idxmax()
                    xenter.remove(xout)
                    xwait.append(xout)
                    print_wrap("step {0}: {1} removed.\n".format(step, xout))

            # stop condition 3: the variable that just entered was removed again
            if xin == xout:
                print_wrap("Stopped 3: last var entered also got removed.\n")
                break
        else:
            print_wrap("Stopped 1: all vars available got entered.\n")
        return Logit(y, X[xenter]).fit(disp=0)
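A hedged usage sketch; StepwiseLogit is a hypothetical host class for the fit method above, which reads its thresholds from self.entry and self.stay:

class StepwiseLogit:
    def __init__(self, entry=0.05, stay=0.10):
        self.entry = entry  # p-value threshold for a variable to enter
        self.stay = stay    # p-value threshold for a variable to remain

# StepwiseLogit.fit = fit  # attach the method shown above, then:
# result = StepwiseLogit().fit(X_train, y_train, print_detail=True)
# print(result.summary2())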
Example 12
# =============================================================================
# 1    199032
# 0    199032
# # Now the class is balanced
# =============================================================================

# =============================================================================
# # Model building using logistic regression after SMOTE
# =============================================================================
from statsmodels.api import Logit
import statsmodels.api as sm
Train_X = sm.add_constant(Train_X)
Test_X = sm.add_constant(Test_X)

M1 = Logit(Train_Y, Train_X)  #Model Defination
M1_Model = M1.fit()  #Model Building
M1_Model.summary()  #Model Output/Summary

# Prediction and Validation
Test_X['Test_Prob'] = M1_Model.predict(
    Test_X)  # Store probability predictions in "Test_X" df
Test_X.columns

# Classify 0 or 1 based on 0.5 cutoff
Test_X['Test_Class'] = np.where(Test_X['Test_Prob'] >= 0.5, 1, 0)
Test_X.columns
#Test_X['Test_Class'].value_counts() / len(Test_X)

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(Test_X['Test_Class'], Test_Y))
#0.9914445887901876
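The sklearn import above also brings in confusion_matrix and classification_report, which make a natural follow-up to the accuracy check:

print(confusion_matrix(Test_Y, Test_X['Test_Class']))
print(classification_report(Test_Y, Test_X['Test_Class']))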
Example 13
# Pull the Height and Weight columns into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male,
                  sm.add_constant(hw_exog, prepend=True),
                  family=sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params


# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True))
logit_model2.fit().summary()

# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.

# Get the intercept and slope of the line based on the logit coefficients 
intercept = -logit_pars['const'] / logit_pars['x2']
slope =  -logit_pars['x1'] / logit_pars['x2']
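The separating line comes from setting the linear predictor to zero: const + x1*Height + x2*Weight = 0, i.e. Weight = -const/x2 - (x1/x2)*Height, which is exactly the intercept and slope computed above. A hedged plotting sketch (matplotlib assumed, height range hypothetical):

import numpy as np
import matplotlib.pyplot as plt

heights = np.linspace(55, 80, 100)  # hypothetical range, in inches
plt.plot(heights, intercept + slope * heights, 'k-')
plt.xlabel('Height'); plt.ylabel('Weight')
plt.show()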
Example 14
def fit_model(X, y, co=0.1):
    sm = Logit((y.clip(0, 1) > co).astype(float), X.clip(0, 1), missing='drop')
    return sm.fit(disp=False)
Example 16
print("params")
print(paramsnone)
print("maxscore")
print(scorenone)
print("###################################")
print("params")
print(l2params)
print("maxscore")
print(l2score)
print("###################################")
print("params")
print(l1params)
print("maxscore")
print(l1score)

from statsmodels.api import Logit, add_constant
print("Statistics with constant")
logit_modelb = Logit(y_train, add_constant(X_train))
result = logit_modelb.fit()
print("Summary")
print(result.summary2())

print(
    " ########################################################################"
)
print("Statistics without intercept")
logit_model = Logit(y_train, X_train)
result = logit_model.fit()
print("Summary")
print(result.summary2())
Example 17
Test_X = full_raw_data.loc[full_raw_data['Source'] == 'Test'].drop(['Source', 'Loan_Status'], axis = 1).copy()

Test_Y = full_raw_data.loc[full_raw_data['Source'] == 'Test']
Test_Y = Test_Y['Loan_Status'].copy()
Test_Y.shape

###########################
# Model Building
###########################

# Build logistic regression model (using statsmodels package/library)
# And drop the insignificant variables

from statsmodels.api import Logit
M1 = Logit(Train_Y, Train_X) # (Dep_Var, Indep_Vars) # this is model definition
M1_Model = M1.fit() # This is model building
M1_Model.summary() # This is model output/summary

Cols_to_drop = ['Dependents_3+']
M2 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis = 1))
M2_Model = M2.fit()
M2_Model.summary()

Cols_to_drop.append('Self_Employed_Yes')

M3 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis = 1))
M3_Model = M3.fit()
M3_Model.summary()

Cols_to_drop.append('Gender_Male')
M4 = Logit(Train_Y, Train_X.drop(Cols_to_drop, axis = 1))
Example 19
class LogisticRegression:
    def __init__(self,
                 endog_name_f=None,
                 exog_name_f=None,
                 data_f=None,
                 add_constant_f=True,
                 scale_vars_list_f=list(),
                 interaction_name_f=list(),
                 convert_bool_dict_f=dict(),
                 convert_ord_list_f=list(),
                 cat_col_omit_dict_f=dict(),
                 hier_model_vars_dict_f=dict(),
                 hier_exog_var_names_f=list(),
                 classification_threshold_f=0.5,
                 **kwds):
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f
        self.convert_bool_dict = convert_bool_dict_f
        self.convert_ord_list = convert_ord_list_f
        self.hier_model_vars_dict = hier_model_vars_dict_f
        self.hier_exog_var_names = hier_exog_var_names_f
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f
        self.classification_threshold = classification_threshold_f
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None
        self.est_coef = dict()
        self.exog_matrix = None
        self.endog_matrix = None
        self.fitted_values = None

        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_hier_exog = set(self.exog_name).intersection(
            set(self.hier_exog_var_names))

        if len(t_bool_ord) > 0:
            print(
                'WARNING appearing in both boolean and ordinal variable lists: %s'
                % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print(
                'WARNING appearing in both categorical and ordinal variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print(
                'WARNING appearing in both categorical and boolean variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_bool))
        if len(t_hier_exog) > 0:
            print(
                'WARNING appearing in both exogenous and hierarchical exogenous variable lists: %s'
                % ', '.join(t_hier_exog))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in self.convert_ord_list) and (self.data[x].dtype == 'O')
                )
        ]
        prefix_sep = '_'
        [
            self.cat_col_omit_dict.update(
                {x: self.data[x].mode(dropna=True).values[0]})
            for x in self.cat_col_names
            if x not in list(self.cat_col_omit_dict.keys())
        ]
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]

        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names,
                                  dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_hier_vars(self):
        t_df = pd.DataFrame()
        for c in self.hier_model_vars_dict.keys():
            t_model = LogisticRegression(
                endog_name_f=self.hier_model_vars_dict[c]
                ['external_model'].endog_name,
                exog_name_f=self.hier_model_vars_dict[c]
                ['external_model'].exog_name,
                data_f=self.hier_model_vars_dict[c]['external_model'].data,
                add_constant_f=self.hier_model_vars_dict[c]
                ['external_model'].add_constant,
                scale_vars_list_f=self.hier_model_vars_dict[c]
                ['external_model'].scale_vars_list,
                convert_ord_list_f=self.hier_model_vars_dict[c]
                ['external_model'].convert_ord_list,
                convert_bool_dict_f=self.hier_model_vars_dict[c]
                ['external_model'].convert_bool_dict,
                cat_col_omit_dict_f=self.hier_model_vars_dict[c]
                ['external_model'].cat_col_omit_dict,
                interaction_name_f=self.hier_model_vars_dict[c]
                ['external_model'].interaction_name,
                classification_threshold_f=self.hier_model_vars_dict[c]
                ['classification_threshold'])  #######
            t_model.create_model_object()
            t_pred_prob, t_pred_class = self.hier_model_vars_dict[c][
                'external_model'].make_predictions(
                    pred_data=t_model.exog_matrix,
                    select_coef=self.hier_model_vars_dict[c]['select_coef'])
            t_col_names = list(t_df.columns) + [c, c + '_TF']
            t_df = pd.concat([t_df, t_pred_prob, t_pred_class], axis=1)
            t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'

            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both bool - create interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both cat
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v1_omit = data_f[v1].mode(
                    dropna=True).values[0] if v1 not in list(
                        drop_list_f.keys()) else drop_list_f[v1]

                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v2_omit = data_f[v2].mode(
                    dropna=True).values[0] if v2 not in list(
                        drop_list_f.keys()) else drop_list_f[v2]
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ],
                                         axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # one bool
                if data_f[v1].dtype == bool:
                    vb = v1
                    vd = v2
                else:
                    vb = v2
                    vd = v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                vd_omit = data_f[vd].mode(
                    dropna=True).values[0] if vd not in list(
                        drop_list_f.keys()) else drop_list_f[vd]
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ],
                                     axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_all_data = pd.concat([
            self.data, self.model_data[np.setdiff1d(self.model_data.columns,
                                                    self.data.columns)]
        ],
                               axis=1)
        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(
                data_f=t_all_data,
                v1=int_act_col1,
                v2=int_act_col2,
                drop_list_f=self.cat_col_omit_dict)  #####
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        del int_act_col1, int_act_col2
        self.dummy_col_omit_list = t_dummy_col_omit_list

        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None

        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None

        df_cat_f = self.convert_cat_to_dummies()

        return df_bool_f, df_ord_f, df_cat_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f = self.code_variables()

        self.check_for_exog_conflict()

        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in list(self.convert_ord_list)) and (
                    x not in self.cat_col_names))
        ]

        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None

        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit
        ],
                                    axis=1)

        # -------------
        # add predictions for fold based on estimation of lower model
        if len(self.hier_model_vars_dict) > 0:
            df_hier_f = self.create_hier_vars()
            self.data[df_hier_f.columns] = df_hier_f
            self.model_data[self.hier_exog_var_names] = df_hier_f[
                self.hier_exog_var_names]

        # add interaction variables
        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
            self.model_data[[x for x in df_interaction_f.columns
                             ]] = df_interaction_f

        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print(
                    'Warning: specified variable to scale, %s, is not included in model covariates'
                    % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index,
                             columns=['const']), model_mat
            ],
                                  axis=1)

        self.endog_matrix = model_mat[self.endog_name]
        self.exog_matrix = model_mat[[
            c for c in model_mat.columns if c != self.endog_name
        ]]

        self.model = Logit(endog=self.endog_matrix, exog=self.exog_matrix)

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        self.est_coef.update(
            dict(
                zip(list(self.exog_matrix.columns),
                    self.model_result._results.params)))
        self.make_predictions()  # predict values of training data
        print(self.model_result.summary())

    def make_predictions(self, pred_data=None, select_coef=None):
        def utility_calc(coef_fff, data_fff):
            return np.matmul(np.array(data_fff),
                             np.array(coef_fff).reshape(len(coef_fff),
                                                        1)).flatten()

        def matrix_pred_calc(coef_ff, data_ff):
            return np.exp(utility_calc(coef_ff, data_ff)) / (
                1 + np.exp(utility_calc(coef_ff, data_ff))).flatten()

        def classify_pred(prob_ff, threshold_ff):
            return prob_ff > threshold_ff

        if pred_data is None:
            if select_coef is None:
                self.fitted_values = self.model_result.predict(
                    self.exog_matrix)
                return self.fitted_values, classify_pred(
                    self.fitted_values, self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=self.exog_matrix[select_coef]),
                                   index=self.exog_matrix.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
        else:
            if select_coef is None:
                # pred_data is a DataFrame, so select columns by name
                t_pred = self.model_result.predict(pred_data[[
                    x for x in pred_data.columns
                    if x in list(self.est_coef.keys())
                ]])
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
            else:
                t_pred = pd.Series(matrix_pred_calc(
                    coef_ff=[self.est_coef.get(key) for key in select_coef],
                    data_ff=pred_data[select_coef]),
                                   index=pred_data.index)
                return t_pred, classify_pred(t_pred,
                                             self.classification_threshold)
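A hedged sketch of how the class above might be driven; the tiny frame and column names are hypothetical and only illustrate the constructor arguments:

import pandas as pd

df = pd.DataFrame({'bought': [0, 1, 1, 0, 1, 0, 1, 0],
                   'income': [40., 85., 60., 30., 95., 65., 50., 45.],
                   'region': ['N', 'S', 'S', 'N', 'W', 'S', 'N', 'W']})
lr = LogisticRegression(endog_name_f='bought',
                        exog_name_f=['income', 'region'],
                        data_f=df,
                        scale_vars_list_f=['income'])
# With enough rows this fits the Logit and prints the summary; a sample this
# small is likely to be (quasi-)separated, so the calls are left commented.
# lr.estimate_model()
# probs, classes = lr.make_predictions()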
Example 20
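# NOTE: this snippet relies on the long-deprecated pandas Panel / .ix API
# (removed in pandas 1.0), so it will not run on modern pandas as written.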
    for poly in polys[target_gene]:

        in_central = poly.contains_points(
            atlas_coords.ix[:, ['X', 'Z'], time_point].T
        )
        not_expr = atlas_expr.ix[:, target_gene, time_point] < co
        in_central |= not_expr
        print(sum(in_central))
        #in_central =  (x_coord < 45)
        #in_central = x_coord_scale < 0.6

        #fitter = logistic.LogisticRegression(fit_intercept=False)
        #fitter.fit(X.ix[in_central, :], y.ix[in_central] > co)

        sm_fitter = Logit( y.ix[in_central].clip(0, 1), X.ix[in_central].clip(0, 1))
        sm_fit = sm_fitter.fit()

        Y_tmp = atlas_expr.ix[in_central, target_gene,time_point].copy()
        Y_tmp /= Y_tmp.max()
        Y_tmp = 1.0 * (Y_tmp > .5)

        all_regs = atlas_expr.ix[:, all_regs, time_point].count(axis=1) > 0
        all_regs = all_regs.index[all_regs]


        #if True:
        #if (poly == poly1) or (poly == poly2) or (poly == poly12):
        if target_gene == 'hb':
            #best_tfs = ['bcdP', 'hkb', 'hkb2', 'KrP', 'bcdP2', 'const']
            #best_tfs = ['bcdP', 'bcdP2', 'gtP', 'kni', 'hkb',  'KrP', 'const']
            #best_tfs = atlas_expr.major_axis
Example 21
# ## Model

# ### Logistic Regression

# In[ ]:


titanic_ = add_constant(titanic)


# In[ ]:


model_ = Logit(titanic_['Survived'], titanic_.drop(['Survived'], axis=1))
result = model_.fit(); result.summary()


# In[ ]:


odd_ratio = np.exp(result.params); odd_ratio
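

# In[ ]:


# Interpretation: exp(beta) multiplies the odds of Survived=1 per one-unit
# increase in the predictor. Illustrative arithmetic (coefficient made up):
np.exp(-0.04)  # ≈ 0.961, i.e. about 3.9% lower odds per extra unit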


# ### Extract the target variable
# Build a dataframe with X as the inputs and y as the target (Survived)

# In[ ]:


y = titanic.Survived.copy() # copy "y" column values out
Example 23
def fit_model(df,
              formula,
              title="Full",
              fp=None,
              filename="Model",
              save=False):
    """
  Function to fit model, collect stats and save predictions and model.
  df: dataframe
  formula: formula
  title: title of model (Default: "Full")
  fp: File pointer (Default: None)
  filename: Model and data file prefix ("Model")
  save: Weather to save predictions, model or both or none ["Both", "Data", "Model", False] (Default: False)
  """
    if df.shape[0] < 10:
        print "Too less instances. Skipping. Make sure you have atleast 10 instances."
        return None, None
    print "Modelling Model[%s] with instances %s" % (title, df.shape[0])
    print "Using formula:\n %s" % (formula)
    print "Generating patsy matrices"
    y, X = patsy.dmatrices(formula, df, return_type="dataframe")
    print "Initializing model"
    model = Logit(y, X)
    print "Fitting model"
    res = model.fit()
    print title, "\n", res.summary2()
    print "Confusion Matrix:", res.pred_table()
    precision = ems.precision(res.pred_table())
    recall = ems.recall(res.pred_table())
    accuracy = ems.accuracy(res.pred_table())
    f_score = ems.fscore_measure(res.pred_table())
    rmse = ems.rmse(res.predict(), model.endog)
    mae = ems.mae(res.predict(), model.endog)
    auc = ems.auc(res.predict(), model.endog)
    prc = ems.prc(res.predict(), model.endog)
    prc_filename = "%s.pdf" % filename
    plot_prc(prc, prc_filename)
    evaluation_metrics = "[Model Measures]: Confusion Matrix: %s\nRMSE: %s\tMAE: %s\tAUC: %s\nPrecision: %s\tRecall: %s\tAccuracy: %s\tF1-Score: %s\nPRC:\n%s" % (
        res.pred_table(), rmse, mae, auc, precision, recall, accuracy, f_score,
        prc_filename)
    print(evaluation_metrics)
    print("[save=%s]" % save, "" if save else "Not", "Saving Model to %s" % filename)
    if fp is not None:
        print("Modelling Model[%s] with instances %s" % (title, df.shape[0]), file=fp)
        print("Using formula:\n %s" % (formula), file=fp)
        print(title, "\n", res.summary2(), file=fp)
        print(evaluation_metrics, file=fp)
        print("[save=%s]" % save, "" if save else "Not", "Saving Model to %s" % filename, file=fp)
    model_save, data_save = False, False
    if save == "Both":
        model_save, data_save = True, True
    if save == "Model" or model_save:
        model_file = "%s.pkl" % filename
        res.save(model_file, remove_data=True)  # Save model
    if save == "Data" or data_save:
        data_file = "%s.data.txt" % filename  # Include predictions
        print "df.index", df.index
        save_data(df[["from_id", "is_self_cite"]],
                  res.predict(),
                  filename=data_file)
    print "Done Saving"
    return model, res
Example 24
def logregress_loose(X, y, *args, **kwargs):
    X = list(zip(*(_series(x) for x in X)))
    y = _series(y)
    model = Logit(y, X)
    result = model.fit(*args, **kwargs)
    return result.summary()
Example 25
class LogisticRegression:
    def __init__(self,
                 endog_name_f=None,
                 exog_name_f=None,
                 data_f=None,
                 add_constant_f=True,
                 scale_vars_list_f=list(),
                 interaction_name_f=list(),
                 convert_bool_dict_f=dict(),
                 convert_ord_list_f=list(),
                 cat_col_omit_dict_f=dict(),
                 **kwds):
        self.endog_name = endog_name_f
        self.exog_name = exog_name_f
        self.data = data_f.reindex()
        self.add_constant = add_constant_f
        self.interaction_name = interaction_name_f
        self.convert_bool_dict = convert_bool_dict_f
        self.convert_ord_list = convert_ord_list_f
        self.cat_col_names = list()
        self.cat_col_omit_dict = cat_col_omit_dict_f
        self.cat_col_drop_names = list()
        self.dummy_col_omit_list = list()
        self.scale_vars_list = scale_vars_list_f
        self.exog_name_model = None
        self.model_data = None
        self.model = None
        self.model_result = None

        self.refresh_model_data()

    def check_for_exog_conflict(self):
        t_bool_ord = set(self.convert_bool_dict.keys()).intersection(
            set(self.convert_ord_list))
        t_cat_bool = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_bool_dict.keys()))
        t_cat_ord = set(self.cat_col_omit_dict.keys()).intersection(
            set(self.convert_ord_list))

        if len(t_bool_ord) > 0:
            print(
                'WARNING appearing in both boolean and ordinal variable lists: %s'
                % ', '.join(t_bool_ord))
        if len(t_cat_ord) > 0:
            print(
                'WARNING appearing in both categorical and ordinal variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_ord))
        if len(t_cat_bool) > 0:
            print(
                'WARNING appearing in both categorical and boolean variable lists: %s, ignoring categorical'
                % ', '.join(t_cat_bool))

    def convert_cat_to_dummies(self):
        # get list of exogenous variables that are categorical and need to be converted
        self.cat_col_names = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in self.convert_ord_list) and (self.data[x].dtype == 'O')
                )
        ]
        prefix_sep = '_'
        [
            self.cat_col_omit_dict.update(
                {x: self.data[x].mode(dropna=True).values[0]})
            for x in self.cat_col_names
            if x not in list(self.cat_col_omit_dict.keys())
        ]
        self.cat_col_drop_names = [
            k + prefix_sep + v for k, v in self.cat_col_omit_dict.items()
        ]

        if len(self.cat_col_names) > 0:
            return pd.get_dummies(self.data[self.cat_col_names],
                                  prefix_sep=prefix_sep,
                                  columns=self.cat_col_names,
                                  dtype=bool)
        else:
            return None

    def convert_to_bool(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for k, v in self.convert_bool_dict.items():
            t_col_names.append(k + '_' + v + '_TF')
            t_df = pd.concat([t_df, self.data[k] == v], axis=1)
        t_df.columns = t_col_names
        return t_df

    def convert_to_ordinal(self):
        t_df = pd.DataFrame()
        t_col_names = list()
        for c in self.convert_ord_list:
            t_col_names.append(c + '_ORD')
            t_df = pd.concat([t_df, self.data[c].astype(int)], axis=1)
        t_df.columns = t_col_names
        return t_df

    def create_interactions(self):
        def create_dummy_df(data_f, v1, v2, drop_list_f):
            prefix_sep = '_'

            if (data_f[v1].dtype == bool) and (data_f[v2].dtype == bool):
                # both bool - create interaction effect directly
                t_df = pd.DataFrame(data_f[v1] & data_f[v2],
                                    columns=[v1 + ' * ' + v2 + '_INT'])
                return t_df, ({v1: None}, {v2: None})
            elif (data_f[v1].dtype != bool) and (data_f[v2].dtype != bool):
                # both cat
                v1_dummies = pd.get_dummies(data_f[v1],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v1_omit = data_f[v1].mode(
                    dropna=True).values[0] if v1 not in list(
                        drop_list_f.keys()) else drop_list_f[v1]

                v2_dummies = pd.get_dummies(data_f[v2],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                v2_omit = data_f[v2].mode(
                    dropna=True).values[0] if v2 not in list(
                        drop_list_f.keys()) else drop_list_f[v2]
                t_df = pd.DataFrame(index=data_f.index)
                for c1 in [x for x in v1_dummies.columns if x != v1_omit]:
                    for c2 in [x for x in v2_dummies.columns if x != v2_omit]:
                        t_df = pd.concat([
                            t_df,
                            pd.DataFrame(v1_dummies[c1] & v2_dummies[c2],
                                         columns=[c1 + ' * ' + c2 + '_INT'])
                        ],
                                         axis=1)
                return t_df, ({v1: v1_omit}, {v2: v2_omit})
            else:
                # one bool
                if data_f[v1].dtype == bool:
                    vb = v1
                    vd = v2
                else:
                    vb = v2
                    vd = v1
                vd_dummies = pd.get_dummies(data_f[vd],
                                            prefix_sep=prefix_sep,
                                            dtype=bool)
                vd_omit = data_f[vd].mode(
                    dropna=True).values[0] if vd not in list(
                        drop_list_f.keys()) else drop_list_f[vd]
                t_df = pd.DataFrame(index=data_f.index)
                for c in [x for x in vd_dummies.columns if x != vd_omit]:
                    t_df = pd.concat([
                        t_df,
                        pd.DataFrame(data_f[vb] & vd_dummies[c],
                                     columns=[vb + ' * ' + c + '_INT'])
                    ],
                                     axis=1)
                return t_df, ({vb: None}, {vd: None})

        t_df = pd.DataFrame(index=self.data.index)
        t_dummy_col_omit_list = list()
        for int_act_col1, int_act_col2 in self.interaction_name:
            t_dummy, t_dummy_omit = create_dummy_df(self.data, int_act_col1,
                                                    int_act_col2,
                                                    self.cat_col_omit_dict)
            t_df = pd.concat([t_df, t_dummy], axis=1)
            t_dummy_col_omit_list.append(t_dummy_omit)
            del t_dummy, t_dummy_omit
        del int_act_col1, int_act_col2
        self.dummy_col_omit_list = t_dummy_col_omit_list

        return t_df

    def code_variables(self):
        # get new variable matrices
        if len(self.convert_bool_dict) > 0:
            df_bool_f = self.convert_to_bool()
        else:
            df_bool_f = None

        if len(self.convert_ord_list) > 0:
            df_ord_f = self.convert_to_ordinal()
        else:
            df_ord_f = None

        df_cat_f = self.convert_cat_to_dummies()

        if len(self.interaction_name) > 0:
            df_interaction_f = self.create_interactions()
        else:
            df_interaction_f = None

        return df_bool_f, df_ord_f, df_cat_f, df_interaction_f

    def refresh_model_data(self):
        df_bool_f, df_ord_f, df_cat_f, df_interaction_f = self.code_variables()

        self.check_for_exog_conflict()

        t_remain_exog = [
            x for x in self.exog_name
            if ((x not in list(self.convert_bool_dict.keys())) and (
                x not in list(self.convert_ord_list)) and (
                    x not in self.cat_col_names))
        ]

        if df_cat_f is not None:
            df_cat_f_dropped_omit = df_cat_f[[
                c for c in df_cat_f.columns if c not in self.cat_col_drop_names
            ]]
        else:
            df_cat_f_dropped_omit = None

        self.model_data = pd.concat([
            self.data[self.endog_name], self.data[t_remain_exog], df_bool_f,
            df_ord_f, df_cat_f_dropped_omit, df_interaction_f
        ],
                                    axis=1)

        self.exog_name_model = [
            x for x in self.model_data if x != self.endog_name
        ]

    def create_model_object(self):
        model_mat = copy.deepcopy(self.model_data)

        # convert booleans to floats explicitly
        for c in model_mat.columns:
            if model_mat[c].dtype == bool:
                model_mat[c] = model_mat[c].astype(float)

        # scale specified vars to N(0,1)
        for c in self.scale_vars_list:
            try:
                xbar = model_mat[c].mean()
                s = model_mat[c].std()
                model_mat[c] = model_mat[c].apply(lambda x: (x - xbar) / s)
                del xbar, s
            except KeyError:
                print(
                    'Warning: specified variable to scale, %s, is not included in model covariates'
                    % c)

        # drop rows with na
        model_mat.dropna(inplace=True)

        # add constant if needed
        if self.add_constant:
            model_mat = pd.concat([
                pd.DataFrame(data=[1] * model_mat.shape[0],
                             index=model_mat.index,
                             columns=['const']), model_mat
            ],
                                  axis=1)

        self.model = Logit(endog=model_mat[self.endog_name],
                           exog=model_mat[[
                               c for c in model_mat.columns
                               if c != self.endog_name
                           ]])

    def estimate_model(self):
        self.refresh_model_data()
        self.create_model_object()
        self.model_result = self.model.fit()
        print(self.model_result.summary())