def run_statsmodels_models(train, test, model_description):
    """Fit a logistic regression and report the holdout AUC.

    Predicts whether a signed up driver ever actually drove.

    :param train: training data frame prepared for the patsy formula
    :type train: pd.DataFrame
    :param test: holdout data frame prepared for the patsy formula
    :type test: pd.DataFrame
    :param model_description: patsy formula describing the model
    :type model_description: str
    :return: AUC for model generated
    :rtype: float
    """
    # Run model on all observations
    # Use dmatrices to format data; rows with missing values are dropped.
    logging.info('Running model w/ description: %s' % model_description)
    logging.debug('Train df: \n%s' % train.describe())
    logging.debug('Test df: \n%s' % test.describe())
    y_train, X_train = dmatrices(model_description, data=train,
                                 return_type='dataframe', NA_action='drop')
    y_test, X_test = dmatrices(model_description, data=test,
                               return_type='dataframe', NA_action='drop')

    # Create, fit model
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary (converted from Python 2 print statements)
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC on the holdout set
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
def _set_XY(self):
    """Build the (Y, X) design matrices for the configured regression type.

    "pooled" returns the raw patsy matrices; "fe" additionally applies the
    within (fixed-effects) transform using the entity index ``self.data.i``.
    """
    if self.regression_type == "pooled":
        return dmatrices(self.formula, self.data, return_type="dataframe")
    if self.regression_type == "fe":
        idx = self.data.i
        Y, X = dmatrices(self.formula, self.data, return_type="dataframe")
        return fixed_effects_transform(Y, idx), fixed_effects_transform(X, idx)
    raise ValueError("Regression type %s not implemented." % self.regression_type)
def _set_XY(self):
    """Return design matrices (Y, X), applying the FE transform when requested."""
    if self.regression_type not in ('pooled', 'fe'):
        raise ValueError('Regression type %s not implemented.' % self.regression_type)
    matrices = dmatrices(self.formula, self.data, return_type='dataframe')
    if self.regression_type == 'pooled':
        return matrices
    # Fixed effects: demean both sides within each entity given by data.i.
    idx = self.data.i
    Y, X = matrices
    Y = fixed_effects_transform(Y, idx)
    X = fixed_effects_transform(X, idx)
    return Y, X
def fit_model(self, in_csv_1, in_csv_2, column_flag):
    """Fit a logistic regression separating two CSV samples.

    Rows from ``in_csv_1`` are labelled 0 and rows from ``in_csv_2`` are
    labelled 1; the model is ``y ~ x`` on the ``column_flag`` values.
    """
    logger.info(f'Run logistic regression')
    logger.info(f'Reading {in_csv_1}')
    rvs1 = pd.read_csv(in_csv_1)[column_flag].to_numpy()
    logger.info(f'Data length {len(rvs1)}')
    logger.info(f'Reading {in_csv_2}')
    rvs2 = pd.read_csv(in_csv_2)[column_flag].to_numpy()
    logger.info(f'Data length {len(rvs2)}')

    # Stack the two samples and label them 0 / 1 respectively.
    x = np.concatenate([rvs1, rvs2]).astype(float)
    y = np.concatenate([np.zeros(len(rvs1), dtype=int),
                        np.ones(len(rvs2), dtype=int)])

    self.data = {'x': x, 'y': y}
    Y, X = dmatrices('y ~ x', self.data)
    self.logit_model = sm.Logit(Y, X)
    self.logit_result = self.logit_model.fit()

    # Pad the plotting range by 5% of the data span on each side.
    data_range = np.max(x) - np.min(x)
    self.view_range_min = np.min(x) - 0.05 * data_range
    self.view_range_max = np.max(x) + 0.05 * data_range

    print(self.logit_result.summary())
    print(f'Estimated params: {self.logit_result.params}')
def freq_pat_selection():
    """Screen frequent-pattern features against each trait with OLS.

    For every feature CSV in the input directory, regress each trait on the
    feature and append the model summary to a per-trait output file whenever
    the feature coefficient's p-value is <= 0.1.
    """
    input_dir = os.path.join(CUR_DIR, 'result', 'feature', 'freq_pat', 'normalized', 'support40')
    output_dir = os.path.join(CUR_DIR, 'result', 'feature', 'freq_pat_select', 'normalized', 'support40')
    for file_name in os.listdir(input_dir):  # renamed: 'file' shadows a builtin
        input_fp = os.path.join(input_dir, file_name)
        feature = file_name[:-4]  # strip the '.csv' extension
        df = pandas.read_csv(input_fp)
        for i in range(len(TRAITS)):
            response = TRAITS[i]
            # Q("...") quotes feature names that are not valid Python identifiers.
            y, X = dmatrices('%s ~ Q("%s")' % (response, feature), data=df)
            mod = sm.OLS(y, X)
            res = mod.fit()
            if res.pvalues[1] <= 0.1:
                output_folder = os.path.join(output_dir, response)
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)
                output_fp = os.path.join(output_folder, feature + '.txt')
                # 'with' guarantees the handle is closed even if a write fails
                # (the original leaked the handle on exceptions).
                with open(output_fp, 'a') as fw:
                    fw.write("#####################################################################################################\n")
                    fw.write(response + '\n')
                    fw.write(str(res.summary()) + '\n')
def do_(self, *args):
    """Fit OLS of '<col0> ~ <col1>' on the fetched result; return the summary as HTML."""
    column_list, result = self.get_result(*args)
    formula = '%s ~ %s' % tuple(column_list)
    y, X = dmatrices(formula, data=result, return_type='dataframe')
    fitted = sm.OLS(y, X).fit()
    return fitted.summary().as_html()
def logit():
    """Fit a logistic regression of 'extra' on wifi features and print the summary."""
    fp = r"data\matrix_data\logit\wifi_features_extra.csv"
    df = pandas.read_csv(fp)
    y, X = dmatrices('extra ~ wifi_features + end_time_var + fq_home', data=df)
    mod = sm.Logit(y, X)
    res = mod.fit()
    # Converted from a Python 2 print statement.
    print(res.summary())
def stay():
    """Load the survey workbook and build design matrices for the 'Stay' model.

    :return: (y, X) where y is a flat numpy array and X a patsy DataFrame
    """
    df = pd.read_excel('new.xlsx', 'Sheet1', header=0)
    formula = ('Stay ~ Age + Gender +Income+ TravelBudget+RecentVisit1+RecentVisit2+'
               ' Foreign+ EatingHabit + Purpose + PreferredPartner ')
    y, X = dmatrices(formula, df, return_type='dataframe')
    return np.ravel(y), X
def lr(model_formula, data_df, print_mse=True):
    """Fit an OLS model described by a patsy formula.

    :param model_formula: patsy formula string
    :param data_df: data frame the formula refers to
    :param print_mse: when True, print the training-set mean squared error
    :return: fitted statsmodels results object
    """
    y_train, X_train = dmatrices(model_formula, data=data_df, return_type='dataframe')
    result = sm.OLS(y_train, X_train).fit()
    if print_mse:
        predictions = result.predict(X_train)
        print(f'MSE = {metrics.mean_squared_error(y_train, predictions)}')
    return result
def getXsYData(self, df, yIndex, xlist):
    """Build (y, X) design matrices for ``yIndex ~ x1 + x2 + ...``.

    The automatic patsy intercept column is dropped so X contains only the
    requested predictors.
    """
    modelelements = ' + '.join(xlist)
    formula = yIndex + ' ~ ' + modelelements
    y, X = dmatrices(formula, data=df, return_type='dataframe')
    # drop(columns=...) — the positional 'axis' argument was removed in pandas 2.0.
    X = X.drop(columns='Intercept')
    return y, X
def getSelectedData(self, df, yIndex, xlist):
    """Return a frame of the selected predictors plus the response column.

    The automatic patsy intercept column is dropped before concatenation.
    """
    modelelements = ' + '.join(xlist)
    formula = yIndex + ' ~ ' + modelelements
    y, x = dmatrices(formula, data=df, return_type='dataframe')
    # drop(columns=...) — the positional 'axis' argument was removed in pandas 2.0.
    x = x.drop(columns='Intercept')
    return pd.concat([x, y], axis=1)
def fit(formula: str, data: xr.Dataset) -> RegModel:
    """Fit a direct OLS regression for *formula* on *data*, wrapped in a RegModel."""
    y, x = dmatrices(formula, data)
    # Convert the patsy matrices to tensors and solve for the coefficients.
    betas: tf.Tensor = OLS.fit_exec(tf.constant(y), tf.constant(x))
    return RegModel(data, y, x, "Direct Ordinary Least Squares", betas)
def single_vrb(feature):
    """Regress every trait on a single feature and print each OLS summary."""
    input_fp = os.path.join(CUR_DIR, "result", "feature", feature + ".csv")
    df = pandas.read_csv(input_fp)
    # print() calls: converted from Python 2 print statements.
    print(df)
    for i in range(len(TRAITS)):
        print("#####################################################################################")
        print(TRAITS[i])
        y, X = dmatrices("%s ~ %s" % (TRAITS[i], feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
def regress(new_data=HISTORICAL_DATA):
    """OLS of signups ('mu') on marketing channels; return coefficient confidence intervals."""
    frame = DataFrame.from_dict(new_data)
    y, X = dmatrices(
        'mu ~ wordpress + roadshow + email + facebook + google + new_coffees',
        data=frame,
        return_type='dataframe')
    fitted = sm.OLS(y, X).fit()
    return fitted.conf_int()
def split_data(df, model_formula, test_size, random_state=None):
    """Split *df* into stratified train/test design matrices.

    Returns (x_train, x_test, y_train, y_test) with the y arrays flattened
    and any patsy 'Intercept' column removed from both matrices.
    """
    matrices = dmatrices(model_formula, data=df, return_type='dataframe')
    y, x = matrices
    # Drop the automatic intercept column from both sides.
    x = x[x.columns.difference(['Intercept'])]
    y = y[y.columns.difference(['Intercept'])]
    # Stratify on the response named on the left-hand side of the formula.
    target = model_formula.split('~')[0].strip()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y[target])
    return x_train, x_test, y_train.values.ravel(), y_test.values.ravel()
def multi_feature_single_trait(feature_names, trait):
    """Regress *trait* on each feature in *feature_names* and print the OLS summaries."""
    file_name = "-".join(feature_names) + ".csv"
    input_fp = os.path.join(CUR_DIR, "result", "feature", file_name)
    df = pandas.read_csv(input_fp)
    # print() calls: converted from Python 2 print statements.
    print(df)
    for feature in feature_names:
        print("#####################################################################################")
        print(feature)
        y, X = dmatrices("%s ~ %s" % (trait, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
def multi_vrb():
    """Regress each label on the three wifi features and print the OLS summaries."""
    input_fp = os.path.join(CUR_DIR, 'data', 'matrix_data', 'all_wifi_features.csv')
    df = pandas.read_csv(input_fp)
    # print() calls: converted from Python 2 print statements.
    print(df)
    for i in range(len(LABELS)):
        print("#####################################################################################################")
        print(LABELS[i])
        y, X = dmatrices('%s ~ edit_dist + start_time_var + end_time_var' % LABELS[i], data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
def multicollinearity_test(endog, exdog, data):
    """Compute VIF factors to check the design matrix for multicollinearity.

    :param endog: The dependent variable. String
    :param exdog: The independent variable. String, different variables are connected by '+'
    :param data: The data set. DataFrame
    :return: vif: VIF-factor. DataFrame
    """
    formula = endog + '~' + exdog
    _, design = dmatrices(formula, data=data, return_type='dataframe')
    factors = [variance_inflation_factor(design.values, col)
               for col in range(design.shape[1])]
    vif = pd.DataFrame()
    vif['VIF Factor'] = factors
    vif['features'] = design.columns
    return vif
def run_statsmodels_models(train, test, model_description):
    """Fit a logistic regression and report the holdout AUC.

    Predicts whether a signed up driver ever actually drove.

    :param train: training data frame prepared for the patsy formula
    :type train: pd.DataFrame
    :param test: holdout data frame prepared for the patsy formula
    :type test: pd.DataFrame
    :param model_description: patsy formula describing the model
    :type model_description: str
    :return: AUC for model generated
    :rtype: float
    """
    # Run model on all observations
    # Use dmatrices to format data; rows with missing values are dropped.
    logging.info('Running model w/ description: %s' % model_description)
    logging.debug('Train df: \n%s' % train.describe())
    logging.debug('Test df: \n%s' % test.describe())
    y_train, X_train = dmatrices(model_description, data=train,
                                 return_type='dataframe', NA_action='drop')
    y_test, X_test = dmatrices(model_description, data=test,
                               return_type='dataframe', NA_action='drop')

    # Create, fit model
    mod = sm.Logit(endog=y_train, exog=X_train)
    res = mod.fit(method='bfgs', maxiter=100)

    # Output model summary (converted from Python 2 print statements)
    print(train['city_name'].value_counts())
    print(train['signup_channel'].value_counts())
    print(res.summary())

    # Create, output AUC on the holdout set
    predicted = res.predict(X_test)
    auc = roc_auc_score(y_true=y_test, y_score=predicted)
    print('AUC for 20%% holdout: %s' % auc)

    # Return AUC for model generated
    return auc
def fit_model_intercept(self, x, y):
    """Fit an intercept-only logistic regression ('y ~ 1') as a baseline."""
    logger.info('Run logistic regression with intercept only')
    self.data = {'x': x, 'y': y}
    design_y, design_x = dmatrices('y ~ 1', self.data)
    self.logit_model_intercept = sm.Logit(design_y, design_x)
    self.logit_result_intercept = self.logit_model_intercept.fit()
    # Pad the plot range by 5% of the data span on each side.
    span = np.max(x) - np.min(x)
    self.view_range_min = np.min(x) - 0.05 * span
    self.view_range_max = np.max(x) + 0.05 * span
    self.show_regression_result(self.logit_result_intercept)
def single_vrb(feature):
    """Regress every label on a single wifi feature and print the OLS summaries."""
    input_fp = os.path.join(CUR_DIR, 'data', 'matrix_data', 'feature_' + feature + '.csv')
    df = pandas.read_csv(input_fp)
    # print() calls: converted from Python 2 print statements.
    print(df)
    for i in range(len(LABELS)):
        print("#####################################################################################################")
        print(LABELS[i])
        y, X = dmatrices('%s ~ %s' % (LABELS[i], feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        print(res.summary())
def fit_model_third(self, x, y):
    """Fit a third-order (cubic) logistic regression of y on x."""
    # Message fixed: the formula below includes a cubic term, not just quadratic.
    logger.info('Run third order logistic regression')
    self.data = {'x': x, 'y': y}
    Y, X = dmatrices('y ~ x + np.power(x, 2) + np.power(x, 3)', self.data)
    self.logit_model_third = sm.Logit(Y, X)
    self.logit_result_third = self.logit_model_third.fit()
    # Pad the plot range by 5% of the data span on each side.
    data_range = np.max(x) - np.min(x)
    self.view_range_min = np.min(x) - 0.05 * data_range
    self.view_range_max = np.max(x) + 0.05 * data_range
    self.show_regression_result(self.logit_result_third)
    # Three predictor terms -> 3 degrees of freedom for the HL test.
    self.run_HL_test(3)
def fit_model_linear(self, x, y):
    """Fit a first-order logistic regression of y on x and run the HL test."""
    logger.info('Run first order logistic regression')
    self.data = {'x': x, 'y': y}
    design_y, design_x = dmatrices('y ~ x', self.data)
    self.logit_model_linear = sm.Logit(design_y, design_x)
    self.logit_result_linear = self.logit_model_linear.fit()
    # Pad the plot range by 5% of the data span on each side.
    span = np.max(x) - np.min(x)
    self.view_range_min = np.min(x) - 0.05 * span
    self.view_range_max = np.max(x) + 0.05 * span
    self.show_regression_result(self.logit_result_linear)
    self.run_HL_test(1)
def write_single_vrb_to_txt(input_fp, output_dir, feature):
    """Regress each label on *feature*; append significant (p <= 0.05) summaries to per-label files."""
    df = pandas.read_csv(input_fp)
    for i in range(len(LABELS)):
        response = LABELS[i]
        # Q("...") quotes feature names that are not valid Python identifiers.
        y, X = dmatrices('%s ~ Q("%s")' % (response, feature), data=df)
        mod = sm.OLS(y, X)
        res = mod.fit()
        if res.pvalues[1] <= 0.05:
            output_folder = os.path.join(output_dir, response)
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            # 'with' closes the file even on error (original leaked on failure);
            # 'summary_text' avoids shadowing the builtin 'sum'.
            with open(os.path.join(output_folder, 'summary_' + feature + '.txt'), 'a') as fw:
                fw.write("#####################################################################################################\n")
                fw.write(response + '\n')
                summary_text = str(res.summary())
                fw.write(summary_text + '\n')
def LogReg():
    """Read the form inputs, fit a hire/no-hire logistic regression, and show the prediction."""
    n = name.get()  # read for parity with the form; not used by the model
    p = int(percent.get())
    b = int(backlog.get())
    i = int(intern.get())
    f = int(first.get())
    c = int(comm.get())
    df = pd.read_excel('dataSet.xlsx', 'Sheet1', header=0)
    y, X = dmatrices(
        'Hire ~ Percentage + Backlog + Internship + First_Round + Communication_Skills',
        df, return_type='dataframe')
    model = LogisticRegression().fit(X, np.ravel(y))
    # Leading 1 is the intercept column produced by patsy.
    features = np.array([1, p, b, i, f, c]).reshape(1, -1)
    hired = int(model.predict(features))
    lr.set("Hire" if hired == 1 else "Not Hire")
def linearRegression():
    """Run the regression experiment: whole-data errors, a 75/25 split, ten-fold CV, and regularisation tuning."""
    printLMAndROHeader()
    df = pd.DataFrame(x, columns=labels)
    y, X = dmatrices(generateLabels(), df, return_type='matrix')
    y = np.ravel(y)
    regressionmodel = createRegressionModel(X, y)
    errorCount = inferErrors(regressionmodel, X, y)
    # print() calls: converted from Python 2 print statements.
    print('Errors in whole data: ' + str(errorCount))
    xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.25, random_state=np.random)
    regressionModel = createRegressionModel(xTrain, yTrain)
    trainErrorCount = inferErrors(regressionModel, xTrain, yTrain)
    print('Errors in randomized training data (3/4): ' + str(trainErrorCount))
    testErrorCount = inferErrors(regressionModel, xTest, yTest)
    print('Errors in randomized test data (1/4): ' + str(testErrorCount))
    numOfTrainErrors, numOfTestErrors = tenFoldExperiment(X, y)
    createBoxPlot(numOfTrainErrors, numOfTestErrors)
    calculateOptimalRegParam(xTrain, yTrain, xTest, yTest, True)
def train(self, observed_data: pd.DataFrame, issue_times: pd.DatetimeIndex) -> None:
    """Fit one quantile regressor per configured quantile level on the de-duplicated data."""
    resampled_data, unique_inverse = self.unique_data(observed_data, issue_times)
    y, X = dmatrices(self.formula, resampled_data)
    targets = y.flatten()
    self.models = []
    for quantile_level in self.quantile_levels.fractions:
        regressor = QuantileRegressor(quantile=quantile_level, max_iter=self.max_iter)
        model = regressor.fit(X, targets)
        if model.n_iter_ >= self.max_iter:
            print(f"Training for model {quantile_level} stopped due to iteration limit.")
        else:
            print(f"Training for model {quantile_level} finished.")
            print("Iterations: ", model.n_iter_)
            print("Coefficients:")
            print(model.coef_)
            print("Intercept: ", model.intercept_)
            # NOTE(review): 'gamma_' is not a standard sklearn QuantileRegressor
            # attribute — presumably a custom estimator here; confirm.
            print("Gamma: ", model.gamma_)
        self.models.append(model)
def linreg():
    """Build the model formula from the module globals, run the linear regression,
    and return the fitted OLS results.

    Side effects: sets the globals ``full_model``, ``levels``, ``X`` and ``y``;
    prints the model summary; appends it to the file named by ``o`` when set.
    """
    # actual linear regression
    print("Model Results: ")
    # printing the corrected model_string
    model_string = []
    model_string.append(dep_var)
    model_string.append(" ~ ")
    for i in range(0, len(full_model_variable_list)):
        model_string.append(full_model_variable_list[i])
        model_string.append(" + ")
    model_string.pop(-1)  # drop the trailing " + "
    global full_model
    full_model = ''.join(model_string)
    print(full_model)  # prints model
    print()
    print(
        "***********************************************************************************************************"
    )
    print()
    index = 0
    global levels  # also used in contrasting()
    levels = []
    # Locate the contrast column named by 'c' in the header row, then collect
    # and re-encode its distinct levels as 0..k-1.
    for i in range(len(condensed_data[0])):
        if c == condensed_data[0][i]:
            index = i
    for i in range(1, len(condensed_data)):
        if condensed_data[i][index] not in levels:
            levels.append(condensed_data[i][index])
    for i in range(len(levels)):
        levels[i] = i
    # Beginning of the linear regression
    global X
    global y
    if "*" in m:
        # correcting the format of the model string
        model_string = []
        model_string.append(dep_var)
        model_string.append(" ~ ")
        for i in range(0, len(full_model_variable_list)):
            model_string.append(full_model_variable_list[i])
            model_string.append(" + ")
        model_string.pop(-1)
        # patsy uses ':' (not '*') for a pure interaction term;
        # makes sure the model is in the right format.
        for i in range(0, len(model_string)):
            if "*" in model_string[i]:
                replacement = model_string[i].split("*")
                model_string[i] = replacement[0] + ":" + replacement[1]
        string = ''.join(model_string)
        y, X = dmatrices(string, df_final)
    else:
        X = df_final[
            independentvariables]  # gets the modified values of the independent variables
        y = df_final[
            dep_var]  # gets the modified values of the dependent variable
    if not c:
        # The linear regression
        regressor = LinearRegression()
        regressor.fit(X, y)
        regression = regressor.fit(X, y)
        # Data about the linear regression, starting without contrast
        X2 = sm.add_constant(X)
        statistics = sm.OLS(y, X2)
        finalstats = statistics.fit()
        print(finalstats.summary())
        if (o is not None):
            # BUG FIX: the original reassigned sys.stdout to the output file and
            # then closed it WITHOUT restoring the real stdout, so every later
            # print() raised on a closed stream. Write to the file explicitly.
            with open(o, "a") as out_file:
                print(full_model, file=out_file)
                print(
                    "\n*************************************************************************************\n",
                    file=out_file)
                print(finalstats.summary(), file=out_file)
        return finalstats
from math import sqrt
from patsy.highlevel import dmatrices
import pandas as pd

# Load the white-wine data and build design matrices for the quality model.
# Q("...") quotes column names that contain spaces.
df = pd.read_csv('winequality-white.csv')
formula = "quality ~ density + pH + alcohol + sulphates + chlorides + Q('residual sugar') + Q('fixed acidity')" \
          " + Q('citric acid')" \
          " + Q('volatile acidity')" \
          " + Q('free sulfur dioxide')"
y, x = dmatrices(formula, data=df, return_type='dataframe')
# Recombine predictors and response, drop the patsy intercept column, and
# min-max normalise every column into [0, 1].
z = x.join(y)
z = z.drop('Intercept', axis=1)
z = (z-z.min())/(z.max()-z.min())


def predict(row, columns, coefficients):
    """Linear prediction: intercept (coefficients[0]) plus weighted features."""
    yhat = coefficients[0]
    for i in range(len(columns)):
        yhat += coefficients[i + 1] * row[columns[i]]
    return yhat


def predict_test(row, target, columns, coefficients):
    """Return (prediction, expected, difference) for one labelled row."""
    p = predict(row, columns, coefficients)
    e = row[target]
    d = p - e
    return p, e, d


def print_prediction(r):
    # NOTE(review): body appears truncated in this chunk — as written it only
    # unpacks r and returns None; confirm against the full file.
    p, e, d = r
lambda s: s.str.replace( ',', '')).astype(float) # Read the results into the local authority dataframe df_las = df_las.join(results_2014.loc[:, ('Yes', 'No')], how='left') # Write the form of the regression for the Yes vote and the No vote - i.e. Yes ~ Q("All people") + Q("16 to 19") + etc expr = 'Q("' + ('") + Q("').join( list(df_las.columns[~df_las.columns.isin(['Yes', 'No'])])) + '")' yes_expr = 'Yes ~ ' + expr no_expr = 'No ~ ' + expr # Run the regression y_train_yes, X_train_yes = dmatrices(yes_expr, df_las, return_type='dataframe') poisson_training_results_yes = sm.GLM(y_train_yes, X_train_yes, family=sm.families.Poisson()).fit() y_train_no, X_train_no = dmatrices(no_expr, df_las, return_type='dataframe') poisson_training_results_no = sm.GLM(y_train_no, X_train_no, family=sm.families.Poisson()).fit() # Evaluate the regression print(poisson_training_results_yes.summary()) print(poisson_training_results_no.summary()) # Then use the model to predict results for Intersections
def from_formula(cls, formula, data, *, sigma=None, weights=None):
    """
    Create a SUR model from a formula string or a dictionary of formulas.

    Parameters
    ----------
    formula : {str, dict-like}
        Either a string or a dictionary of strings where each value in
        the dictionary represents a single equation. See Notes for a
        description of the accepted syntax
    data : DataFrame
        Frame containing named variables
    sigma : array-like
        Pre-specified residual covariance to use in GLS estimation. If
        not provided, FGLS is implemented based on an estimate of sigma.
    weights : dict-like
        Dictionary like object (e.g. a DataFrame) containing variable
        weights. Each entry must have the same number of observations
        as data. If an equation label is not a key weights, the weights
        will be set to unity

    Returns
    -------
    model : SUR
        Model instance

    Notes
    -----
    Models can be specified in one of two ways. The first uses curly
    braces to encapsulate equations. The second uses a dictionary where
    each key is an equation name.

    Examples
    --------
    The simplest format uses standard Patsy formulas for each equation
    in a dictionary. Best practice is to use an Ordered Dictionary

    >>> import pandas as pd
    >>> import numpy as np
    >>> data = pd.DataFrame(np.random.randn(500, 4),
    ...                     columns=['y1', 'x1_1', 'y2', 'x2_1'])
    >>> from linearmodels.system import SUR
    >>> formula = {'eq1': 'y1 ~ 1 + x1_1', 'eq2': 'y2 ~ 1 + x2_1'}
    >>> mod = SUR.from_formula(formula, data)

    The second format uses curly braces {} to surround distinct equations

    >>> formula = '{y1 ~ 1 + x1_1} {y2 ~ 1 + x2_1}'
    >>> mod = SUR.from_formula(formula, data)

    It is also possible to include equation labels when using curly braces

    >>> formula = '{eq1: y1 ~ 1 + x1_1} {eq2: y2 ~ 1 + x2_1}'
    >>> mod = SUR.from_formula(formula, data)
    """
    # Missing values raise rather than being silently dropped.
    na_action = NAAction(on_NA='raise', NA_types=[])
    if not isinstance(formula, (Mapping, str)):
        raise TypeError('formula must be a string or dictionary-like')
    missing_weight_keys = []
    eqns = OrderedDict()
    # Dictionary-of-formulas form: one equation per key; early return below.
    if isinstance(formula, Mapping):
        for key in formula:
            f = formula[key]
            # Rewrite 'y ~ rhs' as 'y ~ 0 + rhs' so patsy does not add an
            # automatic constant on top of any explicit one.
            f = '~ 0 +'.join(f.split('~'))
            dep, exog = dmatrices(f, data, return_type='dataframe',
                                  NA_action=na_action)
            eqns[key] = {'dependent': dep, 'exog': exog}
            if weights is not None:
                if key in weights:
                    eqns[key]['weights'] = weights[key]
                else:
                    missing_weight_keys.append(key)
        _missing_weights(missing_weight_keys)
        return SUR(eqns, sigma=sigma)
    # Curly-brace form: '{label: y ~ x} {y2 ~ x2}' ...
    formula = formula.replace('\n', ' ').strip()
    parts = formula.split('}')
    for i, part in enumerate(parts):
        base_key = None
        part = part.strip()
        if part == '':
            continue
        part = part.replace('{', '')
        # An optional 'label:' prefix (before the ~) names the equation.
        if ':' in part.split('~')[0]:
            base_key, part = part.split(':')
            key = base_key = base_key.strip()
            part = part.strip()
        f = '~ 0 +'.join(part.split('~'))
        dep, exog = dmatrices(f, data, return_type='dataframe',
                              NA_action=na_action)
        if base_key is None:
            # Unlabelled equations are keyed by the dependent variable name.
            base_key = key = f.split('~')[0].strip()
        count = 0
        # Disambiguate duplicate equation keys with a '.N' suffix.
        while key in eqns:
            key = base_key + '.{0}'.format(count)
            count += 1
        eqns[key] = {'dependent': dep, 'exog': exog}
        if weights is not None:
            if key in weights:
                eqns[key]['weights'] = weights[key]
            else:
                missing_weight_keys.append(key)
    _missing_weights(missing_weight_keys)
    return SUR(eqns, sigma=sigma)
def parse_formula(formula, data):
    """Parse an IV regression formula into (dep, exog, endog, instr) matrices.

    A single '~' yields a plain OLS specification (endog and instr are None).
    Two '~' split the formula as 'dep ~ exog [endog ~ instr] exog2', where the
    bracketed block segregates endogenous variables from their instruments.

    :raises ValueError: on malformed formulas
    """
    # Missing values raise rather than being silently dropped.
    na_action = NAAction(on_NA='raise', NA_types=[])
    if formula.count('~') == 1:
        dep, exog = dmatrices(formula, data, return_type='dataframe',
                              NA_action=na_action)
        endog = instr = None
        return dep, exog, endog, instr
    elif formula.count('~') > 2:
        raise ValueError('formula not understood. Must have 1 or 2 '
                         'occurrences of ~')

    blocks = [bl.strip() for bl in formula.strip().split('~')]
    if '[' not in blocks[1] or ']' not in blocks[2]:
        raise ValueError('formula not understood. Endogenous variables and '
                         'instruments must be segregated in a block that '
                         'starts with [ and ends with ].')
    dep = blocks[0].strip()
    exog, endog = [bl.strip() for bl in blocks[1].split('[')]
    instr, exog2 = [bl.strip() for bl in blocks[2].split(']')]
    # BUG FIX: check the LAST character (the original tested index 1, i.e. the
    # second character), matching the "must not start or end with +" messages.
    if endog[0] == '+' or endog[-1] == '+':
        raise ValueError(
            'endogenous block must not start or end with +. This block was: {0}'
            .format(endog))
    if instr[0] == '+' or instr[-1] == '+':
        raise ValueError(
            'instrument block must not start or end with +. This block was: {0}'
            .format(instr))
    if exog2:
        exog += exog2
    # Drop a trailing '+' left over when the [..] block ended the exog list.
    exog = exog[:-1].strip() if exog[-1] == '+' else exog

    try:
        # '0 +' suppresses patsy's automatic intercept in each sub-matrix.
        dep = dmatrix('0 + ' + dep, data, eval_env=2,
                      return_type='dataframe', NA_action=na_action)
        exog = dmatrix('0 + ' + exog, data, eval_env=2,
                       return_type='dataframe', NA_action=na_action)
        endog = dmatrix('0 + ' + endog, data, eval_env=2,
                        return_type='dataframe', NA_action=na_action)
        instr = dmatrix('0 + ' + instr, data, eval_env=2,
                        return_type='dataframe', NA_action=na_action)
    except Exception as e:
        raise type(e)(PARSING_ERROR.format(dep, exog, endog, instr) + e.msg,
                      e.args[1])
    return dep, exog, endog, instr