def dict_to_model_desc(dictionary):
    """Build a patsy ModelDesc object from its dictionary representation."""
    lhs_termlist = [Term([LookupFactor(dictionary['lhs_termlist'][0])])]
    rhs_termlist = []
    for name in dictionary['rhs_termlist']:
        if name == '':
            rhs_termlist.append(Term([]))  # an empty Term is the intercept
        else:
            rhs_termlist.append(Term([LookupFactor(name)]))
    return ModelDesc(lhs_termlist, rhs_termlist)
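# Usage sketch for dict_to_model_desc with hypothetical column names;
# '' in 'rhs_termlist' denotes the intercept term.
d = {'lhs_termlist': ['y'], 'rhs_termlist': ['', 'x1', 'x2']}
desc = dict_to_model_desc(d)
print(desc.describe())  # y ~ x1 + x2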
def _modeldesc_from_dict(self, d):
    """Build a patsy ModelDesc object from its dictionary representation."""
    lhs_termlist = [Term([LookupFactor(d['lhs_termlist'][0])])]
    rhs_termlist = []
    for name in d['rhs_termlist']:
        if name == '':
            rhs_termlist.append(Term([]))  # an empty Term is the intercept
        else:
            rhs_termlist.append(Term([LookupFactor(name)]))
    md = ModelDesc(lhs_termlist, rhs_termlist)
    return md
def _prune(self, fit, p_max):
    """
    Remove statistically insignificant parameters from a fit.

    Refits until every remaining parameter has a p-value of the
    t-statistic below p_max.

    Parameters
    ----------
    fit : fm.ols fit object
        May contain insignificant parameters.
    p_max : float
        Maximum allowed probability of the t-statistic.

    Returns
    -------
    fit : fm.ols fit object
        Contains no insignificant parameters.
    """
    model_desc = ModelDesc(fit.model.formula.lhs_termlist[:],
                           fit.model.formula.rhs_termlist[:])
    to_prune = fit.pvalues.where(
        fit.pvalues > p_max).dropna().index.tolist()
    if 'Intercept' in to_prune:
        to_prune.remove('Intercept')  # never prune the intercept
    while to_prune:
        model_desc.rhs_termlist.remove(Term([LookupFactor(to_prune[0])]))
        fit = fm.ols(model_desc, data=self.data_frame).fit()
        to_prune = fit.pvalues.where(
            fit.pvalues > p_max).dropna().index.tolist()
        if 'Intercept' in to_prune:
            to_prune.remove('Intercept')
    return fit
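# A minimal standalone sketch of the same pruning loop, outside the class,
# assuming statsmodels' formula API is imported as fm. The column names
# are hypothetical; 'x2' is pure noise and should be pruned away.
import numpy as np
import pandas as pd
import statsmodels.formula.api as fm
from patsy import LookupFactor, ModelDesc, Term

rng = np.random.default_rng(0)
df = pd.DataFrame({'x1': rng.normal(size=50), 'x2': rng.normal(size=50)})
df['y'] = 2 * df['x1'] + rng.normal(size=50)

desc = ModelDesc([Term([LookupFactor('y')])],
                 [Term([]), Term([LookupFactor('x1')]),
                  Term([LookupFactor('x2')])])
fit = fm.ols(desc, data=df).fit()
p_max = 0.05
to_prune = fit.pvalues.where(fit.pvalues > p_max).dropna().index.tolist()
to_prune = [name for name in to_prune if name != 'Intercept']
while to_prune:
    desc.rhs_termlist.remove(Term([LookupFactor(to_prune[0])]))
    fit = fm.ols(desc, data=df).fit()
    to_prune = fit.pvalues.where(fit.pvalues > p_max).dropna().index.tolist()
    to_prune = [name for name in to_prune if name != 'Intercept']
print(fit.model.formula.describe())  # y ~ x1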
def add_predictors(base_formula, extra_predictors):
    desc = ModelDesc.from_formula(base_formula)
    # Using LookupFactor here ensures that everything will work correctly even
    # if one of the column names in extra_predictors is named like "weight.in.kg"
    # or "sys.exit()" or "LittleBobbyTables()".
    desc.rhs_termlist += [Term([LookupFactor(p)]) for p in extra_predictors]
    return desc
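# Usage sketch: extend a base formula with column names that formula
# syntax could not parse (hypothetical names).
desc = add_predictors("y ~ x", ["weight.in.kg", "sys.exit()"])
print(desc.describe())  # y ~ x + weight.in.kg + sys.exit()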
def _do_analysis_no_cross_validation(self):
    """
    Find the best model (fit) and create self.list_of_fits and self.fit
    """
    # The first model is just the mean.
    response_term = [Term([LookupFactor(self.y)])]
    model_terms = [Term([])]  # an empty term is the intercept
    all_model_terms_dict = {
        x: Term([LookupFactor(x)]) for x in self.list_of_x
    }
    # ...then add another term for each candidate
    # model_terms += [Term([LookupFactor(c)]) for c in candidates]
    model_desc = ModelDesc(response_term, model_terms)
    self._list_of_fits.append(fm.ols(model_desc, data=self.df).fit())
    # Try to improve the model until no improvement can be found.
    while all_model_terms_dict:
        # Try each x and overwrite best_fit if we find a better one.
        # The first best_fit is the one from the previous round.
        ref_fit = self._list_of_fits[-1]
        best_fit = self._list_of_fits[-1]
        best_bic = best_fit.bic
        for x, term in all_model_terms_dict.items():
            # Make a new fit and compare it with the best found so far.
            model_desc = ModelDesc(
                response_term,
                ref_fit.model.formula.rhs_termlist + [term])
            fit = fm.ols(model_desc, data=self.df).fit()
            if fit.bic < best_bic:
                best_bic = fit.bic
                best_fit = fit
                best_x = x
        # Sometimes the obtained fit is better but contains insignificant
        # parameters. Correct the fit by removing them and estimating again.
        best_fit = self._prune(best_fit, p_max=self.p_max)
        # If best_fit does not contain more variables than ref_fit, exit.
        if len(best_fit.model.formula.rhs_termlist) == len(
                ref_fit.model.formula.rhs_termlist):
            break
        else:
            self._list_of_fits.append(best_fit)
            all_model_terms_dict.pop(best_x)
    self._fit = self._list_of_fits[-1]
def build_model_desc(snps, no_interactions):
    """
    Creates the model description (formula)

    :param snps: The selected SNP labels
    :param no_interactions: If True, pairwise interactions are excluded
        from the model
    :return: The model description
    """
    x_terms = []
    for i in range(len(snps)):
        # Main effects
        snp_i = EvalFactor(snps[i])
        x_terms.append(Term([snp_i]))
        if not no_interactions:
            for j in range(i + 1, len(snps)):
                # Interaction effects
                snp_j = EvalFactor(snps[j])
                x_terms.append(Term([snp_i, snp_j]))
    return ModelDesc([], x_terms)
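# Usage sketch with three hypothetical SNP labels; with interactions
# enabled, each pair contributes a product term. No intercept term is
# added, so describe() prepends "0 +".
desc = build_model_desc(['snp1', 'snp2', 'snp3'], no_interactions=False)
print(desc.describe())
# ~ 0 + snp1 + snp1:snp2 + snp1:snp3 + snp2 + snp2:snp3 + snp3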
def create_patsy_model(dependent_variable, independent_variables,
                       transformations={}, interactions=[]):
    ''' Construct and return patsy formula (object representation) '''
    # 1) Handle passing in [{'name': X}] vs [X]
    lhs_var = dependent_variable
    rhs_vars = independent_variables
    if 'name' in dependent_variable:
        lhs_var = dependent_variable['name']
    if 'name' in independent_variables[0]:
        new_rhs_vars = []
        for iv in independent_variables:
            if type(iv) is list:
                new_rhs_vars.append([x['name'] for x in iv])
            else:
                if 'name' in iv:
                    new_rhs_vars.append(iv['name'])
                else:
                    new_rhs_vars.append(iv)
        rhs_vars = new_rhs_vars
    if interactions:
        first_interaction = interactions[0]
        if 'name' in first_interaction:
            new_interactions = []
            for interaction in interactions:
                new_interactions.append([term['name'] for term in interaction])
            rhs_interactions = new_interactions
        else:
            rhs_interactions = interactions

    # 2) Construct the model
    lhs = [Term([LookupFactor(lhs_var)])]
    rhs = [Term([])]
    for rhs_var in rhs_vars:
        if type(rhs_var) is list:
            rhs += [Term([LookupFactor(term) for term in rhs_var])]
        else:
            if rhs_var in transformations:
                transformation = transformations[rhs_var]
                if transformation == 'square':
                    # Keep the linear term alongside the squared term
                    rhs += [Term([LookupFactor(rhs_var)])]
                format_string = transformation_to_format_string[transformation]
                rhs += [Term([EvalFactor(format_string.format(rhs_var))])]
            else:
                rhs += [Term([LookupFactor(rhs_var)])]
    if interactions:
        rhs += [Term([LookupFactor(term) for term in interaction])
                for interaction in rhs_interactions]

    model = ModelDesc(lhs, rhs)
    return model
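# Usage sketch: create_patsy_model accepts both plain names and
# {'name': ...} dicts; interactions are given as lists of plain names.
# The variable names are hypothetical.
desc = create_patsy_model({'name': 'price'},
                          [{'name': 'size'}, {'name': 'age'}],
                          interactions=[['size', 'age']])
print(desc.describe())  # price ~ size + age + size:age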
def _do_analysis_cross_validation(self):
    """
    Find the best model (fit) based on cross-validation (leave-one-out)
    """
    assert len(self.data_frame) >= 15, "Minimum 15 datapoints"
    # Initialization: the first model is the mean, but compute the
    # cross-validation correctly.
    errors = []
    response_term = [Term([LookupFactor(self.dependent_var)])]
    model_desc = ModelDesc(response_term, [Term([])])
    for i in self.data_frame.index:
        # Make a new fit, compute the cross-validation and store the error.
        data_frame_ = self.data_frame.drop(i, axis=0)
        fit = fm.ols(model_desc, data=data_frame_).fit()
        cross_prediction = self._predict(
            fit=fit, data_frame=self.data_frame.loc[[i], :])
        errors.append(cross_prediction['predicted'] -
                      cross_prediction[self.dependent_var])
    self._list_of_fits = [fm.ols(model_desc, data=self.data_frame).fit()]
    self.list_of_cverrors = [np.mean(np.abs(np.array(errors)))]
    # Try to improve the model until no improvement can be found.
    all_model_terms_dict = {
        x: Term([LookupFactor(x)]) for x in self.list_of_x
    }
    while all_model_terms_dict:
        # Try each x in all_exog and overwrite if we find a better one.
        # At the end of the iteration (and not earlier), save the best
        # of the iteration.
        better_model_found = False
        best = {
            "fit": self._list_of_fits[-1],
            "cverror": self.list_of_cverrors[-1],
        }
        for value, term in all_model_terms_dict.items():
            model_desc = ModelDesc(
                response_term,
                self._list_of_fits[-1].model.formula.rhs_termlist + [term])
            # Cross-validation, currently only implemented for monthly data:
            # compute the mean error for a given formula based on
            # leave-one-out.
            errors = []
            for i in self.data_frame.index:
                # Make a new fit, compute the cross-validation and store
                # the error.
                data_frame_ = self.data_frame.drop(i, axis=0)
                fit = fm.ols(model_desc, data=data_frame_).fit()
                cross_prediction = self._predict(
                    fit=fit, data_frame=self.data_frame.loc[[i], :])
                errors.append(cross_prediction['predicted'] -
                              cross_prediction[self.dependent_var])
            cverror = np.mean(np.abs(np.array(errors)))
            # Compare the model with the current best fit.
            if cverror < best['cverror']:
                # Better model: keep it.
                # First, re-identify using all the datapoints.
                best['fit'] = fm.ols(model_desc, data=self.data_frame).fit()
                best['cverror'] = cverror
                better_model_found = True
                best_val = value
        if better_model_found:
            self._list_of_fits.append(best['fit'])
            self.list_of_cverrors.append(best['cverror'])
        else:
            # If we did not find a better model, exit.
            break
        # Next iteration with the found exog removed.
        all_model_terms_dict.pop(best_val)
    self._fit = self._list_of_fits[-1]
execfile("code/03-DataPrep.py") from patsy import dmatrices, ModelDesc, Term, LookupFactor from sklearn.metrics import roc_curve from sklearn.model_selection import cross_val_score from sklearn import linear_model from sklearn.feature_selection import SelectFromModel import numpy as np ''' model 1 - logistic regression with L1 regularization ''' formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in orgfeatures]) y, x = dmatrices(formula, rawdf, return_type="dataframe") y = y.values.flatten() logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', tol=0.01) logreg.fit(x, y) scores = cross_val_score(logreg, x, y, cv=5) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()}) nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist() print(len(nflist)) # feature selection using best model from cross validation and get the best features fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear') fslogreg.fit(x, y)
def _group_model(spreadsheet=None, contrastdicts=None, variabledicts=None,
                 subjects=None):
    rawdataframe = loadspreadsheet(spreadsheet)

    id_column = None
    for variabledict in variabledicts:
        if variabledict["type"] == "id":
            id_column = variabledict["name"]
            break
    assert id_column is not None, "Missing id column, cannot specify model"

    rawdataframe[id_column] = pd.Series(rawdataframe[id_column], dtype=str)
    if all(str(id).startswith("sub-") for id in rawdataframe[id_column]):
        # for bids
        rawdataframe[id_column] = [
            str(id).replace("sub-", "") for id in rawdataframe[id_column]
        ]
    rawdataframe = rawdataframe.set_index(id_column)

    continuous_columns = []
    categorical_columns = []
    columns_in_order = []
    for variabledict in variabledicts:
        if variabledict["type"] == "continuous":
            continuous_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])
        elif variabledict["type"] == "categorical":
            categorical_columns.append(variabledict["name"])
            columns_in_order.append(variabledict["name"])

    # separate
    continuous = rawdataframe[continuous_columns]
    categorical = rawdataframe[categorical_columns]

    # only keep subjects that are in this analysis
    # also sets order
    continuous = continuous.loc[subjects, :]
    categorical = categorical.loc[subjects, :]

    # demean continuous for flameo
    continuous -= continuous.mean()

    # replace np.nan by 0 for demeaned_continuous file and regression models
    continuous = continuous.replace({np.nan: 0})

    # change type first to string then to category
    categorical = categorical.astype(str)
    categorical = categorical.astype("category")

    # merge
    dataframe = categorical.join(continuous, how="outer").loc[subjects, :]

    # maintain order
    dataframe = dataframe[columns_in_order]

    # remove zero variance columns
    columns_var_gt_0 = dataframe.apply(pd.Series.nunique) > 1
    dataframe = dataframe.loc[:, columns_var_gt_0]

    # don't need to specify lhs
    lhs = []

    # generate rhs
    rhs = [Term([])]  # force intercept
    for contrastdict in contrastdicts:
        if contrastdict["type"] == "infer":
            # for every term in the model a contrast of type infer needs
            # to be specified
            rhs.append(
                Term([LookupFactor(name) for name in contrastdict["variable"]]))

    # specify patsy design matrix
    modelDesc = ModelDesc(lhs, rhs)
    dmat = dmatrix(modelDesc, dataframe, return_type="dataframe")
    _check_multicollinearity(dmat)

    # prepare lsmeans
    uniqueValuesForCategorical = [
        (0.0,) if pd.api.types.is_numeric_dtype(dataframe[f].dtype)
        else dataframe[f].unique()
        for f in dataframe.columns
    ]
    grid = pd.DataFrame(list(product(*uniqueValuesForCategorical)),
                        columns=dataframe.columns)
    refDmat = dmatrix(dmat.design_info, grid, return_type="dataframe")

    # data frame to store contrasts
    contrastMats = []

    for field, columnslice in dmat.design_info.term_name_slices.items():
        constraint = {
            column: 0
            for column in dmat.design_info.column_names[columnslice]
        }
        contrast = dmat.design_info.linear_constraint(constraint)
        assert np.all(contrast.variable_names == dmat.columns)
        contrastMat = pd.DataFrame(contrast.coefs, columns=dmat.columns)
        contrastMats.append((field, contrastMat))

    for contrastdict in contrastdicts:
        if contrastdict["type"] == "t":
            (variable,) = contrastdict["variable"]
            variableLevels = dataframe[variable].unique()
            # Generate the lsmeans matrix where there is one row for each
            # factor level. Each row is a contrast vector.
            # This contrast vector corresponds to the mean of the dependent
            # variable at the factor level.
            # For example, we would have one row that calculates the mean
            # for patients, and one for controls.
            lsmeans = pd.DataFrame(index=variableLevels, columns=dmat.columns)
            for level in variableLevels:
                lsmeans.loc[level, :] = refDmat.loc[
                    grid[variable] == level, :].mean()

            valueDict = contrastdict["values"]
            names = [name for name in valueDict.keys()
                     if name in variableLevels]
            values = [valueDict[name] for name in names]

            # If we wish to test the mean of each group against zero,
            # we can simply use these contrasts and be done.
            # To test a linear hypothesis such as patient-control=0,
            # which is expressed here as {"patient":1, "control":-1},
            # we translate it to a contrast vector by taking the linear
            # combination of the lsmeans contrasts.
            contrastVector = lsmeans.loc[names, :].mul(values, axis=0).sum()
            contrastMat = pd.DataFrame([contrastVector], columns=dmat.columns)
            contrastMats.append((contrastdict["name"], contrastMat))

    npts, nevs = dmat.shape
    if nevs >= npts:
        logger.warning("Reverting to simple intercept only design. \n"
                       f"nevs ({nevs}) >= npts ({npts})")
        return (
            {"intercept": [1.0] * len(subjects)},
            [["mean", "T", ["intercept"], [1]]],
            ["mean"],
        )

    regressors = {d: dmat[d].tolist() for d in dmat.columns}

    contrasts = []
    contrast_names = []

    for contrastName, contrastMat in contrastMats:  # t contrasts
        if contrastMat.shape[0] == 1:
            contrastVec = contrastMat.squeeze()
            contrasts.append((contrastName, "T", list(contrastVec.keys()),
                              list(contrastVec)))
            contrast_names.append(contrastName)

    for contrastName, contrastMat in contrastMats:  # f contrasts
        if contrastMat.shape[0] > 1:
            tcontrasts = []  # an f contrast consists of multiple t contrasts
            for i, contrastVec in contrastMat.iterrows():
                tname = f"{contrastName}_{i:d}"
                tcontrasts.append((tname, "T", list(contrastVec.keys()),
                                   list(contrastVec)))
            contrasts.extend(tcontrasts)  # add t contrasts to the model
            contrasts.append((contrastName, "F", tcontrasts))  # then add the f contrast
            contrast_names.append(contrastName)  # we only care about the f contrast

    return regressors, contrasts, contrast_names
    chist = chist[Yhist_gpvars + ['HISTBIN', wgtvar]].groupby(
        Yhist_gpvars + ['HISTBIN'], as_index=False).aggregate(np.sum)
    return chist


if __name__ == "__main__":
    ## Build the regression formula
    catvars = list(cfg.flevels.keys())
    with open("fpaths.json") as fpj:
        FPATHS = json.load(fpj)
    numvar_evals = ["I(YEAR - 2000)", "INCTOT99"]
    catvar_evals = [
        "C(" + cv + ", Treatment, levels=cfg.flevels['" + cv + "'])"
        for cv in catvars
    ]
    desc = ModelDesc([], [Term([EvalFactor(v)]) for v in numvar_evals])
    desc.rhs_termlist += [Term([EvalFactor(v)]) for v in catvar_evals]

    # Interactions
    interact_order = 2
    catvar_interact = ['SEX', 'AGECAT', 'RACE']
    print("Including all order-" + str(interact_order) +
          " interactions of the following variables:\n\t" +
          ", ".join(catvar_interact + numvar_evals))
    interact_evals = numvar_evals + [
        catvar_evals[i] for i in [catvars.index(v) for v in catvar_interact]
    ]
    desc.rhs_termlist += [
        Term([EvalFactor(v) for v in list(comb)])
        for comb in combinations(interact_evals, interact_order)
    ]

    # 'implied decimals'
def scatterfit(x, y, method='pearson', adjustVars=[], labelLookup={},
               plotLine=True, annotateFit=True, annotatePoints=False,
               returnModel=False, lc='gray', **kwargs):
    """Scatter plot of x vs. y with a fitted line overlaid.

    Expects x and y as pd.Series but will accept arrays.

    Prints covariate unadjusted AND adjusted rho/pvalues on the figure.
    Plots covariate unadjusted data.

    Parameters
    ----------
    x, y : ndarrays or pd.Series
    method : string
        'pearson'
    adjustVars : list
    labelLookup : dict
    plotLine : bool
    annotateFit : bool
    annotatePoints : bool
    returnModel : bool
    kwargs : additional keyword arguments
        Passed to the plot function for the data points.

    Returns
    -------
    model : statsmodels GLM object
        Optionally the fitted model, depending on returnModel."""

    k = kwargs.keys()
    if 'mec' not in k:
        kwargs.update({'mec': 'k'})
    if 'mfc' not in k:
        kwargs.update({'mfc': 'k'})
    if 'ms' not in k:
        kwargs.update({'ms': 5})

    # Try to force x and y into pandas.Series objects.
    if not isinstance(x, pd.core.series.Series):
        x = pd.Series(x, name='X')
    if not isinstance(y, pd.core.series.Series):
        y = pd.Series(y, name='Y')

    xlab = x.name
    ylab = y.name
    if xlab == ylab:
        ylab = 'y_' + ylab
        xlab = 'x_' + xlab
        x.name = xlab
        y.name = ylab

    tmpDf = pd.concat((x, y), axis=1, join='inner')
    for av in adjustVars:
        tmpDf = pd.concat((tmpDf, pd.DataFrame(av)), axis=1)

    # Drop any row with a nan in either column.
    tmpDf = tmpDf.dropna(axis=0, how='any')

    plt.gca().set_xmargin(0.2)
    plt.gca().set_ymargin(0.2)

    unrho, unp = partialcorr(tmpDf[xlab], tmpDf[ylab], method=method)

    # Print unadjusted AND adjusted rho/pvalues;
    # plot unadjusted data with fit, though.
    model = None  # only fitted in the pearson branch below
    if method == 'spearman' and plotLine:
        if unrho > 0:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab]), '-', color=lc)
        else:
            plt.plot(sorted(tmpDf[xlab]), sorted(tmpDf[ylab], reverse=True),
                     '-', color=lc)
    elif method == 'pearson' and plotLine:
        formula_like = ModelDesc([Term([LookupFactor(ylab)])],
                                 [Term([]), Term([LookupFactor(xlab)])])
        Y, X = dmatrices(formula_like, data=tmpDf, return_type='dataframe')
        model = sm.GLM(Y, X, family=sm.families.Gaussian())
        results = model.fit()
        mnmxi = np.array([tmpDf[xlab].idxmin(), tmpDf[xlab].idxmax()])
        plt.plot(tmpDf[xlab][mnmxi], results.fittedvalues[mnmxi], '-',
                 color=lc)

    plt.plot(tmpDf[xlab], tmpDf[ylab], 'o', **kwargs)

    if annotatePoints:
        annotationParams = dict(xytext=(0, 5), textcoords='offset points',
                                size='medium')
        # xx/yy avoid shadowing the x/y arguments
        for xx, yy, lab in zip(tmpDf[xlab], tmpDf[ylab], tmpDf.index):
            plt.annotate(lab, xy=(xx, yy), **annotationParams)

    if annotateFit:
        if unp > 0.001:
            s = 'p = %1.3f\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        else:
            s = 'p = %1.1e\nrho = %1.2f\nn = %d' % (unp, unrho, tmpDf.shape[0])
        textTL(plt.gca(), s, color='black')

        if len(adjustVars) > 0:
            rho, p = partialcorr(tmpDf[xlab], tmpDf[ylab], adjust=adjustVars,
                                 method=method)
            if p > 0.001:
                s = 'adj-p = %1.3f\nadj-rho = %1.2f\nn = %d' % (
                    p, rho, tmpDf.shape[0])
            else:
                s = 'adj-p = %1.1e\nadj-rho = %1.2f\nn = %d' % (
                    p, rho, tmpDf.shape[0])
            textTR(plt.gca(), s, color='red')

    plt.xlabel(labelLookup.get(xlab, xlab))
    plt.ylabel(labelLookup.get(ylab, ylab))

    if returnModel:
        return model
def _epoch_spans(recspan_intern_table, data_set, rerp_specs, eval_env):
    rerp_infos = []
    rerp_names = set()
    spans = []
    design_offset = 0
    expanded_design_offset = 0
    data_format = data_set.data_format
    for rerp_idx, rerp_spec in enumerate(rerp_specs):
        start_offset = data_format.ms_to_samples(rerp_spec.start_time)
        # Offsets are half open: [start, stop)
        # But, it's more intuitive for times to be closed: [start, stop]
        # So we interpret the user times as a closed interval, and add 1
        # sample when converting to offsets.
        stop_offset = 1 + data_format.ms_to_samples(rerp_spec.stop_time)
        if start_offset >= stop_offset:
            raise ValueError("Epochs must be >1 sample long!")
        event_set = data_set.events.find(rerp_spec.event_query)
        # Tricky bit: the user specifies a RHS-only formula, but really we
        # have an implicit LHS (determined by the event_query). This makes
        # things complicated when it comes to e.g. keeping track of which
        # items survived NA removal, determining the number of rows in an
        # intercept-only formula, etc. Really we want patsy to just treat
        # all this stuff the same way as it normally handles a LHS~RHS
        # formula. So, we convert our RHS formula into a LHS~RHS formula,
        # using a special LHS that represents each event by a placeholder
        # integer!
        desc = ModelDesc.from_formula(rerp_spec.formula, eval_env)
        if desc.lhs_termlist:
            raise ValueError("Formula cannot have a left-hand side")
        desc.lhs_termlist = [Term([_ArangeFactor(len(event_set))])]
        fake_lhs, design = dmatrices(desc, event_set)
        surviving_event_idxes = np.asarray(fake_lhs, dtype=int).ravel()
        design_row_idxes = np.empty(len(event_set), dtype=int)
        design_row_idxes.fill(-1)
        design_row_idxes[surviving_event_idxes] = np.arange(design.shape[0])
        # Now design_row_idxes[i] is -1 if event i was thrown out, and
        # otherwise gives the row in 'design' which refers to event 'i'.
        for i in range(len(event_set)):
            event = event_set[i]
            # -1 for non-existent
            design_row_idx = design_row_idxes[i]
            recspan = (event.recording, event.span_id)
            recspan_intern = recspan_intern_table[recspan]
            epoch_start = start_offset + event.start_idx
            epoch_stop = stop_offset + event.start_idx
            if design_row_idx == -1:
                design_row = None
            else:
                design_row = design[design_row_idx, :]
            epoch = _Epoch(epoch_start, epoch_stop - epoch_start,
                           design_row, design_offset,
                           expanded_design_offset, rerp_idx, [])
            if design_row is None:
                # Event thrown out due to missing predictors; this
                # makes its whole epoch into an artifact -- but if overlap
                # correction is disabled, then this artifact only affects
                # this epoch, not anything else. (We still want to treat
                # it as an artifact though so we get proper accounting at
                # the end.)
                epoch.intrinsic_artifacts.append("_MISSING_PREDICTOR")
            spans.append(DataSpan((recspan_intern, epoch_start),
                                  (recspan_intern, epoch_stop),
                                  epoch, None))
        if rerp_spec.name in rerp_names:
            raise ValueError("name %r used for two different sub-analyses"
                             % (rerp_spec.name,))
        rerp_names.add(rerp_spec.name)
        rerp_info = {
            "spec": rerp_spec,
            "design_info": design.design_info,
            "start_offset": start_offset,
            "stop_offset": stop_offset,
            "design_offset": design_offset,
            "expanded_design_offset": expanded_design_offset,
            "total_epochs": len(event_set),
            "epochs_with_data": 0,
            "epochs_with_artifacts": 0,
        }
        rerp_infos.append(rerp_info)
        design_offset += design.shape[1]
        epoch_samples = stop_offset - start_offset
        expanded_design_offset += epoch_samples * design.shape[1]
    return rerp_infos, spans, design_offset, expanded_design_offset
import os

import pandas as pd
from patsy import ModelDesc, Term, dmatrix

import common
import extensions.patsy

datafile = os.path.join(common.data_dir, "child.iq", 'kidiq.csv')
kw = {
    'dtype': {
        'mom_hs': 'category',
        'mom_work': 'category',
    }
}
df = pd.read_csv(datafile, **kw)

"""
Create a ModelDesc object capable of displaying readable column names.

Compare with:
Xu = dmatrix(" ~ mom_hs + C(mom_work,extensions.patsy.FullRankOneHot)",
             data=df, return_type='dataframe')
"""
factors = [
    extensions.patsy.LookupFactor('mom_hs'),
    extensions.patsy.EvalFactorRenamed(
        'C(mom_work,extensions.patsy.FullRankOneHot)').set_name('mom_work'),
]
terms = [Term([])] + [Term([f]) for f in factors]
desc = ModelDesc([], terms)
X = dmatrix(desc, df, return_type='dataframe')
def redraw(self):
    variables = []
    if self.includeallcheckBox.isChecked():
        for i in range(self.interactionlistWidget.count()):
            variables.append(self.interactionlistWidget.item(i).text())
    else:
        for i in range(self.selectedlistWidget.count()):
            variables.append(self.selectedlistWidget.item(i).text())
    nX = len(variables)
    if nX < 1:
        QtWidgets.QMessageBox.critical(self, 'Error',
                                       "Too few variables selected!",
                                       QtWidgets.QMessageBox.Ok)
        return ()
    Yname = self.YcomboBox.currentText()
    Lc = DS.Lc[DS.Ic]
    Gc = DS.Gc[DS.Ic]
    Lcy = Lc[Gc]
    Lcx = Lc[~Gc]
    data = DS.Raw.loc[DS.Ir, DS.Ic]
    Y = data[Lcy]
    X = data[Lcx]
    if nX > X.shape[0]:
        QtWidgets.QMessageBox.critical(self, 'Error',
                                       "Factors > Observation! \n Reduce factors.",
                                       QtWidgets.QMessageBox.Ok)
        return ()
    ny = self.YcomboBox.currentIndex()
    Y = Y.values.astype('float')
    X = X.values.astype('float')
    Y = Y[:, ny]
    nr = len(Y)
    basey = [Term([LookupFactor(Yname)])]
    basex = []
    for term in variables:
        if term == 'Intercept':
            basex = [INTERCEPT]
            variables.remove(term)
    for term in variables:
        vterm = term.split(':')
        term_lookup = [LookupFactor(x) for x in vterm]
        if len(term_lookup) > 1:
            if vterm[0] == vterm[1]:
                term_lookup = [EvalFactor(vterm[0] + ' ** 2')]
        basex.append(Term(term_lookup))
    desc = ModelDesc(basey, basex)
    data = np.column_stack((X, Y))
    columns = Lcx.tolist()
    columns.append(Yname)
    data = pd.DataFrame(data, columns=columns)
    y, mx = dmatrices(desc, data, return_type='dataframe')
    dism = np.linalg.inv(np.dot(mx.T.values, mx.values))
    mod = OLS(y, mx)
    DOE.res = mod.fit()
    # calculation of cross-validation
    ypcv = list()
    rcv = list()
    bres = list()
    loo = LeaveOneOut()
    loo.get_n_splits(mx)
    for train_index, test_index in loo.split(mx):
        mx_train = mx.iloc[train_index, :]
        mx_test = mx.iloc[test_index, :]
        y_train = y.iloc[train_index, :]
        y_test = y.iloc[test_index, :]
        modcv = OLS(y_train, mx_train)
        rescv = modcv.fit()
        ypcv.append(rescv.predict(mx_test).values[0])
        rcv.append(rescv.predict(mx_test).values[0] - y_test.values[0])
        bres.append((rescv.params - DOE.res.params).values ** 2)
    bres = pd.DataFrame(bres)
    bres = bres.sum() * nr / (nr - 1)
    bres = np.sqrt(bres.values)
    tres = np.abs(DOE.res.params.values / bres)
    # two-sided p-values from the t distribution (survival function, not pdf)
    pt = 2 * t.sf(tres, nr)
    fig = Figure()
    ax = fig.add_subplot(111)
    if self.coefradioButton.isChecked():
        if DOE.res.params.index[0] == 'Intercept':
            ind = np.arange(1, len(DOE.res.params))
            vcol = []
            for i in ind:
                if DOE.res.pvalues[i] < 0.05:
                    vcol.append('red')
                else:
                    vcol.append('blue')
            ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
            ax.set_title(
                'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                    DOE.res.conf_int().iloc[0, 0], DOE.res.params[0],
                    DOE.res.conf_int().iloc[0, 1]))
            ax.set_xticklabels(DOE.res.params.index[1:], rotation='vertical')
            cmin = DOE.res.params[1:] - DOE.res.conf_int().iloc[1:, 0]
            cmax = DOE.res.conf_int().iloc[1:, 1] - DOE.res.params[1:]
            ax.errorbar(ind, DOE.res.params[1:],
                        yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(DOE.res.params) + 1)
            ax.bar(ind, DOE.res.params, align='center')
            ax.set_title('Coefficient Value : No Intercept')
            ax.set_xticklabels(DOE.res.params.index[0:], rotation='vertical')
            cmin = DOE.res.conf_int().iloc[0:, 0] - DOE.res.params[0:]
            cmax = DOE.res.conf_int().iloc[0:, 1] - DOE.res.params[0:]
            ax.errorbar(ind, DOE.res.params[0:],
                        yerr=[cmin.values, cmax.values],
                        fmt='o', ecolor='green')
        ax.set_xticks(ind)
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.coefpredradioButton.isChecked():
        if DOE.res.params.index[0] == 'Intercept':
            ind = np.arange(1, len(DOE.res.params))
            vcol = []
            for i in ind:
                if pt[i] < 0.05:
                    vcol.append('red')
                else:
                    vcol.append('blue')
            ax.bar(ind, DOE.res.params[1:], align='center', color=vcol)
            ax.set_title(
                'Coefficient Value : Intercept {:10.4f}-{:10.4f}-{:10.4f}'.format(
                    DOE.res.params[0] - tres[0] * bres[0] / np.sqrt(nr),
                    DOE.res.params[0],
                    DOE.res.params[0] + tres[0] * bres[0] / np.sqrt(nr)))
            ax.set_xticklabels(DOE.res.params.index[1:], rotation='vertical')
            ax.errorbar(ind, DOE.res.params[1:],
                        yerr=tres[1:] * bres[1:] / np.sqrt(nr),
                        fmt='o', ecolor='green')
        else:
            ind = np.arange(1, len(DOE.res.params) + 1)
            ax.bar(ind, DOE.res.params, align='center')
            ax.set_title('Coefficient Value : No Intercept')
            ax.set_xticklabels(DOE.res.params.index[0:], rotation='vertical')
            ax.errorbar(ind, DOE.res.params[0:],
                        yerr=tres[0:] * bres[0:] / np.sqrt(nr),
                        fmt='o', ecolor='green')
        ax.set_xticks(ind)
        ax.set_xlabel('Coefficient Number (except Intercept)')
        ax.annotate('red bar: significance 5%', xy=(0.75, 0.95),
                    xycoords='figure fraction', fontsize=8)
    elif self.fitradioButton.isChecked():
        yf = DOE.res.fittedvalues.tolist()
        resid = DOE.res.resid.tolist()
        ax.scatter(y, yf, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('Fitted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, resid, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        df = DOE.res.df_resid
        vares = np.sum(DOE.res.resid ** 2) / df
        rmsef = np.sqrt(vares)
        vary = np.var(y.values)
        evar = (1 - vares / vary) * 100
        ax.set_title('df {:3.0f}; RMSEF {:6.2f}; Exp.Var.{:5.1f}%'.format(
            df, rmsef, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        ax.set_xlabel('Measured Values')
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i], yf[i]))
    elif self.predradioButton.isChecked():
        ax.scatter(y, ypcv, color='red', alpha=0.3, marker='o')
        ax.set_ylabel('CV Predicted Values', color='red')
        ax.tick_params('y', colors='red')
        ax1 = ax.twinx()
        ax1.scatter(y, rcv, color='blue', alpha=0.3, marker='o')
        ax1.set_ylabel('CV Residuals', color='blue')
        ax1.tick_params('y', colors='blue')
        xmin, xmax = ax.get_xlim()
        ax.set_ylim([xmin, xmax])
        ax.set_xlabel('Measured Values')
        df = DS.Raw.shape[0]
        varcv = np.sum(np.array(rcv) ** 2) / df
        rmsecv = np.sqrt(varcv)
        vary = np.var(y.values)
        evar = (1 - varcv / vary) * 100
        ax.set_title('df {:3.0f}; RMSECV {:6.2f}; Exp.Var.{:5.1f}%'.format(
            df, rmsecv, evar))
        ax.add_line(Line2D([xmin, xmax], [xmin, xmax], color='red'))
        ax1.add_line(Line2D([xmin, xmax], [0, 0], color='blue'))
        if self.VcheckBox.isChecked():
            Lr = DOE.res.model.data.row_labels
            for i, txt in enumerate(Lr):
                ax.annotate(str(txt), (y.iloc[i], ypcv[i]))
    elif self.levradioButton.isChecked():
        Ftable = surtabDlg.launch(None)
        if len(np.shape(Ftable)) == 0:
            return ()
        if np.argmax(Ftable['X axis'].values) == np.argmax(
                Ftable['Y axis'].values):
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Two variables on the same axis",
                                           QtWidgets.QMessageBox.Ok)
            return ()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        npts = 20
        xname = Ftable[(Ftable['X axis'] == True).values].index[0]
        yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
        cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
        cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
        zname = Yname
        x = np.linspace(float(Ftable['min'][xname]),
                        float(Ftable['max'][xname]), npts)
        y = np.linspace(float(Ftable['min'][yname]),
                        float(Ftable['max'][yname]), npts)
        px = []
        py = []
        for i in range(npts):
            for j in range(npts):
                px.append(x[i])
                py.append(y[j])
        # the zname column is only a placeholder for dmatrices
        data = pd.DataFrame({xname: px, yname: py, zname: px})
        xtitle = ''
        for i in range(len(cname)):
            xtitle = xtitle + cname[i] + ' = ' + str(cvalue.values.tolist()[i])
            data[cname[i]] = np.ones(npts ** 2) * float(cvalue.values[i])
        my, mx = dmatrices(desc, data, return_type='dataframe')
        pz = np.diag(np.dot(np.dot(mx, dism), mx.T))
        px = np.array(px)
        py = np.array(py)
        pz = np.array(pz)
        # matplotlib's mlab.griddata was removed; use
        # scipy.interpolate.griddata (imported at module level) instead
        xi, yi = np.meshgrid(x, y)
        z = griddata((px, py), pz, (xi, yi), method='linear')
        plt.contour(xi, yi, z, 15, linewidths=0.5, colors='k')
        plt.contourf(xi, yi, z, 15, cmap=plt.cm.rainbow)
        plt.colorbar()
        ax.set_xlabel(xname)
        ax.set_ylabel(yname)
        ax.set_title(xtitle)
        ax.set_xlim([px.min(), px.max()])
        ax.set_ylim([py.min(), py.max()])
    elif self.surradioButton.isChecked():
        Ftable = surtabDlg.launch(None)
        if len(np.shape(Ftable)) == 0:
            return ()
        if np.argmax(Ftable['X axis'].values) == np.argmax(
                Ftable['Y axis'].values):
            QtWidgets.QMessageBox.critical(self, 'Error',
                                           "Two variables on the same axis",
                                           QtWidgets.QMessageBox.Ok)
            return ()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        npts = 100
        xname = Ftable[(Ftable['X axis'] == True).values].index[0]
        yname = Ftable[(Ftable['Y axis'] == True).values].index[0]
        cname = Ftable[(Ftable['Constant'] == True).values].index.tolist()
        cvalue = Ftable.loc[(Ftable['Constant'] == True).values, 'value']
        zname = Yname
        x = np.linspace(float(Ftable['min'][xname]),
                        float(Ftable['max'][xname]), npts)
        y = np.linspace(float(Ftable['min'][yname]),
                        float(Ftable['max'][yname]), npts)
        px = []
        py = []
        for i in range(npts):
            for j in range(npts):
                px.append(x[i])
                py.append(y[j])
        # the zname column is only a placeholder for dmatrices
        data = pd.DataFrame({xname: px, yname: py, zname: px})
        xtitle = ''
        for i in range(len(cname)):
            xtitle = xtitle + cname[i] + ' = ' + str(cvalue.values.tolist()[i])
            data[cname[i]] = np.ones(npts ** 2) * float(cvalue.values[i])
        my, mx = dmatrices(desc, data, return_type='dataframe')
        pz = DOE.res.predict(mx)
        px = np.array(px)
        py = np.array(py)
        pz = np.array(pz)
        # matplotlib's mlab.griddata was removed; use
        # scipy.interpolate.griddata (imported at module level) instead
        xi, yi = np.meshgrid(x, y)
        z = griddata((px, py), pz, (xi, yi), method='linear')
        plt.contour(xi, yi, z, 15, linewidths=0.5, colors='k')
        plt.contourf(xi, yi, z, 15, cmap=plt.cm.rainbow)
        plt.colorbar()
        ax.set_xlabel(xname)
        ax.set_ylabel(yname)
        ax.set_title(xtitle)
        ax.set_xlim([px.min(), px.max()])
        ax.set_ylim([py.min(), py.max()])
    elif self.dismradioButton.isChecked():
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(dism)
        fig.colorbar(cax)
        ax.set_title('Trace = {:10.4f}'.format(np.trace(dism)))
    elif self.inflradioButton.isChecked():
        mxc = preprocessing.scale(mx.values, with_mean=True, with_std=False)
        mxc2 = mxc ** 2
        infl = np.sum(mxc2, axis=0) * np.diag(dism)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.matshow(infl.reshape(1, -1), cmap='gray_r')
        fig.colorbar(cax)
        ax.yaxis.grid(False)
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)
        ax.set_xlabel('Inflation Factor')
    if self.XcheckBox.isChecked():
        if self.XlineEdit.text():
            ax.set_xlabel(self.XlineEdit.text())
        else:
            ax.set_xlabel('')
    if self.YcheckBox.isChecked():
        if self.YlineEdit.text():
            ax.set_ylabel(self.YlineEdit.text())
        else:
            ax.set_ylabel('')
    if self.XGcheckBox.isChecked():
        ax.xaxis.grid(True)
    else:
        ax.xaxis.grid(False)
    if self.YGcheckBox.isChecked():
        ax.yaxis.grid(True)
    else:
        ax.yaxis.grid(False)
    if not self.XMcheckBox.isChecked():
        ax.tick_params(axis='x', which='both', bottom=False, top=False,
                       labelbottom=False)
    if not self.YMcheckBox.isChecked():
        ax.tick_params(axis='y', which='both', left=False, right=False,
                       labelleft=False)
    self.rmmpl()
    self.addmpl(fig)